datacontract-cli 0.10.11__py3-none-any.whl → 0.10.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +5 -10
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -24
  7. datacontract/engines/soda/connections/kafka.py +81 -23
  8. datacontract/export/avro_converter.py +12 -2
  9. datacontract/export/dbml_converter.py +3 -2
  10. datacontract/export/exporter.py +1 -0
  11. datacontract/export/exporter_factory.py +6 -0
  12. datacontract/export/spark_converter.py +4 -0
  13. datacontract/export/sql_type_converter.py +64 -29
  14. datacontract/export/sqlalchemy_converter.py +169 -0
  15. datacontract/imports/avro_importer.py +1 -0
  16. datacontract/imports/bigquery_importer.py +2 -2
  17. datacontract/imports/dbml_importer.py +112 -0
  18. datacontract/imports/dbt_importer.py +67 -91
  19. datacontract/imports/glue_importer.py +62 -58
  20. datacontract/imports/importer.py +2 -1
  21. datacontract/imports/importer_factory.py +5 -0
  22. datacontract/imports/odcs_importer.py +1 -1
  23. datacontract/imports/spark_importer.py +29 -10
  24. datacontract/imports/sql_importer.py +1 -1
  25. datacontract/imports/unity_importer.py +1 -1
  26. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  27. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  28. datacontract/model/data_contract_specification.py +6 -2
  29. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/METADATA +103 -28
  30. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/RECORD +34 -33
  31. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/WHEEL +1 -1
  32. datacontract/publish/publish.py +0 -32
  33. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/LICENSE +0 -0
  34. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/entry_points.txt +0 -0
  35. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/top_level.txt +0 -0

datacontract/imports/glue_importer.py

@@ -1,6 +1,6 @@
  import boto3
- from typing import List
-
+ from typing import List, Dict, Generator
+ import re
  from datacontract.imports.importer import Importer
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
@@ -13,7 +13,7 @@ from datacontract.model.data_contract_specification import (
  class GlueImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_glue(data_contract_specification, source, import_args.get("glue_table"))


@@ -39,7 +39,7 @@ def get_glue_database(database_name: str):

      return (
          response["Database"]["CatalogId"],
-         response["Database"].get("LocationUri", "None"),
+         response["Database"].get("LocationUri"),
      )


@@ -75,7 +75,7 @@ def get_glue_tables(database_name: str) -> List[str]:
      return table_names


- def get_glue_table_schema(database_name: str, table_name: str):
+ def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
      """Get the schema of a Glue table.

      Args:
@@ -93,11 +93,11 @@ def get_glue_table_schema(database_name: str, table_name: str):
          response = glue.get_table(DatabaseName=database_name, Name=table_name)
      except glue.exceptions.EntityNotFoundException:
          print(f"Table {table_name} not found in database {database_name}.")
-         return {}
+         return []
      except Exception as e:
          # todo catch all
          print(f"Error: {e}")
-         return {}
+         return []

      table_schema = response["Table"]["StorageDescriptor"]["Columns"]

@@ -109,10 +109,9 @@ def get_glue_table_schema(database_name: str, table_name: str):
                  "Name": pk["Name"],
                  "Type": pk["Type"],
                  "Hive": True,
-                 "Comment": "Partition Key",
+                 "Comment": pk.get("Comment"),
              }
          )
-
      return table_schema


@@ -120,7 +119,7 @@ def import_glue(
      data_contract_specification: DataContractSpecification,
      source: str,
      table_names: List[str],
- ):
+ ) -> DataContractSpecification:
      """Import the schema of a Glue database.

      Args:
@@ -140,8 +139,13 @@ def import_glue(
      if table_names is None:
          table_names = get_glue_tables(source)

+     server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+     if location_uri:
+         server_kwargs["location"] = location_uri
+
      data_contract_specification.servers = {
-         "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
+         "production": Server(**server_kwargs),
      }

      for table_name in table_names:
@@ -161,12 +165,6 @@ def import_glue(
              field.description = column.get("Comment")
              fields[column["Name"]] = field

-             if "decimal" in column["Type"]:
-                 # Extract precision and scale from the string
-                 perc_scale = column["Type"][8:-1].split(",")
-                 field.precision = int(perc_scale[0])
-                 field.scale = int(perc_scale[1])
-
          data_contract_specification.models[table_name] = Model(
              type="table",
              fields=fields,
@@ -186,27 +184,43 @@ def create_typed_field(dtype: str) -> Field:
      """
      field = Field()
      dtype = dtype.strip().lower().replace(" ", "")
-     if dtype.startswith(("array", "struct", "map")):
-         orig_dtype: str = dtype
-         if dtype.startswith("array"):
-             field.type = "array"
-             field.items = create_typed_field(orig_dtype[6:-1])
-         elif dtype.startswith("struct"):
-             field.type = "struct"
-             for f in split_struct(orig_dtype[7:-1]):
-                 field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
-         elif dtype.startswith("map"):
-             field.type = "map"
-             key_type = orig_dtype[4:-1].split(",", 1)[0]
-             value_type = orig_dtype[4:-1].split(",", 1)[1]
+     # Example: array<string>
+     if dtype.startswith("array"):
+         field.type = "array"
+         field.items = create_typed_field(dtype[6:-1])
+     # Example: struct<field1:float,field2:string>
+     elif dtype.startswith("struct"):
+         field.type = "struct"
+         for f in split_struct(dtype[7:-1]):
+             field_name, field_key = f.split(":", 1)
+             field.fields[field_name] = create_typed_field(field_key)
+     # Example: map<string,int>
+     elif dtype.startswith("map"):
+         field.type = "map"
+         map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+         if map_match:
+             key_type = map_match.group(1)
+             value_type = map_match.group(2)
              field.keys = create_typed_field(key_type)
              field.values = create_typed_field(value_type)
+     # Example: decimal(38, 6) or decimal
+     elif dtype.startswith("decimal"):
+         field.type = "decimal"
+         decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+         if decimal_match:  # if precision specified
+             field.precision = int(decimal_match.group(1))
+             field.scale = int(decimal_match.group(2))
+     # Example: varchar(255) or varchar
+     elif dtype.startswith("varchar"):
+         field.type = "varchar"
+         if len(dtype) > 7:
+             field.maxLength = int(dtype[8:-1])
      else:
          field.type = map_type_from_sql(dtype)
      return field


- def split_fields(s: str):
+ def split_fields(s: str) -> Generator[str, None, None]:
      """Split a string of fields considering nested structures.

      Args:
@@ -253,30 +267,20 @@ def map_type_from_sql(sql_type: str) -> str:
          return None

      sql_type = sql_type.lower()
-     if sql_type.startswith("varchar"):
-         return "varchar"
-     if sql_type.startswith("string"):
-         return "string"
-     if sql_type.startswith("text"):
-         return "text"
-     if sql_type.startswith("byte"):
-         return "byte"
-     if sql_type.startswith("short"):
-         return "short"
-     if sql_type.startswith("integer") or sql_type.startswith("int"):
-         return "integer"
-     if sql_type.startswith("long") or sql_type.startswith("bigint"):
-         return "long"
-     if sql_type.startswith("float"):
-         return "float"
-     if sql_type.startswith("double"):
-         return "double"
-     if sql_type.startswith("boolean"):
-         return "boolean"
-     if sql_type.startswith("timestamp"):
-         return "timestamp"
-     if sql_type.startswith("date"):
-         return "date"
-     if sql_type.startswith("decimal"):
-         return "decimal"
-     return "variant"
+
+     type_mapping = {
+         "string": "string",
+         "int": "int",
+         "bigint": "bigint",
+         "float": "float",
+         "double": "double",
+         "boolean": "boolean",
+         "timestamp": "timestamp",
+         "date": "date",
+     }
+
+     for prefix, mapped_type in type_mapping.items():
+         if sql_type.startswith(prefix):
+             return mapped_type
+
+     return "unknown"

datacontract/imports/importer.py

@@ -14,7 +14,7 @@ class Importer(ABC):
          data_contract_specification: DataContractSpecification,
          source: str,
          import_args: dict,
-     ) -> dict:
+     ) -> DataContractSpecification:
          pass


@@ -22,6 +22,7 @@ class ImportFormat(str, Enum):
      sql = "sql"
      avro = "avro"
      dbt = "dbt"
+     dbml = "dbml"
      glue = "glue"
      jsonschema = "jsonschema"
      bigquery = "bigquery"

datacontract/imports/importer_factory.py

@@ -88,3 +88,8 @@ importer_factory.register_lazy_importer(
  importer_factory.register_lazy_importer(
      name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
  )
+ importer_factory.register_lazy_importer(
+     name=ImportFormat.dbml,
+     module_path="datacontract.imports.dbml_importer",
+     class_name="DBMLImporter",
+ )
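
For context, lazy registration along these lines usually stores only the module path and class name and imports the module when the importer is first requested. A rough sketch of that pattern (the registry and function names here are illustrative, not the factory's actual internals):

import importlib

# Hypothetical mini-registry illustrating the lazy-import pattern.
_lazy_registry: dict[str, tuple[str, str]] = {}

def register_lazy(name: str, module_path: str, class_name: str) -> None:
    _lazy_registry[name] = (module_path, class_name)

def resolve(name: str):
    module_path, class_name = _lazy_registry[name]
    module = importlib.import_module(module_path)  # imported only on first use
    return getattr(module, class_name)

register_lazy("dbml", "datacontract.imports.dbml_importer", "DBMLImporter")
# importer_class = resolve("dbml")  # would trigger the import of dbml_importer only here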

datacontract/imports/odcs_importer.py

@@ -46,7 +46,7 @@ DATACONTRACT_TYPES = [
  class OdcsImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_odcs(data_contract_specification, source)



datacontract/imports/spark_importer.py

@@ -14,7 +14,7 @@ class SparkImporter(Importer):
          data_contract_specification: DataContractSpecification,
          source: str,
          import_args: dict,
-     ) -> dict:
+     ) -> DataContractSpecification:
          """
          Imports data from a Spark source into the data contract specification.

@@ -63,12 +63,12 @@ def import_from_spark_df(df: DataFrame) -> Model:
      schema = df.schema

      for field in schema:
-         model.fields[field.name] = _field_from_spark(field)
+         model.fields[field.name] = _field_from_struct_type(field)

      return model


- def _field_from_spark(spark_field: types.StructField) -> Field:
+ def _field_from_struct_type(spark_field: types.StructField) -> Field:
      """
      Converts a Spark StructField into a Field object for the data contract.

@@ -76,18 +76,35 @@ def _field_from_spark(spark_field: types.StructField) -> Field:
          spark_field: The Spark StructField to convert.

      Returns:
-         Field: The corresponding Field object.
+         Field: The generated Field object.
      """
-     field_type = _data_type_from_spark(spark_field.dataType)
      field = Field()
-     field.type = field_type
      field.required = not spark_field.nullable
+     return _type_from_data_type(field, spark_field.dataType)

-     if field_type == "array":
-         field.items = _field_from_spark(spark_field.dataType.elementType)

-     if field_type == "struct":
-         field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+ def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+     """
+     Maps Spark data types to the Data Contract type system and updates the field.
+
+     Args:
+         field: The Field object to update.
+         spark_type: The Spark data type to map.
+
+     Returns:
+         Field: The updated Field object.
+     """
+     field.type = _data_type_from_spark(spark_type)
+
+     if field.type == "array":
+         field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+     elif field.type == "map":
+         field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+         field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+     elif field.type == "struct":
+         field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}

      return field

@@ -116,6 +133,8 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
          return "struct"
      elif isinstance(spark_type, types.ArrayType):
          return "array"
+     elif isinstance(spark_type, types.MapType):
+         return "map"
      elif isinstance(spark_type, types.TimestampType):
          return "timestamp"
      elif isinstance(spark_type, types.TimestampNTZType):
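
The Spark importer now recurses through MapType as well as ArrayType and StructType, carrying nullability (containsNull, valueContainsNull) into nested fields. A small illustrative walk over a Spark schema, independent of the importer itself (the describe helper below is made up for the example; only pyspark.sql.types is needed, no Spark session):

from pyspark.sql import types

def describe(dt: types.DataType) -> str:
    """Recursively render a Spark type, mirroring the array/map/struct recursion above."""
    if isinstance(dt, types.MapType):
        return f"map<{describe(dt.keyType)},{describe(dt.valueType)}>"
    if isinstance(dt, types.ArrayType):
        return f"array<{describe(dt.elementType)}>"
    if isinstance(dt, types.StructType):
        inner = ",".join(f"{f.name}:{describe(f.dataType)}" for f in dt.fields)
        return f"struct<{inner}>"
    return type(dt).__name__.removesuffix("Type").lower()

schema = types.StructType([
    types.StructField("id", types.StringType(), nullable=False),
    types.StructField("tags", types.MapType(types.StringType(), types.IntegerType()), nullable=True),
])

for f in schema.fields:
    print(f.name, describe(f.dataType), "required" if not f.nullable else "optional")
# id string required
# tags map<string,integer> optional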

datacontract/imports/sql_importer.py

@@ -7,7 +7,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  class SqlImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_sql(data_contract_specification, self.import_format, source)



datacontract/imports/unity_importer.py

@@ -11,7 +11,7 @@ from datacontract.model.exceptions import DataContractException
  class UnityImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          if source is not None:
              data_contract_specification = import_unity_from_json(data_contract_specification, source)
          else:

datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py}

@@ -2,28 +2,29 @@ import os

  import requests

+ from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.run import Run


- def publish_datamesh_manager(run: Run, publish_url: str):
+ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
      try:
          if publish_url is None:
              # this url supports Data Mesh Manager and Data Contract Manager
              url = "https://api.datamesh-manager.com/api/test-results"
          else:
              url = publish_url
+
          api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
          if api_key is None:
              api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
-
-         if run.dataContractId is None:
-             raise Exception("Cannot publish run results, as data contract ID is unknown")
-
          if api_key is None:
              raise Exception(
                  "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
              )

+         if run.dataContractId is None:
+             raise Exception("Cannot publish run results, as data contract ID is unknown")
+
          headers = {"Content-Type": "application/json", "x-api-key": api_key}
          request_body = run.model_dump_json()
          # print("Request Body:", request_body)
@@ -36,3 +37,30 @@ def publish_datamesh_manager(run: Run, publish_url: str):
          run.log_info(f"Published test results to {url}")
      except Exception as e:
          run.log_error(f"Failed publishing test results. Error: {str(e)}")
+
+
+ def publish_data_contract_to_datamesh_manager(data_contract_specification: DataContractSpecification):
+     try:
+         api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+         if api_key is None:
+             api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+         if api_key is None:
+             raise Exception(
+                 "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
+             )
+         headers = {"Content-Type": "application/json", "x-api-key": api_key}
+         spec = data_contract_specification
+         id = spec.id
+         url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+         request_body = spec.model_dump_json().encode("utf-8")
+         response = requests.put(
+             url=url,
+             data=request_body,
+             headers=headers,
+         )
+         if response.status_code != 200:
+             print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+             exit(1)
+         print(f"Published data contract to {url}")
+     except Exception as e:
+         print(f"Failed publishing data contract. Error: {str(e)}")

datacontract/integration/{publish_opentelemetry.py → opentelemetry.py}

@@ -34,7 +34,7 @@ from datacontract.model.run import Run
  # - Metrics only, no logs yet (but loosely planned)


- def publish_opentelemetry(run: Run):
+ def publish_test_results_to_opentelemetry(run: Run):
      try:
          if run.dataContractId is None:
              raise Exception("Cannot publish run results, as data contract ID is unknown")

datacontract/model/data_contract_specification.py

@@ -73,7 +73,7 @@ class Definition(pyd.BaseModel):
      exclusiveMaximum: int = None
      pii: bool = None
      classification: str = None
-     fields: Dict[str, "Definition"] = {}
+     fields: Dict[str, "Field"] = {}
      tags: List[str] = []
      links: Dict[str, str] = {}
      example: str = None
@@ -239,4 +239,8 @@ class DataContractSpecification(pyd.BaseModel):
          return DataContractSpecification(**data)

      def to_yaml(self):
-         return yaml.dump(self.model_dump(exclude_defaults=True, exclude_none=True), sort_keys=False, allow_unicode=True)
+         return yaml.dump(
+             self.model_dump(exclude_defaults=True, exclude_none=True, by_alias=True),
+             sort_keys=False,
+             allow_unicode=True,
+         )
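
to_yaml() now dumps with by_alias=True, so pydantic fields declared with an alias serialize under the alias rather than the Python attribute name. A generic pydantic sketch of the difference (the model and alias here are invented for illustration, they are not the spec's own fields):

from typing import Optional

import pydantic as pyd
import yaml

class Demo(pyd.BaseModel):
    model_config = pyd.ConfigDict(populate_by_name=True)
    ref: Optional[str] = pyd.Field(default=None, alias="$ref")

demo = Demo(ref="#/definitions/order_id")
print(yaml.dump(demo.model_dump(exclude_none=True), sort_keys=False))
# ref: '#/definitions/order_id'
print(yaml.dump(demo.model_dump(exclude_none=True, by_alias=True), sort_keys=False))
# $ref: '#/definitions/order_id'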