datacontract-cli 0.10.9__py3-none-any.whl → 0.10.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli has been flagged as potentially problematic by the registry.

Files changed (32)
  1. datacontract/cli.py +7 -0
  2. datacontract/data_contract.py +16 -9
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +4 -1
  4. datacontract/engines/soda/check_soda_execute.py +5 -2
  5. datacontract/engines/soda/connections/duckdb.py +20 -12
  6. datacontract/engines/soda/connections/snowflake.py +8 -5
  7. datacontract/export/avro_converter.py +1 -1
  8. datacontract/export/dbml_converter.py +41 -19
  9. datacontract/export/exporter.py +1 -1
  10. datacontract/export/jsonschema_converter.py +1 -4
  11. datacontract/export/sodacl_converter.py +1 -1
  12. datacontract/imports/avro_importer.py +142 -8
  13. datacontract/imports/dbt_importer.py +117 -0
  14. datacontract/imports/glue_importer.py +9 -3
  15. datacontract/imports/importer.py +7 -2
  16. datacontract/imports/importer_factory.py +24 -6
  17. datacontract/imports/jsonschema_importer.py +106 -117
  18. datacontract/imports/spark_importer.py +134 -0
  19. datacontract/imports/sql_importer.py +4 -0
  20. datacontract/integration/publish_datamesh_manager.py +10 -5
  21. datacontract/lint/resolve.py +72 -27
  22. datacontract/lint/schema.py +24 -4
  23. datacontract/model/data_contract_specification.py +3 -0
  24. datacontract/templates/datacontract.html +1 -1
  25. datacontract/templates/index.html +1 -1
  26. datacontract/templates/partials/model_field.html +10 -2
  27. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/METADATA +300 -192
  28. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/RECORD +32 -30
  29. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/WHEEL +1 -1
  30. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/LICENSE +0 -0
  31. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/entry_points.txt +0 -0
  32. {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/top_level.txt +0 -0
datacontract/imports/glue_importer.py

@@ -14,7 +14,7 @@ class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> dict:
-        return import_glue(data_contract_specification, source, import_args.get("glue_tables"))
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
 def get_glue_database(database_name: str):
@@ -154,7 +154,7 @@ def import_glue(
         for column in table_schema:
             field = create_typed_field(column["Type"])
 
-            # hive partitons are required, but are not primary keys
+            # hive partitions are required, but are not primary keys
            if column.get("Hive"):
                field.required = True
 
@@ -186,7 +186,7 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
-    if dtype.startswith(("array", "struct")):
+    if dtype.startswith(("array", "struct", "map")):
         orig_dtype: str = dtype
         if dtype.startswith("array"):
             field.type = "array"
@@ -195,6 +195,12 @@ def create_typed_field(dtype: str) -> Field:
             field.type = "struct"
             for f in split_struct(orig_dtype[7:-1]):
                 field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
+        elif dtype.startswith("map"):
+            field.type = "map"
+            key_type = orig_dtype[4:-1].split(",", 1)[0]
+            value_type = orig_dtype[4:-1].split(",", 1)[1]
+            field.keys = create_typed_field(key_type)
+            field.values = create_typed_field(value_type)
     else:
         field.type = map_type_from_sql(dtype)
     return field
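For orientation, a minimal sketch of what the new map branch would produce for a Hive-style type string; the scalar results are assumptions, since the scalar mapping in map_type_from_sql is not part of this hunk:

# Sketch only: exercises the new "map" branch of create_typed_field shown above.
field = create_typed_field("map<string,int>")
# field.type   -> "map"
# field.keys   -> Field(type="string")    (recursive call on the key type)
# field.values -> Field(type="integer")   (assuming map_type_from_sql maps "int" to "integer")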
datacontract/imports/importer.py

@@ -10,7 +10,10 @@ class Importer(ABC):
 
     @abstractmethod
     def import_source(
-        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
     ) -> dict:
         pass
 
@@ -18,12 +21,14 @@ class Importer(ABC):
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    dbt = "dbt"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"
+    spark = "spark"
 
     @classmethod
-    def get_suported_formats(cls):
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))
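Callers that list the available formats go through the renamed classmethod; a small sketch of the expected behaviour, with the example output derived from the enum members above:

# Sketch: the renamed classmethod now also reports the new "dbt" and "spark" formats.
from datacontract.imports.importer import ImportFormat

supported = ImportFormat.get_supported_formats()
# e.g. ["sql", "avro", "dbt", "glue", "jsonschema", "bigquery", "odcs", "unity", "spark"]
assert "dbt" in supported and "spark" in supported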
datacontract/imports/importer_factory.py

@@ -18,7 +18,7 @@ class ImporterFactory:
         importers = self.dict_importer.copy()
         importers.update(self.dict_lazy_importer.copy())
         if name not in importers.keys():
-            raise ValueError(f"The '{name}' format is not suportted.")
+            raise ValueError(f"The '{name}' format is not supported.")
         importer_class = importers[name]
         if type(importers[name]) is tuple:
             importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
@@ -46,7 +46,9 @@ def load_module_class(module_path, class_name):
 
 importer_factory = ImporterFactory()
 importer_factory.register_lazy_importer(
-    name=ImportFormat.avro, module_path="datacontract.imports.avro_importer", class_name="AvroImporter"
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.bigquery,
@@ -54,7 +56,9 @@ importer_factory.register_lazy_importer(
     class_name="BigQueryImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.glue, module_path="datacontract.imports.glue_importer", class_name="GlueImporter"
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.jsonschema,
@@ -62,11 +66,25 @@ importer_factory.register_lazy_importer(
     class_name="JsonSchemaImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.odcs, module_path="datacontract.imports.odcs_importer", class_name="OdcsImporter"
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.sql, module_path="datacontract.imports.sql_importer", class_name="SqlImporter"
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.unity, module_path="datacontract.imports.unity_importer", class_name="UnityImporter"
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
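Third-party code could register additional importers the same way; the sketch below mirrors the registration calls above, with a purely hypothetical format, module, and class:

# Hypothetical registration, shaped like the calls above; none of these names exist in the package.
from datacontract.imports.importer_factory import importer_factory

importer_factory.register_lazy_importer(
    name="parquet",                              # hypothetical format name
    module_path="my_company.parquet_importer",   # hypothetical module
    class_name="ParquetImporter",                # hypothetical Importer subclass
)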
datacontract/imports/jsonschema_importer.py

@@ -10,137 +10,49 @@ from datacontract.model.exceptions import DataContractException
 class JsonSchemaImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
         return import_jsonschema(data_contract_specification, source)
 
 
-def convert_json_schema_properties(properties, is_definition=False):
-    fields = {}
-    for field_name, field_schema in properties.items():
-        field_kwargs = {}
-        field_type = field_schema.get("type")
-
-        # Determine if the field is required and set the type to the non-null option if applicable
-        if isinstance(field_type, list) and "null" in field_type:
-            field_kwargs["required"] = False
-            non_null_types = [t for t in field_type if t != "null"]
-            if non_null_types:
-                field_type = non_null_types[0]
-            else:
-                field_type = None
-        else:
-            field_kwargs["required"] = True
-
-        # Set the non-null type
-        if field_type:
-            field_kwargs["type"] = field_type
-
-        for key, value in field_schema.items():
-            match key:
-                case "title":
-                    field_kwargs["title"] = value
-                case "type":
-                    pass  # type is already handled above
-                case "format":
-                    field_kwargs["format"] = value
-                case "description":
-                    field_kwargs["description"] = value
-                case "pattern":
-                    field_kwargs["pattern"] = value
-                case "minLength":
-                    field_kwargs["minLength"] = value
-                case "maxLength":
-                    field_kwargs["maxLength"] = value
-                case "minimum":
-                    field_kwargs["minimum"] = value
-                case "exclusiveMinimum":
-                    field_kwargs["exclusiveMinimum"] = value
-                case "maximum":
-                    field_kwargs["maximum"] = value
-                case "exclusiveMaximum":
-                    field_kwargs["exclusiveMaximum"] = value
-                case "enum":
-                    field_kwargs["enum"] = value
-                case "tags":
-                    field_kwargs["tags"] = value
-                case "properties":
-                    field_kwargs["fields"] = convert_json_schema_properties(value)
-                case "items":
-                    field_kwargs["items"] = convert_json_schema_properties(value)
-
-        field = Field(**field_kwargs)
-        fields[field_name] = field
-
-    return fields
-
-
 def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
+    json_schema = load_and_validate_json_schema(source)
+
+    title = json_schema.get("title", "default_model")
+    description = json_schema.get("description")
+    type_ = json_schema.get("type")
+    properties = json_schema.get("properties", {})
+    required_properties = json_schema.get("required", [])
+
+    fields_kwargs = jsonschema_to_args(properties, required_properties)
+    fields = {name: Field(**kwargs) for name, kwargs in fields_kwargs.items()}
+
+    model = Model(description=description, type=type_, title=title, fields=fields)
+    data_contract_specification.models[title] = model
+
+    definitions = json_schema.get("definitions", {})
+    for name, schema in definitions.items():
+        kwargs = schema_to_args(schema)
+        data_contract_specification.definitions[name] = Definition(name=name, **kwargs)
+
+    return data_contract_specification
+
+
+def load_and_validate_json_schema(source):
     try:
         with open(source, "r") as file:
             json_schema = json.loads(file.read())
-            validator = fastjsonschema.compile({})
-            validator(json_schema)
-
-            model = Model(
-                description=json_schema.get("description"),
-                type=json_schema.get("type"),
-                title=json_schema.get("title"),
-                fields=convert_json_schema_properties(json_schema.get("properties", {})),
-            )
-            data_contract_specification.models[json_schema.get("title", "default_model")] = model
-
-            if "definitions" in json_schema:
-                for def_name, def_schema in json_schema["definitions"].items():
-                    definition_kwargs = {}
-
-                    for key, value in def_schema.items():
-                        match key:
-                            case "domain":
-                                definition_kwargs["domain"] = value
-                            case "title":
-                                definition_kwargs["title"] = value
-                            case "description":
-                                definition_kwargs["description"] = value
-                            case "type":
-                                definition_kwargs["type"] = value
-                            case "enum":
-                                definition_kwargs["enum"] = value
-                            case "format":
-                                definition_kwargs["format"] = value
-                            case "minLength":
-                                definition_kwargs["minLength"] = value
-                            case "maxLength":
-                                definition_kwargs["maxLength"] = value
-                            case "pattern":
-                                definition_kwargs["pattern"] = value
-                            case "minimum":
-                                definition_kwargs["minimum"] = value
-                            case "exclusiveMinimum":
-                                definition_kwargs["exclusiveMinimum"] = value
-                            case "maximum":
-                                definition_kwargs["maximum"] = value
-                            case "exclusiveMaximum":
-                                definition_kwargs["exclusiveMaximum"] = value
-                            case "pii":
-                                definition_kwargs["pii"] = value
-                            case "classification":
-                                definition_kwargs["classification"] = value
-                            case "tags":
-                                definition_kwargs["tags"] = value
-                            case "properties":
-                                definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
-
-                    definition = Definition(name=def_name, **definition_kwargs)
-                    data_contract_specification.definitions[def_name] = definition
+
+        validator = fastjsonschema.compile({})
+        validator(json_schema)
 
     except fastjsonschema.JsonSchemaException as e:
         raise DataContractException(
             type="schema",
             name="Parse json schema",
-            reason=f"Failed to parse json schema from {source}: {e}",
+            reason=f"Failed to validate json schema from {source}: {e}",
             engine="datacontract",
         )
 
@@ -152,5 +64,82 @@ def import_jsonschema(
             engine="datacontract",
             original_exception=e,
         )
+    return json_schema
 
-    return data_contract_specification
+
+def jsonschema_to_args(properties, required_properties):
+    args = {}
+    for property, property_schema in properties.items():
+        is_required = property in required_properties
+        args[property] = schema_to_args(property_schema, is_required)
+
+    return args
+
+
+def schema_to_args(property_schema, is_required: bool = None) -> dict:
+    direct_mappings = {
+        "title",
+        "description",
+        "format",
+        "pattern",
+        "enum",
+        "tags",
+        "pii",
+        "minLength",
+        "maxLength",
+        "minimum",
+        "exclusiveMinimum",
+        "maximum",
+        "exclusiveMaximum",
+    }
+
+    field_kwargs = {key: value for key, value in property_schema.items() if key in direct_mappings}
+
+    if is_required is not None:
+        field_kwargs["required"] = is_required
+
+    property_type = determine_type(property_schema)
+    if property_type is not None:
+        field_kwargs["type"] = property_type
+
+    if property_type == "array":
+        nested_item_type, nested_items = determine_nested_item_type(property_schema)
+
+        if nested_items is not None:
+            field_kwargs["items"] = schema_to_args(nested_item_type)
+
+    nested_properties = property_schema.get("properties")
+    if nested_properties is not None:
+        # recursive call for complex nested properties
+        field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
+
+    return field_kwargs
+
+
+def determine_nested_item_type(property_schema):
+    nested_items = property_schema.get("items")
+    nested_items_is_list = isinstance(nested_items, list)
+    if nested_items_is_list and len(nested_items) != 1:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Union types for arrays are currently not supported ({nested_items})",
+            engine="datacontract",
+        )
+    if nested_items_is_list and len(nested_items) == 1:
+        nested_item_type = nested_items[0]
+    elif not nested_items_is_list and nested_items is not None:
+        nested_item_type = nested_items
+    return nested_item_type, nested_items
+
+
+def determine_type(property_schema):
+    property_type = property_schema.get("type")
+    type_is_list = isinstance(property_type, list)
+    if type_is_list:
+        non_null_types = [t for t in property_type if t != "null"]
+        if non_null_types:
+            property_type = non_null_types[0]
+        else:
+            property_type = None
+    return property_type
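To make the new flow concrete, here is a small sketch of the property-to-kwargs mapping performed by jsonschema_to_args and schema_to_args above; the commented result is an assumption read off the code in this hunk:

# Sketch: properties as they would appear in a JSON Schema document.
properties = {
    "order_id": {"type": "string", "minLength": 8, "description": "Internal order id"},
    "amount": {"type": ["number", "null"]},
}
args = jsonschema_to_args(properties, required_properties=["order_id"])
# Expected (assumed) result:
# {
#     "order_id": {"minLength": 8, "description": "Internal order id", "required": True, "type": "string"},
#     "amount": {"required": False, "type": "number"},
# }
fields = {name: Field(**kwargs) for name, kwargs in args.items()}  # as done in import_jsonschema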
datacontract/imports/spark_importer.py (new file)

@@ -0,0 +1,134 @@
+from pyspark.sql import DataFrame, SparkSession, types
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+class SparkImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        """
+        Imports data from a Spark source into the data contract specification.
+
+        Args:
+            data_contract_specification: The data contract specification object.
+            source: The source string indicating the Spark tables to read.
+            import_args: Additional arguments for the import process.
+
+        Returns:
+            dict: The updated data contract specification.
+        """
+        return import_spark(data_contract_specification, source)
+
+
+def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Reads Spark tables and updates the data contract specification with their schemas.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: A comma-separated string of Spark temporary views to read.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    data_contract_specification.servers["local"] = Server(type="dataframe")
+    for temp_view in source.split(","):
+        temp_view = temp_view.strip()
+        df = spark.read.table(temp_view)
+        data_contract_specification.models[temp_view] = import_from_spark_df(df)
+    return data_contract_specification
+
+
+def import_from_spark_df(df: DataFrame) -> Model:
+    """
+    Converts a Spark DataFrame into a Model.
+
+    Args:
+        df: The Spark DataFrame to convert.
+
+    Returns:
+        Model: The generated data contract model.
+    """
+    model = Model()
+    schema = df.schema
+
+    for field in schema:
+        model.fields[field.name] = _field_from_spark(field)
+
+    return model
+
+
+def _field_from_spark(spark_field: types.StructField) -> Field:
+    """
+    Converts a Spark StructField into a Field object for the data contract.
+
+    Args:
+        spark_field: The Spark StructField to convert.
+
+    Returns:
+        Field: The corresponding Field object.
+    """
+    field_type = _data_type_from_spark(spark_field.dataType)
+    field = Field()
+    field.type = field_type
+    field.required = not spark_field.nullable
+
+    if field_type == "array":
+        field.items = _field_from_spark(spark_field.dataType.elementType)
+
+    if field_type == "struct":
+        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+    return field
+
+
+def _data_type_from_spark(spark_type: types.DataType) -> str:
+    """
+    Maps Spark data types to the Data Contract type system.
+
+    Args:
+        spark_type: The Spark data type to map.
+
+    Returns:
+        str: The corresponding Data Contract type.
+    """
+    if isinstance(spark_type, types.StringType):
+        return "string"
+    elif isinstance(spark_type, types.IntegerType):
+        return "integer"
+    elif isinstance(spark_type, types.LongType):
+        return "long"
+    elif isinstance(spark_type, types.FloatType):
+        return "float"
+    elif isinstance(spark_type, types.DoubleType):
+        return "double"
+    elif isinstance(spark_type, types.StructType):
+        return "struct"
+    elif isinstance(spark_type, types.ArrayType):
+        return "array"
+    elif isinstance(spark_type, types.TimestampType):
+        return "timestamp"
+    elif isinstance(spark_type, types.TimestampNTZType):
+        return "timestamp_ntz"
+    elif isinstance(spark_type, types.DateType):
+        return "date"
+    elif isinstance(spark_type, types.BooleanType):
+        return "boolean"
+    elif isinstance(spark_type, types.BinaryType):
+        return "bytes"
+    elif isinstance(spark_type, types.DecimalType):
+        return "decimal"
+    elif isinstance(spark_type, types.NullType):
+        return "null"
+    else:
+        raise ValueError(f"Unsupported Spark type: {spark_type}")
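A usage sketch for the new importer, assuming a local PySpark installation; it registers a temporary view and passes its name to import_spark as defined above:

# Sketch: build a temp view and derive a model from it.
from pyspark.sql import SparkSession
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame(
    [(1, "shipped")], schema="order_id INT, status STRING"
).createOrReplaceTempView("orders")

spec = import_spark(DataContractSpecification(), "orders")
# spec.servers["local"].type   -> "dataframe"
# spec.models["orders"].fields -> {"order_id": Field(type="integer", ...), "status": Field(type="string", ...)}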
datacontract/imports/sql_importer.py

@@ -64,6 +64,10 @@ def map_type_from_sql(sql_type: str):
         return "integer"
     elif sql_type_normed.startswith("float"):
         return "float"
+    elif sql_type_normed.startswith("decimal"):
+        return "decimal"
+    elif sql_type_normed.startswith("numeric"):
+        return "numeric"
     elif sql_type_normed.startswith("bool"):
         return "boolean"
     elif sql_type_normed.startswith("timestamp"):
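A quick sketch of the effect of the two new branches, assuming sql_type_normed is the lowercased, trimmed form of the input (as the surrounding branches imply):

# Sketch: precision and scale are ignored, only the leading keyword is matched.
assert map_type_from_sql("decimal(10,2)") == "decimal"
assert map_type_from_sql("numeric(12,4)") == "numeric"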
datacontract/integration/publish_datamesh_manager.py

@@ -8,18 +8,23 @@ from datacontract.model.run import Run
 def publish_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
-            url = "https://api.datamesh-manager.com/api/runs"
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
-        datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
 
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
 
-        if datamesh_manager_api_key is None:
-            raise Exception("Cannot publish run results, as DATAMESH_MANAGER_API_KEY is not set")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
 
-        headers = {"Content-Type": "application/json", "x-api-key": datamesh_manager_api_key}
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
         response = requests.post(url, data=request_body, headers=headers)
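From the caller's side, the new fallback means either environment variable works; a sketch with a placeholder key:

# Sketch: with no DATAMESH_MANAGER_API_KEY set, the Data Contract Manager key is used instead.
import os

os.environ.pop("DATAMESH_MANAGER_API_KEY", None)
os.environ["DATACONTRACT_MANAGER_API_KEY"] = "dcm_xxx"  # placeholder, not a real key

# publish_datamesh_manager(run, publish_url=None) now posts the run results to
# https://api.datamesh-manager.com/api/test-results with x-api-key taken from the fallback variable.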