datacontract-cli 0.10.9__py3-none-any.whl → 0.10.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datacontract/cli.py CHANGED
@@ -226,6 +226,12 @@ def import_(
     unity_table_full_name: Annotated[
         Optional[str], typer.Option(help="Full name of a table in the unity catalog")
     ] = None,
+    dbt_model: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of models names to import from the dbt manifest file (repeat for multiple models names, leave empty for all models in the dataset)."
+        ),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -238,6 +244,7 @@ def import_(
         bigquery_project=bigquery_project,
         bigquery_dataset=bigquery_dataset,
         unity_table_full_name=unity_table_full_name,
+        dbt_model=dbt_model,
     )
     console.print(result.to_yaml())
 
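The new option is a repeatable typer flag (the dbt_model parameter maps to --dbt-model under typer's default underscore-to-dash conversion), so an invocation along the lines of "datacontract import --format dbt --source target/manifest.json --dbt-model orders --dbt-model customers" would import only the named models, while omitting the flag imports every model found in the manifest.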
@@ -4,7 +4,9 @@ import tempfile
 import typing
 
 import yaml
-from pyspark.sql import SparkSession
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
 
 from datacontract.breaking.breaking import models_breaking_changes, quality_breaking_changes
 from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
@@ -43,7 +45,7 @@ class DataContract:
         examples: bool = False,
         publish_url: str = None,
         publish_to_opentelemetry: bool = False,
-        spark: SparkSession = None,
+        spark: "SparkSession" = None,
         inline_definitions: bool = False,
         inline_quality: bool = False,
     ):
@@ -1,6 +1,9 @@
 import logging
+import typing
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
 
-from pyspark.sql import SparkSession
 from soda.scan import Scan
 
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
@@ -17,7 +20,7 @@ from datacontract.model.run import Run, Check, Log
 
 
 def check_soda_execute(
-    run: Run, data_contract: DataContractSpecification, server: Server, spark: SparkSession, tmp_dir
+    run: Run, data_contract: DataContractSpecification, server: Server, spark: "SparkSession", tmp_dir
 ):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
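Both files apply the same pattern: the pyspark import moves behind typing.TYPE_CHECKING and the annotation is quoted, so the module stays importable when pyspark is not installed and the dependency is only needed if a caller actually hands over a Spark session. A minimal standalone sketch of the pattern (the function name is illustrative, not from the codebase):

import typing

if typing.TYPE_CHECKING:
    # Seen by static type checkers only; never executed at runtime,
    # so pyspark does not have to be installed to import this module.
    from pyspark.sql import SparkSession


def run_with_spark(spark: "SparkSession" = None) -> None:
    # The quoted annotation keeps the signature valid without pyspark;
    # the session object is only touched when one is actually passed in.
    if spark is not None:
        spark.sql("SELECT 1").show()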
@@ -50,6 +50,10 @@ def get_duckdb_connection(data_contract, server, run: Run):
         )
     elif server.format == "delta":
         if server.type == "azure":
+            # After switching to native delta table support
+            # in https://github.com/datacontract/datacontract-cli/issues/258,
+            # azure storage should also work
+            # https://github.com/duckdb/duckdb_delta/issues/21
             raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
 
         storage_options = {
@@ -65,7 +65,7 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.config["avroLogicalType"] in ["time-millis", "date"]:
             return {"type": "int", "logicalType": field.config["avroLogicalType"]}
         if "avroType" in field.config:
-            return field.config["avroLogicalType"]
+            return field.config["avroType"]
 
     if field.type is None:
         return "null"
@@ -131,7 +131,7 @@ def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
        field_name = f'"{field_name}"'
    return {
        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a minimum of {min}",
+            "name": f"Check that field {field_name} has a minimum of {minimum}",
            "valid min": minimum,
        }
    }
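The old f-string interpolated Python's builtin min instead of the minimum parameter, so generated check names read "Check that field x has a minimum of <built-in function min>"; the fix makes the name report the configured threshold.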
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 import avro.schema
 
 from datacontract.imports.importer import Importer
@@ -6,13 +8,39 @@ from datacontract.model.exceptions import DataContractException
 
 
 class AvroImporter(Importer):
+    """Class to import Avro Schema file"""
+
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
+        """
+        Import Avro schema from a source file.
+
+        Args:
+            data_contract_specification: The data contract specification to update.
+            source: The path to the Avro schema file.
+            import_args: Additional import arguments.
+
+        Returns:
+            The updated data contract specification.
+        """
         return import_avro(data_contract_specification, source)
 
 
 def import_avro(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Import an Avro schema from a file and update the data contract specification.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: The path to the Avro schema file.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+
+    Raises:
+        DataContractException: If there's an error parsing the Avro schema.
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
@@ -45,7 +73,14 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
-def handle_config_avro_custom_properties(field, imported_field):
+def handle_config_avro_custom_properties(field: avro.schema.Field, imported_field: Field) -> None:
+    """
+    Handle custom Avro properties and add them to the imported field's config.
+
+    Args:
+        field: The Avro field.
+        imported_field: The imported field to update.
+    """
     if field.get_prop("logicalType") is not None:
         if imported_field.config is None:
             imported_field.config = {}
@@ -57,7 +92,16 @@ def handle_config_avro_custom_properties(field, imported_field):
         imported_field.config["avroDefault"] = field.default
 
 
-def import_record_fields(record_fields):
+def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
+    """
+    Import Avro record fields and convert them to data contract fields.
+
+    Args:
+        record_fields: List of Avro record fields.
+
+    Returns:
+        A dictionary of imported fields.
+    """
     imported_fields = {}
     for field in record_fields:
         imported_field = Field()
@@ -83,6 +127,15 @@ def import_record_fields(record_fields):
         elif field.type.type == "array":
             imported_field.type = "array"
             imported_field.items = import_avro_array_items(field.type)
+        elif field.type.type == "map":
+            imported_field.type = "map"
+            imported_field.values = import_avro_map_values(field.type)
+        elif field.type.type == "enum":
+            imported_field.type = "string"
+            imported_field.enum = field.type.symbols
+            if not imported_field.config:
+                imported_field.config = {}
+            imported_field.config["avroType"] = "enum"
         else:  # primitive type
             imported_field.type = map_type_from_avro(field.type.type)
 
@@ -91,7 +144,16 @@ def import_record_fields(record_fields):
     return imported_fields
 
 
-def import_avro_array_items(array_schema):
+def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> Field:
+    """
+    Import Avro array items and convert them to a data contract field.
+
+    Args:
+        array_schema: The Avro array schema.
+
+    Returns:
+        Field: The imported field representing the array items.
+    """
     items = Field()
     for prop in array_schema.other_props:
         items.__setattr__(prop, array_schema.other_props[prop])
@@ -108,7 +170,45 @@ def import_avro_array_items(array_schema):
     return items
 
 
-def import_type_of_optional_field(field):
+def import_avro_map_values(map_schema: avro.schema.MapSchema) -> Field:
+    """
+    Import Avro map values and convert them to a data contract field.
+
+    Args:
+        map_schema: The Avro map schema.
+
+    Returns:
+        Field: The imported field representing the map values.
+    """
+    values = Field()
+    for prop in map_schema.other_props:
+        values.__setattr__(prop, map_schema.other_props[prop])
+
+    if map_schema.values.type == "record":
+        values.type = "object"
+        values.fields = import_record_fields(map_schema.values.fields)
+    elif map_schema.values.type == "array":
+        values.type = "array"
+        values.items = import_avro_array_items(map_schema.values)
+    else:  # primitive type
+        values.type = map_type_from_avro(map_schema.values.type)
+
+    return values
+
+
+def import_type_of_optional_field(field: avro.schema.Field) -> str:
+    """
+    Determine the type of optional field in an Avro union.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        str: The mapped type of the non-null field in the union.
+
+    Raises:
+        DataContractException: If no non-null type is found in the union.
+    """
     for field_type in field.type.schemas:
         if field_type.type != "null":
             return map_type_from_avro(field_type.type)
@@ -121,21 +221,51 @@ def import_type_of_optional_field(field):
     )
 
 
-def get_record_from_union_field(field):
+def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordSchema | None:
+    """
+    Get the record schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The record schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "record":
             return field_type
     return None
 
 
-def get_array_from_union_field(field):
+def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySchema | None:
+    """
+    Get the array schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The array schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "array":
             return field_type
     return None
 
 
-def map_type_from_avro(avro_type_str: str):
+def map_type_from_avro(avro_type_str: str) -> str:
+    """
+    Map Avro type strings to data contract type strings.
+
+    Args:
+        avro_type_str (str): The Avro type string.
+
+    Returns:
+        str: The corresponding data contract type string.
+
+    Raises:
+        DataContractException: If the Avro type is unsupported.
+    """
     # TODO: ambiguous mapping in the export
     if avro_type_str == "null":
         return "null"
@@ -155,6 +285,10 @@ def map_type_from_avro(avro_type_str: str):
         return "record"
     elif avro_type_str == "array":
         return "array"
+    elif avro_type_str == "map":
+        return "map"
+    elif avro_type_str == "enum":
+        return "string"
     else:
         raise DataContractException(
             type="schema",
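With the new branches, map and enum fields are picked up instead of failing as unsupported types: a map becomes a field of type map whose values are imported recursively, and an enum is flattened to a string that keeps its symbols in enum and records the original Avro type under config["avroType"]. A small, hypothetical schema exercising both branches (not taken from the test suite):

import avro.schema

# Hypothetical record with a map field and an enum field.
schema = avro.schema.parse("""
{
  "type": "record",
  "name": "Order",
  "fields": [
    {"name": "labels", "type": {"type": "map", "values": "string"}},
    {"name": "status", "type": {"type": "enum", "name": "Status", "symbols": ["OPEN", "SHIPPED"]}}
  ]
}
""")
# Expected import result (abridged): labels -> type "map", values.type "string";
# status -> type "string", enum ["OPEN", "SHIPPED"], config {"avroType": "enum"}.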
@@ -0,0 +1,117 @@
+import json
+
+from typing import (
+    List,
+)
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+
+
+class DbtManifestImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        data = read_dbt_manifest(manifest_path=source)
+        return import_dbt_manifest(
+            data_contract_specification, manifest_dict=data, dbt_models=import_args.get("dbt_model")
+        )
+
+
+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification, manifest_dict: dict, dbt_models: List[str]
+):
+    data_contract_specification.info.title = manifest_dict.get("info").get("project_name")
+    data_contract_specification.info.dbt_version = manifest_dict.get("info").get("dbt_version")
+
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    for model in manifest_dict.get("models", []):
+        if dbt_models and model.name not in dbt_models:
+            continue
+
+        dc_model = Model(
+            description=model.description,
+            tags=model.tags,
+            fields=create_fields(model.columns),
+        )
+
+        data_contract_specification.models[model.name] = dc_model
+
+    return data_contract_specification
+
+
+def create_fields(columns: List):
+    fields = {}
+    for column in columns:
+        field = Field(
+            description=column.description, type=column.data_type if column.data_type else "", tags=column.tags
+        )
+        fields[column.name] = field
+
+    return fields
+
+
+def read_dbt_manifest(manifest_path: str):
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        manifest = json.load(f)
+        return {"info": manifest.get("metadata"), "models": create_manifest_models(manifest)}
+
+
+def create_manifest_models(manifest: dict) -> List:
+    models = []
+    nodes = manifest.get("nodes")
+
+    for node in nodes.values():
+        if node["resource_type"] != "model":
+            continue
+
+        models.append(DbtModel(node))
+    return models
+
+
+class DbtColumn:
+    name: str
+    description: str
+    data_type: str
+    meta: dict
+    tags: List
+
+    def __init__(self, node_column: dict):
+        self.name = node_column.get("name")
+        self.description = node_column.get("description")
+        self.data_type = node_column.get("data_type")
+        self.meta = node_column.get("meta", {})
+        self.tags = node_column.get("tags", [])
+
+    def __repr__(self) -> str:
+        return self.name
+
+
+class DbtModel:
+    name: str
+    database: str
+    schema: str
+    description: str
+    unique_id: str
+    tags: List
+
+    def __init__(self, node: dict):
+        self.name = node.get("name")
+        self.database = node.get("database")
+        self.schema = node.get("schema")
+        self.description = node.get("description")
+        self.display_name = node.get("display_name")
+        self.unique_id = node.get("unique_id")
+        self.columns = []
+        self.tags = node.get("tags")
+        if node.get("columns"):
+            self.add_columns(node.get("columns").values())
+
+    def add_columns(self, model_columns: List):
+        for column in model_columns:
+            self.columns.append(DbtColumn(column))
+
+    def __repr__(self) -> str:
+        return self.name
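The new importer can also be driven directly, which is handy for inspecting what the manifest parsing yields. A minimal sketch, assuming a manifest produced by dbt at target/manifest.json and a default-constructed specification (the CLI normally seeds one from its init template):

from datacontract.imports.dbt_importer import import_dbt_manifest, read_dbt_manifest
from datacontract.model.data_contract_specification import DataContractSpecification

manifest = read_dbt_manifest("target/manifest.json")
print([model.name for model in manifest["models"]])  # every dbt model found in the manifest

# Restrict the import to selected models; passing None or an empty list imports all of them.
spec = import_dbt_manifest(DataContractSpecification(), manifest, dbt_models=["orders"])
print(spec.to_yaml())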
@@ -14,7 +14,7 @@ class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> dict:
-        return import_glue(data_contract_specification, source, import_args.get("glue_tables"))
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
 def get_glue_database(database_name: str):
@@ -154,7 +154,7 @@ def import_glue(
        for column in table_schema:
            field = create_typed_field(column["Type"])
 
-            # hive partitons are required, but are not primary keys
+            # hive partitions are required, but are not primary keys
            if column.get("Hive"):
                field.required = True
 
@@ -10,7 +10,10 @@ class Importer(ABC):
 
     @abstractmethod
     def import_source(
-        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
     ) -> dict:
         pass
 
@@ -18,11 +21,13 @@ class Importer(ABC):
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    dbt = "dbt"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"
+    spark = "spark"
 
     @classmethod
     def get_suported_formats(cls):
@@ -18,7 +18,7 @@ class ImporterFactory:
         importers = self.dict_importer.copy()
         importers.update(self.dict_lazy_importer.copy())
         if name not in importers.keys():
-            raise ValueError(f"The '{name}' format is not suportted.")
+            raise ValueError(f"The '{name}' format is not supported.")
         importer_class = importers[name]
         if type(importers[name]) is tuple:
             importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
@@ -46,7 +46,9 @@ def load_module_class(module_path, class_name):
 
 importer_factory = ImporterFactory()
 importer_factory.register_lazy_importer(
-    name=ImportFormat.avro, module_path="datacontract.imports.avro_importer", class_name="AvroImporter"
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.bigquery,
@@ -54,7 +56,9 @@ importer_factory.register_lazy_importer(
     class_name="BigQueryImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.glue, module_path="datacontract.imports.glue_importer", class_name="GlueImporter"
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.jsonschema,
@@ -62,11 +66,25 @@ importer_factory.register_lazy_importer(
     class_name="JsonSchemaImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.odcs, module_path="datacontract.imports.odcs_importer", class_name="OdcsImporter"
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.sql, module_path="datacontract.imports.sql_importer", class_name="SqlImporter"
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.unity, module_path="datacontract.imports.unity_importer", class_name="UnityImporter"
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
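Lazy registration keeps a (module_path, class_name) pair per format and defers the actual import to load_module_class, so optional importer dependencies are only loaded when their format is requested. Reduced to a standalone sketch (names are illustrative, not the project's API):

import importlib

_lazy_importers = {"dbt": ("datacontract.imports.dbt_importer", "DbtManifestImporter")}


def load_importer(format_name: str):
    # The importer module is imported only at the moment its format is used.
    module_path, class_name = _lazy_importers[format_name]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)()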
@@ -64,11 +64,14 @@ def convert_json_schema_properties(properties, is_definition=False):
            case "tags":
                field_kwargs["tags"] = value
            case "properties":
-                field_kwargs["fields"] = convert_json_schema_properties(value)
+                field_kwargs["fields"] = convert_json_schema_properties(value, is_definition=is_definition)
            case "items":
-                field_kwargs["items"] = convert_json_schema_properties(value)
+                field_kwargs["items"] = convert_json_schema_properties(value, is_definition=is_definition)
 
-        field = Field(**field_kwargs)
+        if is_definition:
+            field = Definition(**field_kwargs)
+        else:
+            field = Field(**field_kwargs)
         fields[field_name] = field
 
     return fields
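With is_definition now propagated into nested properties and items, objects converted as definitions produce Definition instances at every level of nesting instead of falling back to Field after the first level; ordinary model fields are unaffected because the flag defaults to False.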