datacontract-cli 0.10.12__py3-none-any.whl → 0.10.14__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli might be problematic.

Files changed (37)
  1. datacontract/cli.py +5 -0
  2. datacontract/data_contract.py +9 -1
  3. datacontract/engines/soda/connections/kafka.py +28 -6
  4. datacontract/export/avro_converter.py +8 -1
  5. datacontract/export/avro_idl_converter.py +1 -0
  6. datacontract/export/bigquery_converter.py +30 -23
  7. datacontract/export/data_caterer_converter.py +148 -0
  8. datacontract/export/dcs_exporter.py +6 -0
  9. datacontract/export/exporter.py +5 -1
  10. datacontract/export/exporter_factory.py +19 -1
  11. datacontract/export/jsonschema_converter.py +13 -2
  12. datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +4 -4
  13. datacontract/export/odcs_v3_exporter.py +294 -0
  14. datacontract/export/sodacl_converter.py +82 -2
  15. datacontract/export/spark_converter.py +3 -1
  16. datacontract/export/sql_type_converter.py +56 -21
  17. datacontract/imports/iceberg_importer.py +162 -0
  18. datacontract/imports/importer.py +1 -0
  19. datacontract/imports/importer_factory.py +5 -0
  20. datacontract/imports/odcs_importer.py +25 -168
  21. datacontract/imports/odcs_v2_importer.py +177 -0
  22. datacontract/imports/odcs_v3_importer.py +309 -0
  23. datacontract/imports/spark_importer.py +5 -1
  24. datacontract/imports/unity_importer.py +105 -84
  25. datacontract/integration/datamesh_manager.py +1 -1
  26. datacontract/lint/resolve.py +24 -10
  27. datacontract/lint/resources.py +21 -0
  28. datacontract/lint/urls.py +29 -13
  29. datacontract/model/data_contract_specification.py +72 -8
  30. datacontract/model/odcs.py +11 -0
  31. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA +106 -52
  32. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/RECORD +36 -29
  33. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/WHEEL +1 -1
  34. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  35. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/LICENSE +0 -0
  36. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/entry_points.txt +0 -0
  37. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/top_level.txt +0 -0
datacontract/cli.py CHANGED
@@ -244,6 +244,10 @@ def import_(
             help="List of table names to import from the DBML file (repeat for multiple table names, leave empty for all tables in the file)."
         ),
     ] = None,
+    iceberg_table: Annotated[
+        Optional[str],
+        typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -259,6 +263,7 @@ def import_(
         dbt_model=dbt_model,
         dbml_schema=dbml_schema,
         dbml_table=dbml_table,
+        iceberg_table=iceberg_table,
     )
     console.print(result.to_yaml())
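Since typer derives option names from parameter names, the new parameter should surface on the import command as --iceberg-table. A minimal, hedged sketch of that mechanism in isolation (the parameter name and help text are copied from the hunk above; the rest is illustrative and not the actual datacontract CLI):

    # Standalone illustration of how typer maps the iceberg_table parameter to --iceberg-table.
    from typing import Annotated, Optional

    import typer

    app = typer.Typer()


    @app.command(name="import")
    def import_(
        iceberg_table: Annotated[
            Optional[str],
            typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
        ] = None,
    ):
        # e.g. `python demo.py --iceberg-table orders` prints: iceberg_table=orders
        print(f"iceberg_table={iceberg_table}")


    if __name__ == "__main__":
        app()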
datacontract/data_contract.py CHANGED
@@ -199,7 +199,15 @@ class DataContract:

         except DataContractException as e:
             run.checks.append(
-                Check(type=e.type, result=e.result, name=e.name, reason=e.reason, engine=e.engine, details="")
+                Check(
+                    type=e.type,
+                    name=e.name,
+                    result=e.result,
+                    reason=e.reason,
+                    model=e.model,
+                    engine=e.engine,
+                    details="",
+                )
             )
             run.log_error(str(e))
         except Exception as e:
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -25,9 +25,10 @@ def create_spark_session(tmp_dir: str):
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
         .config("spark.streaming.stopGracefullyOnShutdown", "true")
+        .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2,org.apache.spark:spark-avro_2.12:3.5.2",
         )
         .getOrCreate()
     )
@@ -111,17 +112,38 @@ def get_auth_options():
     """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
+    kafka_sasl_mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()

-    if kafka_sasl_username is None:
+    # Skip authentication if credentials are not provided
+    if not kafka_sasl_username or not kafka_sasl_password:
         return {}

-    return {
-        "kafka.sasl.mechanism": "PLAIN",
-        "kafka.security.protocol": "SASL_SSL",
-        "kafka.sasl.jaas.config": (
+    # SASL mechanisms supported by Kafka
+    jaas_config = {
+        "PLAIN": (
             f"org.apache.kafka.common.security.plain.PlainLoginModule required "
             f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
         ),
+        "SCRAM-SHA-256": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-512": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        # Add more mechanisms as needed
+    }
+
+    # Validate SASL mechanism
+    if kafka_sasl_mechanism not in jaas_config:
+        raise ValueError(f"Unsupported SASL mechanism: {kafka_sasl_mechanism}")
+
+    # Return config
+    return {
+        "kafka.sasl.mechanism": kafka_sasl_mechanism,
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": jaas_config[kafka_sasl_mechanism],
     }

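A hedged sketch of how the reworked get_auth_options() behaves with the new mechanism selection, assuming the package and its optional Kafka/Spark dependencies are installed (environment variable names come from the hunk above; credentials are placeholders):

    import os

    # Placeholder credentials; DATACONTRACT_KAFKA_SASL_MECHANISM defaults to PLAIN when unset.
    os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "demo-user"
    os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "demo-secret"
    os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "scram-sha-512"  # upper-cased by the function

    from datacontract.engines.soda.connections.kafka import get_auth_options

    options = get_auth_options()
    assert options["kafka.sasl.mechanism"] == "SCRAM-SHA-512"
    assert options["kafka.security.protocol"] == "SASL_SSL"
    # An unsupported mechanism (e.g. GSSAPI) now raises ValueError instead of silently using PLAIN.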
datacontract/export/avro_converter.py CHANGED
@@ -81,9 +81,16 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return "null"
     if field.type in ["string", "varchar", "text"]:
         return "string"
-    elif field.type in ["number", "decimal", "numeric"]:
+    elif field.type in ["number", "numeric"]:
         # https://avro.apache.org/docs/1.11.1/specification/#decimal
         return "bytes"
+    elif field.type in ["decimal"]:
+        typeVal = {"type": "bytes", "logicalType": "decimal"}
+        if field.scale is not None:
+            typeVal["scale"] = field.scale
+        if field.precision is not None:
+            typeVal["precision"] = field.precision
+        return typeVal
     elif field.type in ["float", "double"]:
         return "double"
     elif field.type in ["integer", "int"]:
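For illustration, a hedged sketch of what the new decimal branch returns (assuming Field accepts precision and scale keyword arguments, as the attribute access above suggests):

    from datacontract.export.avro_converter import to_avro_type
    from datacontract.model.data_contract_specification import Field

    # decimal now maps to bytes with the Avro decimal logical type plus any declared precision/scale
    print(to_avro_type(Field(type="decimal", precision=10, scale=2), "price"))
    # -> {'type': 'bytes', 'logicalType': 'decimal', 'scale': 2, 'precision': 10}

    # number/numeric still map to plain bytes, as before
    print(to_avro_type(Field(type="numeric"), "amount"))  # -> 'bytes'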
datacontract/export/avro_idl_converter.py CHANGED
@@ -64,6 +64,7 @@ class AvroIDLProtocol:
     model_types: list[AvroModelType]


+# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py
 avro_primitive_types = set(
     [
         "string",
datacontract/export/bigquery_converter.py CHANGED
@@ -44,7 +44,7 @@ def to_fields_array(fields: Dict[str, Field]) -> List[Dict[str, Field]]:


 def to_field(field_name: str, field: Field) -> dict:
-    bq_type = map_type_to_bigquery(field.type, field_name)
+    bq_type = map_type_to_bigquery(field)
     bq_field = {
         "name": field_name,
         "type": bq_type,
@@ -59,10 +59,8 @@ def to_field(field_name: str, field: Field) -> dict:
             # in case the array type is a complex object, we want to copy all its fields
             bq_field["fields"] = to_fields_array(field.items.fields)
         else:
-            # otherwise we make up a structure that gets us a single field of the specified type
-            bq_field["fields"] = to_fields_array(
-                {f"{field_name}_1": Field(type=field.items.type, required=False, description="")}
-            )
+            bq_field["type"] = map_type_to_bigquery(field.items)
+
     # all of these can carry other fields
     elif bq_type.lower() in ["record", "struct"]:
         bq_field["fields"] = to_fields_array(field.fields)
@@ -79,37 +77,46 @@ def to_field(field_name: str, field: Field) -> dict:
     return bq_field


-def map_type_to_bigquery(type_str: str, field_name: str) -> str:
+def map_type_to_bigquery(field: Field) -> str:
     logger = logging.getLogger(__name__)
-    if type_str.lower() in ["string", "varchar", "text"]:
+
+    field_type = field.type
+    if not field_type:
+        return None
+
+    if field.config and "bigqueryType" in field.config:
+        return field.config["bigqueryType"]
+
+    if field_type.lower() in ["string", "varchar", "text"]:
         return "STRING"
-    elif type_str == "bytes":
+    elif field_type.lower() == "bytes":
         return "BYTES"
-    elif type_str.lower() in ["int", "integer"]:
+    elif field_type.lower() in ["int", "integer"]:
         return "INTEGER"
-    elif type_str.lower() in ["long", "bigint"]:
+    elif field_type.lower() in ["long", "bigint"]:
         return "INT64"
-    elif type_str == "float":
-        return "FLOAT"
-    elif type_str == "boolean":
+    elif field_type.lower() == "float":
+        return "FLOAT64"
+    elif field_type.lower() == "boolean":
         return "BOOL"
-    elif type_str.lower() in ["timestamp", "timestamp_tz"]:
+    elif field_type.lower() in ["timestamp", "timestamp_tz"]:
         return "TIMESTAMP"
-    elif type_str == "date":
+    elif field_type.lower() == "date":
         return "DATE"
-    elif type_str == "timestamp_ntz":
+    elif field_type.lower() == "timestamp_ntz":
         return "TIME"
-    elif type_str.lower() in ["number", "decimal", "numeric"]:
+    elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
-    elif type_str == "double":
+    elif field_type.lower() == "double":
         return "BIGNUMERIC"
-    elif type_str.lower() in ["object", "record", "array"]:
+    elif field_type.lower() in ["object", "record", "array"]:
         return "RECORD"
-    elif type_str == "struct":
+    elif field_type.lower() == "struct":
         return "STRUCT"
-    elif type_str == "null":
+    elif field_type.lower() == "null":
         logger.info(
-            f"Can't properly map {field_name} to bigquery Schema, as 'null' is not supported as a type. Mapping it to STRING."
+            f"Can't properly map {field.title} to bigquery Schema, as 'null' \
+                is not supported as a type. Mapping it to STRING."
         )
         return "STRING"
     else:
@@ -117,6 +124,6 @@ def map_type_to_bigquery(type_str: str, field_name: str) -> str:
             type="schema",
             result="failed",
             name="Map datacontract type to bigquery data type",
-            reason=f"Unsupported type {type_str} in data contract definition.",
+            reason=f"Unsupported type {field_type} in data contract definition.",
             engine="datacontract",
         )
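A hedged sketch of the reworked mapping, which now receives the whole Field so a bigqueryType entry in the field's config can override the type-based mapping (Field keyword arguments assumed from the attribute access in the hunk above):

    from datacontract.export.bigquery_converter import map_type_to_bigquery
    from datacontract.model.data_contract_specification import Field

    print(map_type_to_bigquery(Field(type="float")))   # FLOAT64 (previously FLOAT)
    print(map_type_to_bigquery(Field(type="string")))  # STRING
    # The new config override takes precedence over the built-in mapping:
    print(map_type_to_bigquery(Field(type="string", config={"bigqueryType": "JSON"})))  # JSON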
datacontract/export/data_caterer_converter.py ADDED
@@ -0,0 +1,148 @@
+from typing import Dict
+
+import yaml
+
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Server
+
+
+class DataCatererExporter(Exporter):
+    """
+    Exporter class for Data Caterer.
+    Creates a YAML file, based on the data contract, for Data Caterer to generate synthetic data.
+    """
+
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_data_caterer_generate_yaml(data_contract, server)
+
+
+def to_data_caterer_generate_yaml(data_contract_spec: DataContractSpecification, server):
+    generation_task = {"name": data_contract_spec.info.title, "steps": []}
+    server_info = _get_server_info(data_contract_spec, server)
+
+    for model_key, model_value in data_contract_spec.models.items():
+        odcs_table = _to_data_caterer_generate_step(model_key, model_value, server_info)
+        generation_task["steps"].append(odcs_table)
+    return yaml.dump(generation_task, indent=2, sort_keys=False, allow_unicode=True)
+
+
+def _get_server_info(data_contract_spec: DataContractSpecification, server):
+    if server is not None and server in data_contract_spec.servers:
+        return data_contract_spec.servers.get(server)
+    elif server is not None:
+        raise Exception(f"Server name not found in servers list in data contract, server-name={server}")
+    elif len(data_contract_spec.servers.keys()) > 0:
+        return next(iter(data_contract_spec.servers.values()))
+    else:
+        return None
+
+
+def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server) -> dict:
+    step = {
+        "name": model_key,
+        "type": _to_step_type(server),
+        "options": _to_data_source_options(model_key, server),
+        "schema": [],
+    }
+    fields = _to_fields(model_value.fields)
+    if fields:
+        step["schema"] = fields
+    return step
+
+
+def _to_step_type(server: Server):
+    if server is not None and server.type is not None:
+        if server.type in ["s3", "gcs", "azure", "local"]:
+            return server.format
+        else:
+            return server.type
+    else:
+        return "csv"
+
+
+def _to_data_source_options(model_key, server: Server):
+    options = {}
+    if server is not None and server.type is not None:
+        if server.type in ["s3", "gcs", "azure", "local"]:
+            if server.path is not None:
+                options["path"] = server.path
+            elif server.location is not None:
+                options["path"] = server.location
+            else:
+                options["path"] = "/tmp/data_caterer_data"
+        elif server.type == "postgres":
+            options["schema"] = server.schema_
+            options["table"] = model_key
+        elif server.type == "kafka":
+            options["topic"] = server.topic
+
+    return options
+
+
+def _to_fields(fields: Dict[str, Field]) -> list:
+    dc_fields = []
+    for field_name, field in fields.items():
+        column = _to_field(field_name, field)
+        dc_fields.append(column)
+    return dc_fields
+
+
+def _to_field(field_name: str, field: Field) -> dict:
+    dc_field = {"name": field_name}
+    dc_generator_opts = {}
+
+    if field.type is not None:
+        new_type = _to_data_type(field.type)
+        dc_field["type"] = _to_data_type(field.type)
+        if new_type == "object" or new_type == "record" or new_type == "struct":
+            # need to get nested field definitions
+            nested_fields = _to_fields(field.fields)
+            dc_field["schema"] = {"fields": nested_fields}
+
+    if field.enum is not None and len(field.enum) > 0:
+        dc_generator_opts["oneOf"] = field.enum
+    if field.unique is not None and field.unique:
+        dc_generator_opts["isUnique"] = field.unique
+    if field.minLength is not None:
+        dc_generator_opts["minLength"] = field.minLength
+    if field.maxLength is not None:
+        dc_generator_opts["maxLength"] = field.maxLength
+    if field.pattern is not None:
+        dc_generator_opts["regex"] = field.pattern
+    if field.minimum is not None:
+        dc_generator_opts["min"] = field.minimum
+    if field.maximum is not None:
+        dc_generator_opts["max"] = field.maximum
+
+    if len(dc_generator_opts.keys()) > 0:
+        dc_field["generator"] = {"options": dc_generator_opts}
+    return dc_field
+
+
+def _to_data_type(data_type):
+    if data_type == "number" or data_type == "numeric" or data_type == "double":
+        return "double"
+    elif data_type == "decimal" or data_type == "bigint":
+        return "decimal"
+    elif data_type == "int":
+        return "integer"
+    elif data_type == "long":
+        return "long"
+    elif data_type == "float":
+        return "float"
+    elif data_type == "string" or data_type == "text" or data_type == "varchar":
+        return "string"
+    if data_type == "boolean":
+        return "boolean"
+    if data_type == "timestamp" or data_type == "timestamp_tz" or data_type == "timestamp_ntz":
+        return "timestamp"
+    elif data_type == "date":
+        return "date"
+    elif data_type == "array":
+        return "array"
+    elif data_type == "map" or data_type == "object" or data_type == "record" or data_type == "struct":
+        return "struct"
+    elif data_type == "bytes":
+        return "binary"
+    else:
+        return "string"
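A hedged sketch of the new exporter's output for a tiny contract. The Info/Model/Field constructor arguments are assumptions about the spec's pydantic models; with no server given, the step type falls back to "csv" as coded above:

    from datacontract.export.data_caterer_converter import to_data_caterer_generate_yaml
    from datacontract.model.data_contract_specification import DataContractSpecification, Field, Info, Model

    spec = DataContractSpecification(
        info=Info(title="orders"),
        models={"orders": Model(fields={"order_id": Field(type="string", unique=True)})},
    )
    print(to_data_caterer_generate_yaml(spec, server=None))
    # Roughly:
    # name: orders
    # steps:
    # - name: orders
    #   type: csv
    #   options: {}
    #   schema:
    #   - name: order_id
    #     type: string
    #     generator:
    #       options:
    #         isUnique: true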
datacontract/export/dcs_exporter.py ADDED
@@ -0,0 +1,6 @@
+from datacontract.export.exporter import Exporter
+
+
+class DcsExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return data_contract.to_yaml()
datacontract/export/exporter.py CHANGED
@@ -10,7 +10,7 @@ class Exporter(ABC):
         self.export_format = export_format

     @abstractmethod
-    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict | str:
         pass


@@ -22,6 +22,8 @@ class ExportFormat(str, Enum):
     dbt_sources = "dbt-sources"
     dbt_staging_sql = "dbt-staging-sql"
     odcs = "odcs"
+    odcs_v2 = "odcs_v2"
+    odcs_v3 = "odcs_v3"
     rdf = "rdf"
     avro = "avro"
     protobuf = "protobuf"
@@ -36,6 +38,8 @@ class ExportFormat(str, Enum):
     dbml = "dbml"
     spark = "spark"
     sqlalchemy = "sqlalchemy"
+    data_caterer = "data-caterer"
+    dcs = "dcs"

     @classmethod
     def get_supported_formats(cls):
datacontract/export/exporter_factory.py CHANGED
@@ -62,6 +62,12 @@ exporter_factory.register_lazy_exporter(
     class_name="BigQueryExporter",
 )

+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.data_caterer,
+    module_path="datacontract.export.data_caterer_converter",
+    class_name="DataCatererExporter",
+)
+
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.dbml, module_path="datacontract.export.dbml_converter", class_name="DbmlExporter"
 )
@@ -93,7 +99,15 @@ exporter_factory.register_lazy_exporter(
 )

 exporter_factory.register_lazy_exporter(
-    name=ExportFormat.odcs, module_path="datacontract.export.odcs_converter", class_name="OdcsExporter"
+    name=ExportFormat.odcs_v2, module_path="datacontract.export.odcs_v2_exporter", class_name="OdcsV2Exporter"
+)
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.odcs_v3, module_path="datacontract.export.odcs_v3_exporter", class_name="OdcsV3Exporter"
+)
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.odcs, module_path="datacontract.export.odcs_v3_exporter", class_name="OdcsV3Exporter"
 )

 exporter_factory.register_lazy_exporter(
@@ -149,3 +163,7 @@ exporter_factory.register_lazy_exporter(
     module_path="datacontract.export.sqlalchemy_converter",
     class_name="SQLAlchemyExporter",
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.dcs, module_path="datacontract.export.dcs_exporter", class_name="DcsExporter"
+)
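The practical effect of these registrations, shown as a short sketch against the ExportFormat enum (values taken from the exporter.py hunk above):

    from datacontract.export.exporter import ExportFormat

    print(ExportFormat.odcs_v2.value)       # odcs_v2
    print(ExportFormat.odcs_v3.value)       # odcs_v3
    print(ExportFormat.data_caterer.value)  # data-caterer
    print(ExportFormat.dcs.value)           # dcs
    # ExportFormat.odcs itself is unchanged, but the factory above now resolves it to
    # OdcsV3Exporter instead of the former OdcsExporter.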
datacontract/export/jsonschema_converter.py CHANGED
@@ -36,7 +36,19 @@ def to_property(field: Field) -> dict:
     property = {}
     json_type, json_format = convert_type_format(field.type, field.format)
     if json_type is not None:
-        property["type"] = json_type
+        if not field.required:
+            """
+            From: https://json-schema.org/understanding-json-schema/reference/type
+            The type keyword may either be a string or an array:
+
+            If it's a string, it is the name of one of the basic types above.
+            If it is an array, it must be an array of strings, where each string
+            is the name of one of the basic types, and each element is unique.
+            In this case, the JSON snippet is valid if it matches any of the given types.
+            """
+            property["type"] = [json_type, "null"]
+        else:
+            property["type"] = json_type
     if json_format is not None:
         property["format"] = json_format
     if field.unique:
@@ -50,7 +62,6 @@ def to_property(field: Field) -> dict:
         property["required"] = to_required(field.fields)
     if json_type == "array":
         property["items"] = to_property(field.items)
-
     if field.pattern:
         property["pattern"] = field.pattern
     if field.enum:
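A hedged sketch of the nullable-type behaviour added to to_property (Field keyword arguments assumed):

    from datacontract.export.jsonschema_converter import to_property
    from datacontract.model.data_contract_specification import Field

    print(to_property(Field(type="string", required=False))["type"])  # ['string', 'null']
    print(to_property(Field(type="string", required=True))["type"])   # 'string'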
datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} RENAMED
@@ -6,12 +6,12 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.export.exporter import Exporter


-class OdcsExporter(Exporter):
+class OdcsV2Exporter(Exporter):
     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-        return to_odcs_yaml(data_contract)
+        return to_odcs_v2_yaml(data_contract)


-def to_odcs_yaml(data_contract_spec: DataContractSpecification):
+def to_odcs_v2_yaml(data_contract_spec: DataContractSpecification):
     odcs = {
         "kind": "DataContract",
         "apiVersion": "2.3.0",
@@ -25,7 +25,7 @@ def to_odcs_yaml(data_contract_spec: DataContractSpecification):
     if data_contract_spec.info.contact is not None:
         if data_contract_spec.info.contact.email is not None:
             odcs["productDl"] = data_contract_spec.info.contact.email
-        if data_contract_spec.info.contact.email is not None:
+        if data_contract_spec.info.contact.url is not None:
             odcs["productFeedbackUrl"] = data_contract_spec.info.contact.url

     if data_contract_spec.terms is not None: