datacontract-cli 0.10.7__py3-none-any.whl → 0.10.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/catalog/catalog.py +4 -2
- datacontract/cli.py +44 -15
- datacontract/data_contract.py +52 -206
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +13 -1
- datacontract/engines/soda/check_soda_execute.py +9 -2
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/duckdb.py +28 -12
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/__init__.py +0 -0
- datacontract/export/avro_converter.py +15 -3
- datacontract/export/avro_idl_converter.py +29 -22
- datacontract/export/bigquery_converter.py +15 -0
- datacontract/export/dbml_converter.py +9 -0
- datacontract/export/dbt_converter.py +26 -1
- datacontract/export/exporter.py +88 -0
- datacontract/export/exporter_factory.py +145 -0
- datacontract/export/go_converter.py +6 -0
- datacontract/export/great_expectations_converter.py +10 -0
- datacontract/export/html_export.py +6 -0
- datacontract/export/jsonschema_converter.py +31 -23
- datacontract/export/odcs_converter.py +24 -1
- datacontract/export/protobuf_converter.py +6 -0
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +9 -0
- datacontract/export/sodacl_converter.py +23 -12
- datacontract/export/spark_converter.py +211 -0
- datacontract/export/sql_converter.py +32 -2
- datacontract/export/sql_type_converter.py +32 -5
- datacontract/export/terraform_converter.py +6 -0
- datacontract/imports/avro_importer.py +8 -0
- datacontract/imports/bigquery_importer.py +47 -4
- datacontract/imports/glue_importer.py +122 -30
- datacontract/imports/importer.py +29 -0
- datacontract/imports/importer_factory.py +72 -0
- datacontract/imports/jsonschema_importer.py +8 -0
- datacontract/imports/odcs_importer.py +200 -0
- datacontract/imports/sql_importer.py +8 -0
- datacontract/imports/unity_importer.py +152 -0
- datacontract/lint/resolve.py +22 -1
- datacontract/model/data_contract_specification.py +36 -4
- datacontract/templates/datacontract.html +17 -2
- datacontract/templates/partials/datacontract_information.html +20 -0
- datacontract/templates/partials/datacontract_terms.html +7 -0
- datacontract/templates/partials/definition.html +9 -1
- datacontract/templates/partials/model_field.html +23 -6
- datacontract/templates/partials/server.html +113 -48
- datacontract/templates/style/output.css +51 -0
- datacontract/web.py +17 -0
- {datacontract_cli-0.10.7.dist-info → datacontract_cli-0.10.9.dist-info}/METADATA +298 -59
- datacontract_cli-0.10.9.dist-info/RECORD +93 -0
- {datacontract_cli-0.10.7.dist-info → datacontract_cli-0.10.9.dist-info}/WHEEL +1 -1
- datacontract_cli-0.10.7.dist-info/RECORD +0 -84
- {datacontract_cli-0.10.7.dist-info → datacontract_cli-0.10.9.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.7.dist-info → datacontract_cli-0.10.9.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.7.dist-info → datacontract_cli-0.10.9.dist-info}/top_level.txt +0 -0
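The headline change across these files is a pluggable importer/exporter architecture: format dispatch moves out of data_contract.py (-206 lines) into the new exporter_factory.py and importer_factory.py modules. As a quick orientation, the supported import formats can be read off the new enum shown in the importer.py hunk below (a minimal sketch, assuming datacontract-cli 0.10.9 is installed):

from datacontract.imports.importer import ImportFormat

# Note: the method name is misspelled ("suported") in the release itself.
print(ImportFormat.get_suported_formats())
# ['sql', 'avro', 'glue', 'jsonschema', 'bigquery', 'odcs', 'unity']
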
datacontract/imports/glue_importer.py

@@ -1,6 +1,7 @@
 import boto3
 from typing import List
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
     Model,
@@ -9,7 +10,14 @@ from datacontract.model.data_contract_specification import (
 )
 
 
-def get_glue_database(datebase_name: str):
+class GlueImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_glue(data_contract_specification, source, import_args.get("glue_tables"))
+
+
+def get_glue_database(database_name: str):
     """Get the details Glue database.
 
     Args:
@@ -18,31 +26,32 @@ def get_glue_database(datebase_name: str):
     Returns:
         set: catalogid and locationUri
     """
-
     glue = boto3.client("glue")
     try:
-        response = glue.get_database(Name=datebase_name)
+        response = glue.get_database(Name=database_name)
     except glue.exceptions.EntityNotFoundException:
-        print(f"Database not found {datebase_name}.")
+        print(f"Database not found {database_name}.")
         return (None, None)
     except Exception as e:
         # todo catch all
         print(f"Error: {e}")
         return (None, None)
 
-    return (
+    return (
+        response["Database"]["CatalogId"],
+        response["Database"].get("LocationUri", "None"),
+    )
 
 
 def get_glue_tables(database_name: str) -> List[str]:
     """Get the list of tables in a Glue database.
 
     Args:
-        database_name (str):
+        database_name (str): Glue database to request.
 
     Returns:
-        List[
+        List[str]: List of table names
     """
-
     glue = boto3.client("glue")
 
     # Set the paginator
@@ -107,9 +116,21 @@ def get_glue_table_schema(database_name: str, table_name: str):
     return table_schema
 
 
-def import_glue(
-
+def import_glue(
+    data_contract_specification: DataContractSpecification,
+    source: str,
+    table_names: List[str],
+):
+    """Import the schema of a Glue database.
 
+    Args:
+        data_contract_specification (DataContractSpecification): The data contract specification to update.
+        source (str): The name of the Glue database.
+        table_names (List[str]): List of table names to import. If None, all tables in the database are imported.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
     catalogid, location_uri = get_glue_database(source)
 
     # something went wrong
@@ -131,17 +152,21 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
 
     fields = {}
     for column in table_schema:
-        field = Field()
-        field.type = map_type_from_sql(column["Type"])
+        field = create_typed_field(column["Type"])
 
         # hive partitons are required, but are not primary keys
         if column.get("Hive"):
             field.required = True
 
         field.description = column.get("Comment")
-
         fields[column["Name"]] = field
 
+        if "decimal" in column["Type"]:
+            # Extract precision and scale from the string
+            perc_scale = column["Type"][8:-1].split(",")
+            field.precision = int(perc_scale[0])
+            field.scale = int(perc_scale[1])
+
     data_contract_specification.models[table_name] = Model(
         type="table",
         fields=fields,
@@ -150,35 +175,102 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
-def
+def create_typed_field(dtype: str) -> Field:
+    """Create a typed field based on the given data type.
+
+    Args:
+        dtype (str): The data type of the field.
+
+    Returns:
+        Field: The created field with the appropriate type.
+    """
+    field = Field()
+    dtype = dtype.strip().lower().replace(" ", "")
+    if dtype.startswith(("array", "struct")):
+        orig_dtype: str = dtype
+        if dtype.startswith("array"):
+            field.type = "array"
+            field.items = create_typed_field(orig_dtype[6:-1])
+        elif dtype.startswith("struct"):
+            field.type = "struct"
+            for f in split_struct(orig_dtype[7:-1]):
+                field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
+    else:
+        field.type = map_type_from_sql(dtype)
+    return field
+
+
+def split_fields(s: str):
+    """Split a string of fields considering nested structures.
+
+    Args:
+        s (str): The string to split.
+
+    Yields:
+        str: The next field in the string.
+    """
+    counter: int = 0
+    last: int = 0
+    for i, x in enumerate(s):
+        if x in ("<", "("):
+            counter += 1
+        elif x in (">", ")"):
+            counter -= 1
+        elif x == "," and counter == 0:
+            yield s[last:i]
+            last = i + 1
+    yield s[last:]
+
+
+def split_struct(s: str) -> List[str]:
+    """Split a struct string into individual fields.
+
+    Args:
+        s (str): The struct string to split.
+
+    Returns:
+        List[str]: List of individual fields in the struct.
+    """
+    return list(split_fields(s=s))
+
+
+def map_type_from_sql(sql_type: str) -> str:
+    """Map an SQL type to a corresponding field type.
+
+    Args:
+        sql_type (str): The SQL type to map.
+
+    Returns:
+        str: The corresponding field type.
+    """
     if sql_type is None:
         return None
 
-
+    sql_type = sql_type.lower()
+    if sql_type.startswith("varchar"):
         return "varchar"
-    if sql_type.
+    if sql_type.startswith("string"):
         return "string"
-    if sql_type.
+    if sql_type.startswith("text"):
         return "text"
-
+    if sql_type.startswith("byte"):
         return "byte"
-
+    if sql_type.startswith("short"):
         return "short"
-
+    if sql_type.startswith("integer") or sql_type.startswith("int"):
         return "integer"
-
+    if sql_type.startswith("long") or sql_type.startswith("bigint"):
         return "long"
-
-        return "long"
-    elif sql_type.lower().startswith("float"):
+    if sql_type.startswith("float"):
         return "float"
-
+    if sql_type.startswith("double"):
         return "double"
-
+    if sql_type.startswith("boolean"):
         return "boolean"
-
+    if sql_type.startswith("timestamp"):
         return "timestamp"
-
+    if sql_type.startswith("date"):
         return "date"
-
-    return "
+    if sql_type.startswith("decimal"):
        return "decimal"
+    return "variant"
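Taken together, these hunks change the Glue importer from a flat map_type_from_sql lookup to a recursive walk over nested Hive types, with a new "variant" fallback for anything unrecognized. A minimal sketch of the new behaviour, assuming datacontract-cli 0.10.9 is installed (the expected values in the comments are derived from the code above):

from datacontract.imports.glue_importer import (
    create_typed_field,
    map_type_from_sql,
    split_struct,
)

# split_struct splits on top-level commas only, so nested generics survive:
print(split_struct("id:int,tags:array<string>"))
# ['id:int', 'tags:array<string>']

field = create_typed_field("struct<id:int,tags:array<string>>")
print(field.type)                       # struct
print(field.fields["id"].type)          # integer
print(field.fields["tags"].type)        # array
print(field.fields["tags"].items.type)  # string

# Unrecognized types now fall back to "variant":
print(map_type_from_sql("binary"))      # variant
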
datacontract/imports/importer.py

@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+
+from datacontract.model.data_contract_specification import DataContractSpecification
+
+
+class Importer(ABC):
+    def __init__(self, import_format) -> None:
+        self.import_format = import_format
+
+    @abstractmethod
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        pass
+
+
+class ImportFormat(str, Enum):
+    sql = "sql"
+    avro = "avro"
+    glue = "glue"
+    jsonschema = "jsonschema"
+    bigquery = "bigquery"
+    odcs = "odcs"
+    unity = "unity"
+
+    @classmethod
+    def get_suported_formats(cls):
+        return list(map(lambda c: c.value, cls))
datacontract/imports/importer_factory.py

@@ -0,0 +1,72 @@
+import importlib.util
+import sys
+from datacontract.imports.importer import ImportFormat, Importer
+
+
+class ImporterFactory:
+    def __init__(self):
+        self.dict_importer = {}
+        self.dict_lazy_importer = {}
+
+    def register_importer(self, name, importer: Importer):
+        self.dict_importer.update({name: importer})
+
+    def register_lazy_importer(self, name: str, module_path: str, class_name: str):
+        self.dict_lazy_importer.update({name: (module_path, class_name)})
+
+    def create(self, name) -> Importer:
+        importers = self.dict_importer.copy()
+        importers.update(self.dict_lazy_importer.copy())
+        if name not in importers.keys():
+            raise ValueError(f"The '{name}' format is not suportted.")
+        importer_class = importers[name]
+        if type(importers[name]) is tuple:
+            importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
+        if not importer_class:
+            raise ValueError(f"Module {name} could not be loaded.")
+        return importer_class(name)
+
+
+def import_module(module_path):
+    if importlib.util.find_spec(module_path) is not None:
+        try:
+            module = importlib.import_module(module_path)
+        except ModuleNotFoundError:
+            return None
+        sys.modules[module_path] = module
+        return module
+
+
+def load_module_class(module_path, class_name):
+    module = import_module(module_path)
+    if not module:
+        return None
+    return getattr(module, class_name)
+
+
+importer_factory = ImporterFactory()
+importer_factory.register_lazy_importer(
+    name=ImportFormat.avro, module_path="datacontract.imports.avro_importer", class_name="AvroImporter"
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.bigquery,
+    module_path="datacontract.imports.bigquery_importer",
+    class_name="BigQueryImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.glue, module_path="datacontract.imports.glue_importer", class_name="GlueImporter"
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.jsonschema,
+    module_path="datacontract.imports.jsonschema_importer",
+    class_name="JsonSchemaImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.odcs, module_path="datacontract.imports.odcs_importer", class_name="OdcsImporter"
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.sql, module_path="datacontract.imports.sql_importer", class_name="SqlImporter"
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.unity, module_path="datacontract.imports.unity_importer", class_name="UnityImporter"
+)
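Note the design: registration stores (module_path, class_name) tuples, and create only imports the module on first use, so a format's optional dependencies (e.g. boto3 for Glue) are not loaded until that format is requested. A minimal usage sketch, assuming 0.10.9 is installed (ddl.sql is a hypothetical path):

from datacontract.imports.importer import ImportFormat
from datacontract.imports.importer_factory import importer_factory
from datacontract.model.data_contract_specification import DataContractSpecification

importer = importer_factory.create(ImportFormat.sql)  # lazily imports SqlImporter
spec = importer.import_source(
    data_contract_specification=DataContractSpecification(),
    source="ddl.sql",  # hypothetical DDL file
    import_args={},
)
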
datacontract/imports/jsonschema_importer.py

@@ -2,10 +2,18 @@ import json
 
 import fastjsonschema
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
 from datacontract.model.exceptions import DataContractException
 
 
+class JsonSchemaImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_jsonschema(data_contract_specification, source)
+
+
 def convert_json_schema_properties(properties, is_definition=False):
     fields = {}
     for field_name, field_schema in properties.items():
datacontract/imports/odcs_importer.py

@@ -0,0 +1,200 @@
+import datetime
+import logging
+from typing import Any, Dict, List
+import yaml
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    Availability,
+    Contact,
+    DataContractSpecification,
+    Info,
+    Model,
+    Field,
+    Retention,
+    ServiceLevel,
+    Terms,
+)
+from datacontract.model.exceptions import DataContractException
+
+DATACONTRACT_TYPES = [
+    "string",
+    "text",
+    "varchar",
+    "number",
+    "decimal",
+    "numeric",
+    "int",
+    "integer",
+    "long",
+    "bigint",
+    "float",
+    "double",
+    "boolean",
+    "timestamp",
+    "timestamp_tz",
+    "timestamp_ntz",
+    "date",
+    "array",
+    "bytes",
+    "object",
+    "record",
+    "struct",
+    "null",
+]
+
+
+class OdcsImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_odcs(data_contract_specification, source)
+
+
+def import_odcs(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    try:
+        with open(source, "r") as file:
+            odcs_contract = yaml.safe_load(file.read())
+
+    except Exception as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse ODCS contract",
+            reason=f"Failed to parse odcs contract from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    data_contract_specification.id = odcs_contract["uuid"]
+    data_contract_specification.info = import_info(odcs_contract)
+    data_contract_specification.terms = import_terms(odcs_contract)
+    data_contract_specification.servicelevels = import_servicelevels(odcs_contract)
+    data_contract_specification.models = import_models(odcs_contract)
+
+    return data_contract_specification
+
+
+def import_info(odcs_contract: Dict[str, Any]) -> Info:
+    info = Info(title=odcs_contract.get("quantumName"), version=odcs_contract.get("version"))
+
+    if odcs_contract.get("description").get("purpose") is not None:
+        info.description = odcs_contract.get("description").get("purpose")
+
+    if odcs_contract.get("datasetDomain") is not None:
+        info.owner = odcs_contract.get("datasetDomain")
+
+    if odcs_contract.get("productDl") is not None or odcs_contract.get("productFeedbackUrl") is not None:
+        contact = Contact()
+        if odcs_contract.get("productDl") is not None:
+            contact.name = odcs_contract.get("productDl")
+        if odcs_contract.get("productFeedbackUrl") is not None:
+            contact.url = odcs_contract.get("productFeedbackUrl")
+
+        info.contact = contact
+
+    return info
+
+
+def import_terms(odcs_contract: Dict[str, Any]) -> Terms | None:
+    if (
+        odcs_contract.get("description").get("usage") is not None
+        or odcs_contract.get("description").get("limitations") is not None
+        or odcs_contract.get("price") is not None
+    ):
+        terms = Terms()
+        if odcs_contract.get("description").get("usage") is not None:
+            terms.usage = odcs_contract.get("description").get("usage")
+        if odcs_contract.get("description").get("limitations") is not None:
+            terms.limitations = odcs_contract.get("description").get("limitations")
+        if odcs_contract.get("price") is not None:
+            terms.billing = f"{odcs_contract.get('price').get('priceAmount')} {odcs_contract.get('price').get('priceCurrency')} / {odcs_contract.get('price').get('priceUnit')}"
+
+        return terms
+    else:
+        return None
+
+
+def import_servicelevels(odcs_contract: Dict[str, Any]) -> ServiceLevel:
+    # find the two properties we can map (based on the examples)
+    sla_properties = odcs_contract.get("slaProperties") if odcs_contract.get("slaProperties") is not None else []
+    availability = next((p for p in sla_properties if p["property"] == "generalAvailability"), None)
+    retention = next((p for p in sla_properties if p["property"] == "retention"), None)
+
+    if availability is not None or retention is not None:
+        servicelevel = ServiceLevel()
+
+        if availability is not None:
+            value = availability.get("value")
+            if isinstance(value, datetime.datetime):
+                value = value.isoformat()
+            servicelevel.availability = Availability(description=value)
+
+        if retention is not None:
+            servicelevel.retention = Retention(period=f"{retention.get('value')}{retention.get('unit')}")
+
+        return servicelevel
+    else:
+        return None
+
+
+def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
+    custom_type_mappings = get_custom_type_mappings(odcs_contract.get("customProperties"))
+
+    odcs_tables = odcs_contract.get("dataset") if odcs_contract.get("dataset") is not None else []
+    result = {}
+
+    for table in odcs_tables:
+        description = table.get("description") if table.get("description") is not None else ""
+        model = Model(description=" ".join(description.splitlines()), type="table")
+        model.fields = import_fields(table.get("columns"), custom_type_mappings)
+        result[table.get("table")] = model
+
+    return result
+
+
+def import_fields(odcs_columns: Dict[str, Any], custom_type_mappings: Dict[str, str]) -> Dict[str, Field]:
+    logger = logging.getLogger(__name__)
+    result = {}
+
+    for column in odcs_columns:
+        mapped_type = map_type(column.get("logicalType"), custom_type_mappings)
+        if mapped_type is not None:
+            description = column.get("description") if column.get("description") is not None else ""
+            field = Field(
+                description=" ".join(description.splitlines()),
+                type=mapped_type,
+                title=column.get("businessName") if column.get("businessName") is not None else "",
+                required=not column.get("isNullable") if column.get("isNullable") is not None else False,
+                primary=column.get("isPrimary") if column.get("isPrimary") is not None else False,
+                unique=column.get("isUnique") if column.get("isUnique") is not None else False,
+                classification=column.get("classification") if column.get("classification") is not None else "",
+                tags=column.get("tags") if column.get("tags") is not None else [],
+            )
+            result[column["column"]] = field
+        else:
+            logger.info(
+                f"Can't properly map {column.get('column')} to the Datacontract Mapping types, as there is no equivalent or special mapping. Consider introducing a customProperty 'dc_mapping_{column.get('logicalName')}' that defines your expected type as the 'value'"
+            )
+
+    return result
+
+
+def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
+    t = odcs_type.lower()
+    if t in DATACONTRACT_TYPES:
+        return t
+    elif custom_mappings.get(t) is not None:
+        return custom_mappings.get(t)
+    else:
+        return None
+
+
+def get_custom_type_mappings(odcs_custom_properties: List[Any]) -> Dict[str, str]:
+    result = {}
+    if odcs_custom_properties is not None:
+        for prop in odcs_custom_properties:
+            if prop["property"].startswith("dc_mapping_"):
+                odcs_type_name = prop["property"].substring(11)
+                datacontract_type = prop["value"]
+                result[odcs_type_name] = datacontract_type
+
+    return result
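One caveat worth flagging in this new file: get_custom_type_mappings calls prop["property"].substring(11), a Java-ism that does not exist on Python strings, so a contract that actually carries a dc_mapping_* custom property would raise AttributeError in this release. A standalone sketch of the intended lookup, using the equivalent Python slice:

# Standalone sketch of the intended dc_mapping_* lookup (the release's
# .substring(11) call would raise AttributeError; [11:] is the Python slice).
custom_properties = [
    {"property": "dc_mapping_money", "value": "decimal"},
    {"property": "unrelated", "value": "ignored"},
]

result = {}
for prop in custom_properties:
    if prop["property"].startswith("dc_mapping_"):
        result[prop["property"][11:]] = prop["value"]  # len("dc_mapping_") == 11

print(result)  # {'money': 'decimal'}
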
datacontract/imports/sql_importer.py

@@ -1,8 +1,16 @@
 from simple_ddl_parser import parse_from_file
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
+class SqlImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_sql(data_contract_specification, self.import_format, source)
+
+
 def import_sql(data_contract_specification: DataContractSpecification, format: str, source: str):
     ddl = parse_from_file(source, group_by_type=True)
     tables = ddl["tables"]