datacontract-cli 0.10.3__py3-none-any.whl → 0.10.5__py3-none-any.whl

This diff shows the changes between these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of datacontract-cli has been flagged as potentially problematic.

Files changed (41)
  1. datacontract/breaking/breaking.py +12 -0
  2. datacontract/breaking/breaking_rules.py +4 -0
  3. datacontract/catalog/catalog.py +2 -2
  4. datacontract/cli.py +42 -8
  5. datacontract/data_contract.py +84 -134
  6. datacontract/engines/soda/check_soda_execute.py +5 -0
  7. datacontract/engines/soda/connections/duckdb.py +1 -2
  8. datacontract/engines/soda/connections/sqlserver.py +43 -0
  9. datacontract/export/avro_converter.py +23 -2
  10. datacontract/export/bigquery_converter.py +107 -0
  11. datacontract/export/dbml_converter.py +118 -0
  12. datacontract/export/go_converter.py +98 -0
  13. datacontract/export/html_export.py +4 -2
  14. datacontract/export/jsonschema_converter.py +41 -2
  15. datacontract/export/rdf_converter.py +1 -2
  16. datacontract/export/sql_converter.py +1 -0
  17. datacontract/export/sql_type_converter.py +125 -4
  18. datacontract/imports/avro_importer.py +41 -14
  19. datacontract/imports/bigquery_importer.py +178 -0
  20. datacontract/imports/jsonschema_importer.py +148 -0
  21. datacontract/imports/sql_importer.py +2 -2
  22. datacontract/lint/resolve.py +1 -2
  23. datacontract/model/data_contract_specification.py +65 -1
  24. datacontract/publish/publish.py +32 -0
  25. datacontract/py.typed +0 -0
  26. datacontract/templates/datacontract.html +37 -346
  27. datacontract/templates/index.html +70 -5
  28. datacontract/templates/partials/datacontract_information.html +66 -0
  29. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  30. datacontract/templates/partials/datacontract_terms.html +44 -0
  31. datacontract/templates/partials/definition.html +99 -0
  32. datacontract/templates/partials/example.html +27 -0
  33. datacontract/templates/partials/model_field.html +97 -0
  34. datacontract/templates/partials/server.html +144 -0
  35. datacontract/templates/style/output.css +99 -13
  36. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/METADATA +276 -139
  37. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/RECORD +41 -26
  38. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/LICENSE +0 -0
  39. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/WHEEL +0 -0
  40. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/entry_points.txt +0 -0
  41. {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,178 @@
+import json
+from typing import List
+
+from google.cloud import bigquery
+
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+from datacontract.model.exceptions import DataContractException
+
+
+def import_bigquery_from_json(
+    data_contract_specification: DataContractSpecification, source: str
+) -> DataContractSpecification:
+    try:
+        with open(source, "r") as file:
+            bigquery_schema = json.loads(file.read())
+    except json.JSONDecodeError as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse bigquery schema",
+            reason=f"Failed to parse bigquery schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+    return convert_bigquery_schema(data_contract_specification, bigquery_schema)
+
+
+def import_bigquery_from_api(
+    data_contract_specification: DataContractSpecification,
+    bigquery_tables: List[str],
+    bigquery_project: str,
+    bigquery_dataset: str,
+) -> DataContractSpecification:
+    client = bigquery.Client(project=bigquery_project)
+
+    if bigquery_tables is None:
+        bigquery_tables = fetch_table_names(client, bigquery_dataset)
+
+    for table in bigquery_tables:
+        try:
+            api_table = client.get_table("{}.{}.{}".format(bigquery_project, bigquery_dataset, table))
+
+        except ValueError as e:
+            raise DataContractException(
+                type="schema",
+                result="failed",
+                name="Invalid table name for bigquery API",
+                reason=f"Tablename {table} is invalid for the bigquery API",
+                original_exception=e,
+                engine="datacontract",
+            )
+
+        if api_table is None:
+            raise DataContractException(
+                type="request",
+                result="failed",
+                name="Query BigQuery schema from API",
+                reason=f"Table {table} not found in BigQuery project {bigquery_project}, dataset {bigquery_dataset}.",
+                engine="datacontract",
+            )
+
+        convert_bigquery_schema(data_contract_specification, api_table.to_api_repr())
+
+    return data_contract_specification
+
+
+def fetch_table_names(client: bigquery.Client, dataset: str) -> List[str]:
+    table_names = []
+    api_tables = client.list_tables(dataset)
+    for api_table in api_tables:
+        table_names.append(api_table.table_id)
+
+    return table_names
+
+
+def convert_bigquery_schema(
+    data_contract_specification: DataContractSpecification, bigquery_schema: dict
+) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    fields = import_table_fields(bigquery_schema.get("schema").get("fields"))
+
+    # Looking at actual export data, tableId seems to always be set while friendlyName
+    # often isn't, though it is unclear what exactly leads to friendlyName being set.
+    table_id = bigquery_schema.get("tableReference").get("tableId")
+
+    data_contract_specification.models[table_id] = Model(fields=fields, type="table")
+
+    # Copy the description, if it exists
+    if bigquery_schema.get("description") is not None:
+        data_contract_specification.models[table_id].description = bigquery_schema.get("description")
+
+    # Set the title from friendlyName if it exists
+    if bigquery_schema.get("friendlyName") is not None:
+        data_contract_specification.models[table_id].title = bigquery_schema.get("friendlyName")
+
+    return data_contract_specification
+
+
+def import_table_fields(table_fields):
+    imported_fields = {}
+    for field in table_fields:
+        field_name = field.get("name")
+        imported_fields[field_name] = Field()
+        imported_fields[field_name].required = field.get("mode") == "REQUIRED"
+        imported_fields[field_name].description = field.get("description")
+
+        if field.get("type") == "RECORD":
+            imported_fields[field_name].type = "object"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "STRUCT":
+            imported_fields[field_name].type = "struct"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "RANGE":
+            # This is a range of date/datetime/timestamp but multiple values
+            # So we map it to an array
+            imported_fields[field_name].type = "array"
+            imported_fields[field_name].items = Field(
+                type=map_type_from_bigquery(field["rangeElementType"].get("type"))
+            )
+        else:  # primitive type
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
+
+        if field.get("type") == "STRING":
+            # in bigquery both string and bytes have maxLength but in the datacontracts
+            # spec it is only valid for strings
+            if field.get("maxLength") is not None:
+                imported_fields[field_name].maxLength = int(field.get("maxLength"))
+
+        if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC":
+            if field.get("precision") is not None:
+                imported_fields[field_name].precision = int(field.get("precision"))
+
+            if field.get("scale") is not None:
+                imported_fields[field_name].scale = int(field.get("scale"))
+
+    return imported_fields
+
+
+def map_type_from_bigquery(bigquery_type_str: str):
+    if bigquery_type_str == "STRING":
+        return "string"
+    elif bigquery_type_str == "BYTES":
+        return "bytes"
+    elif bigquery_type_str == "INTEGER":
+        return "int"
+    elif bigquery_type_str == "INT64":
+        return "bigint"
+    elif bigquery_type_str == "FLOAT":
+        return "float"
+    elif bigquery_type_str == "FLOAT64":
+        return "double"
+    elif bigquery_type_str == "BOOLEAN" or bigquery_type_str == "BOOL":
+        return "boolean"
+    elif bigquery_type_str == "TIMESTAMP":
+        return "timestamp"
+    elif bigquery_type_str == "DATE":
+        return "date"
+    elif bigquery_type_str == "TIME":
+        return "timestamp_ntz"
+    elif bigquery_type_str == "DATETIME":
+        return "timestamp"
+    elif bigquery_type_str == "NUMERIC":
+        return "numeric"
+    elif bigquery_type_str == "BIGNUMERIC":
+        return "double"
+    elif bigquery_type_str == "GEOGRAPHY":
+        return "object"
+    elif bigquery_type_str == "JSON":
+        return "object"
+    else:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="Map bigquery type to data contract type",
+            reason=f"Unsupported type {bigquery_type_str} in bigquery json definition.",
+            engine="datacontract",
+        )
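
The new importer can also be exercised directly from Python. A minimal sketch, assuming a hand-written table resource in the shape convert_bigquery_schema expects; the sample dict and its values are hypothetical:

    from datacontract.imports.bigquery_importer import convert_bigquery_schema
    from datacontract.model.data_contract_specification import DataContractSpecification

    # Hypothetical table resource, mirroring what to_api_repr() returns.
    bigquery_schema = {
        "tableReference": {"tableId": "orders"},
        "description": "Order events",
        "schema": {
            "fields": [
                {"name": "order_id", "type": "STRING", "mode": "REQUIRED", "maxLength": "36"},
                {"name": "amount", "type": "NUMERIC", "precision": "10", "scale": "2"},
            ]
        },
    }

    spec = convert_bigquery_schema(DataContractSpecification(), bigquery_schema)
    print(spec.models["orders"].fields["order_id"].type)  # -> "string"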
@@ -0,0 +1,148 @@
+import json
+
+import fastjsonschema
+
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
+from datacontract.model.exceptions import DataContractException
+
+
+def convert_json_schema_properties(properties, is_definition=False):
+    fields = {}
+    for field_name, field_schema in properties.items():
+        field_kwargs = {}
+        field_type = field_schema.get("type")
+
+        # Determine if the field is required and set the type to the non-null option if applicable
+        if isinstance(field_type, list) and "null" in field_type:
+            field_kwargs["required"] = False
+            non_null_types = [t for t in field_type if t != "null"]
+            if non_null_types:
+                field_type = non_null_types[0]
+            else:
+                field_type = None
+        else:
+            field_kwargs["required"] = True
+
+        # Set the non-null type
+        if field_type:
+            field_kwargs["type"] = field_type
+
+        for key, value in field_schema.items():
+            match key:
+                case "title":
+                    field_kwargs["title"] = value
+                case "type":
+                    pass  # type is already handled above
+                case "format":
+                    field_kwargs["format"] = value
+                case "description":
+                    field_kwargs["description"] = value
+                case "pattern":
+                    field_kwargs["pattern"] = value
+                case "minLength":
+                    field_kwargs["minLength"] = value
+                case "maxLength":
+                    field_kwargs["maxLength"] = value
+                case "minimum":
+                    field_kwargs["minimum"] = value
+                case "exclusiveMinimum":
+                    field_kwargs["exclusiveMinimum"] = value
+                case "maximum":
+                    field_kwargs["maximum"] = value
+                case "exclusiveMaximum":
+                    field_kwargs["exclusiveMaximum"] = value
+                case "enum":
+                    field_kwargs["enum"] = value
+                case "tags":
+                    field_kwargs["tags"] = value
+                case "properties":
+                    field_kwargs["fields"] = convert_json_schema_properties(value)
+                case "items":
+                    field_kwargs["items"] = convert_json_schema_properties(value)
+
+        field = Field(**field_kwargs)
+        fields[field_name] = field
+
+    return fields
+
+
+def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    try:
+        with open(source, "r") as file:
+            json_schema = json.loads(file.read())
+            validator = fastjsonschema.compile({})
+            validator(json_schema)
+
+            model = Model(
+                description=json_schema.get("description"),
+                type=json_schema.get("type"),
+                title=json_schema.get("title"),
+                fields=convert_json_schema_properties(json_schema.get("properties", {})),
+            )
+            data_contract_specification.models[json_schema.get("title", "default_model")] = model
+
+            if "definitions" in json_schema:
+                for def_name, def_schema in json_schema["definitions"].items():
+                    definition_kwargs = {}
+
+                    for key, value in def_schema.items():
+                        match key:
+                            case "domain":
+                                definition_kwargs["domain"] = value
+                            case "title":
+                                definition_kwargs["title"] = value
+                            case "description":
+                                definition_kwargs["description"] = value
+                            case "type":
+                                definition_kwargs["type"] = value
+                            case "enum":
+                                definition_kwargs["enum"] = value
+                            case "format":
+                                definition_kwargs["format"] = value
+                            case "minLength":
+                                definition_kwargs["minLength"] = value
+                            case "maxLength":
+                                definition_kwargs["maxLength"] = value
+                            case "pattern":
+                                definition_kwargs["pattern"] = value
+                            case "minimum":
+                                definition_kwargs["minimum"] = value
+                            case "exclusiveMinimum":
+                                definition_kwargs["exclusiveMinimum"] = value
+                            case "maximum":
+                                definition_kwargs["maximum"] = value
+                            case "exclusiveMaximum":
+                                definition_kwargs["exclusiveMaximum"] = value
+                            case "pii":
+                                definition_kwargs["pii"] = value
+                            case "classification":
+                                definition_kwargs["classification"] = value
+                            case "tags":
+                                definition_kwargs["tags"] = value
+                            case "properties":
+                                definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
+
+                    definition = Definition(name=def_name, **definition_kwargs)
+                    data_contract_specification.definitions[def_name] = definition
+
+    except fastjsonschema.JsonSchemaException as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Failed to parse json schema from {source}: {e}",
+            engine="datacontract",
+        )
+
+    except Exception as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Failed to parse json schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    return data_contract_specification
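
Note that the importer compiles an empty (accept-all) fastjsonschema validator before walking properties, so any well-formed JSON document passes validation. A minimal sketch of calling it, with a hypothetical JSON Schema written to a temporary file:

    import json
    import tempfile

    from datacontract.imports.jsonschema_importer import import_jsonschema
    from datacontract.model.data_contract_specification import DataContractSpecification

    # Hypothetical schema using only keywords the importer handles.
    schema = {
        "title": "Order",
        "type": "object",
        "properties": {
            "order_id": {"type": "string", "maxLength": 36},
            "amount": {"type": ["number", "null"], "minimum": 0},
        },
    }

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(schema, f)

    spec = import_jsonschema(DataContractSpecification(), f.name)
    print(spec.models["Order"].fields["amount"].required)  # -> False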
@@ -45,7 +45,7 @@ def map_type_from_sql(sql_type: str):
         return None
 
     sql_type_normed = sql_type.lower().strip()
-
+
     if sql_type_normed.startswith("varchar"):
         return "varchar"
     elif sql_type_normed.startswith("string"):
@@ -69,6 +69,6 @@ def map_type_from_sql(sql_type: str):
     elif sql_type_normed == "datetime2":
         return "timestamp_ntz"
     elif sql_type_normed == "datetimeoffset":
-        return "timestamp_tz"
+        return "timestamp_tz"
     else:
         return "variant"
@@ -8,8 +8,7 @@ from fastjsonschema import JsonSchemaValueException
 from datacontract.lint.files import read_file
 from datacontract.lint.schema import fetch_schema
 from datacontract.lint.urls import fetch_resource
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Definition, Quality
+from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Quality
 from datacontract.model.exceptions import DataContractException
 
 
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict
+from typing import List, Dict, Optional, Any
 
 import pydantic as pyd
 import yaml
@@ -31,6 +31,7 @@ class Server(pyd.BaseModel):
     token: str = None  # Use ENV variable
     dataProductId: str = None
     outputPortId: str = None
+    driver: str = None
 
 
 class Terms(pyd.BaseModel):
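
The new driver field lines up with the SQL Server connection added in this release (datacontract/engines/soda/connections/sqlserver.py). A minimal sketch; the Server fields other than driver are assumed from the wider specification, and the driver string is a typical ODBC name rather than one confirmed by this diff:

    from datacontract.model.data_contract_specification import Server

    server = Server(
        type="sqlserver",
        host="localhost",
        port=1433,
        database="orders_db",
        driver="ODBC Driver 18 for SQL Server",  # assumed driver name
    )
    print(server.driver)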
@@ -58,6 +59,7 @@ class Definition(pyd.BaseModel):
     pii: bool = None
     classification: str = None
     tags: List[str] = []
+    example: str = None
 
 
 class Field(pyd.BaseModel):
@@ -84,12 +86,17 @@ class Field(pyd.BaseModel):
     tags: List[str] = []
     fields: Dict[str, "Field"] = {}
     items: "Field" = None
+    precision: int = None
+    scale: int = None
+    example: str = None
+    config: Dict[str, Any] = None
 
 
 class Model(pyd.BaseModel):
     description: str = None
     type: str = None
     namespace: str = None
+    title: str = None
     fields: Dict[str, Field] = {}
 
 
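A quick sketch of the extended Field and Model in use; the values and the bigqueryType config key are illustrative, not taken from this diff:

    from datacontract.model.data_contract_specification import Field, Model

    amount = Field(
        type="decimal",
        precision=10,
        scale=2,
        example="42.99",
        config={"bigqueryType": "NUMERIC"},  # hypothetical config key
    )
    model = Model(title="Orders", type="table", fields={"amount": amount})
    print(model.fields["amount"].precision)  # -> 10
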
@@ -114,6 +121,62 @@ class Quality(pyd.BaseModel):
     specification: str | object = None
 
 
+class Availability(pyd.BaseModel):
+    description: Optional[str] = None
+    percentage: Optional[str] = None
+
+
+class Retention(pyd.BaseModel):
+    description: Optional[str] = None
+    period: Optional[str] = None
+    unlimited: Optional[bool] = None
+    timestampField: Optional[str] = None
+
+
+class Latency(pyd.BaseModel):
+    description: Optional[str] = None
+    threshold: Optional[str] = None
+    sourceTimestampField: Optional[str] = None
+    processedTimestampField: Optional[str] = None
+
+
+class Freshness(pyd.BaseModel):
+    description: Optional[str] = None
+    threshold: Optional[str] = None
+    timestampField: Optional[str] = None
+
+
+class Frequency(pyd.BaseModel):
+    description: Optional[str] = None
+    type: Optional[str] = None
+    interval: Optional[str] = None
+    cron: Optional[str] = None
+
+
+class Support(pyd.BaseModel):
+    description: Optional[str] = None
+    time: Optional[str] = None
+    responseTime: Optional[str] = None
+
+
+class Backup(pyd.BaseModel):
+    description: Optional[str] = None
+    interval: Optional[str] = None
+    cron: Optional[str] = None
+    recoveryTime: Optional[str] = None
+    recoveryPoint: Optional[str] = None
+
+
+class ServiceLevel(pyd.BaseModel):
+    availability: Optional[Availability] = None
+    retention: Optional[Retention] = None
+    latency: Optional[Latency] = None
+    freshness: Optional[Freshness] = None
+    frequency: Optional[Frequency] = None
+    support: Optional[Support] = None
+    backup: Optional[Backup] = None
+
+
 class DataContractSpecification(pyd.BaseModel):
     dataContractSpecification: str = None
     id: str = None
@@ -125,6 +188,7 @@ class DataContractSpecification(pyd.BaseModel):
     # schema: Dict[str, str]
     examples: List[Example] = []
     quality: Quality = None
+    servicelevels: Optional[ServiceLevel] = None
 
     @classmethod
     def from_file(cls, file):
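
A minimal sketch of populating the new servicelevels block from code; all values are illustrative:

    from datacontract.model.data_contract_specification import (
        DataContractSpecification,
        Freshness,
        Retention,
        ServiceLevel,
    )

    spec = DataContractSpecification(
        id="orders-v1",
        servicelevels=ServiceLevel(
            retention=Retention(period="P1Y", timestampField="orders.created_at"),
            freshness=Freshness(threshold="25h", timestampField="orders.created_at"),
        ),
    )
    print(spec.servicelevels.retention.period)  # -> "P1Y"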
@@ -0,0 +1,32 @@
+import os
+
+import requests
+
+from datacontract.data_contract import DataContract
+
+
+def publish_to_datamesh_manager(data_contract: DataContract):
+    try:
+        headers = {"Content-Type": "application/json", "x-api-key": _require_datamesh_manager_api_key()}
+        spec = data_contract.get_data_contract_specification()
+        id = spec.id
+        url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+        request_body = spec.model_dump_json().encode("utf-8")
+        response = requests.put(
+            url=url,
+            data=request_body,
+            headers=headers,
+        )
+        if response.status_code != 200:
+            print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+            exit(1)
+        print(f"Published data contract to {url}")
+    except Exception as e:
+        print(f"Failed publishing data contract. Error: {str(e)}")
+
+
+def _require_datamesh_manager_api_key():
+    datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+    if datamesh_manager_api_key is None:
+        raise Exception("Cannot publish data contract, as DATAMESH_MANAGER_API_KEY is not set")
+    return datamesh_manager_api_key
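
A minimal sketch of calling the new publish helper; it assumes DataContract accepts a data_contract_file argument as elsewhere in this package, and the API key and file path are placeholders:

    import os

    from datacontract.data_contract import DataContract
    from datacontract.publish.publish import publish_to_datamesh_manager

    os.environ["DATAMESH_MANAGER_API_KEY"] = "<your-api-key>"  # placeholder
    data_contract = DataContract(data_contract_file="datacontract.yaml")
    publish_to_datamesh_manager(data_contract)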
datacontract/py.typed ADDED
Empty file (a PEP 561 marker indicating the package now ships inline type hints).