datacontract-cli 0.10.3__py3-none-any.whl → 0.10.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/breaking/breaking.py +12 -0
- datacontract/breaking/breaking_rules.py +4 -0
- datacontract/catalog/catalog.py +2 -2
- datacontract/cli.py +42 -8
- datacontract/data_contract.py +84 -134
- datacontract/engines/soda/check_soda_execute.py +5 -0
- datacontract/engines/soda/connections/duckdb.py +1 -2
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/export/avro_converter.py +23 -2
- datacontract/export/bigquery_converter.py +107 -0
- datacontract/export/dbml_converter.py +118 -0
- datacontract/export/go_converter.py +98 -0
- datacontract/export/html_export.py +4 -2
- datacontract/export/jsonschema_converter.py +41 -2
- datacontract/export/rdf_converter.py +1 -2
- datacontract/export/sql_converter.py +1 -0
- datacontract/export/sql_type_converter.py +125 -4
- datacontract/imports/avro_importer.py +41 -14
- datacontract/imports/bigquery_importer.py +178 -0
- datacontract/imports/jsonschema_importer.py +148 -0
- datacontract/imports/sql_importer.py +2 -2
- datacontract/lint/resolve.py +1 -2
- datacontract/model/data_contract_specification.py +65 -1
- datacontract/publish/publish.py +32 -0
- datacontract/py.typed +0 -0
- datacontract/templates/datacontract.html +37 -346
- datacontract/templates/index.html +70 -5
- datacontract/templates/partials/datacontract_information.html +66 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +44 -0
- datacontract/templates/partials/definition.html +99 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +97 -0
- datacontract/templates/partials/server.html +144 -0
- datacontract/templates/style/output.css +99 -13
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/METADATA +276 -139
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/RECORD +41 -26
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.3.dist-info → datacontract_cli-0.10.5.dist-info}/top_level.txt +0 -0
datacontract/export/dbml_converter.py (new file)

```diff
@@ -0,0 +1,118 @@
+from datetime import datetime
+from importlib.metadata import version
+from typing import Tuple
+
+import pytz
+
+import datacontract.model.data_contract_specification as spec
+from datacontract.export.sql_type_converter import convert_to_sql_type
+
+
+def to_dbml_diagram(contract: spec.DataContractSpecification, server: spec.Server) -> str:
+    result = ""
+    result += add_generated_info(contract, server) + "\n"
+    result += generate_project_info(contract) + "\n"
+
+    for model_name, model in contract.models.items():
+        table_description = generate_table(model_name, model, server)
+        result += f"\n{table_description}\n"
+
+    return result
+
+
+def add_generated_info(contract: spec.DataContractSpecification, server: spec.Server) -> str:
+    tz = pytz.timezone("UTC")
+    now = datetime.now(tz)
+    formatted_date = now.strftime("%b %d %Y")
+    datacontract_cli_version = get_version()
+    dialect = "Logical Datacontract" if server is None else server.type
+
+    generated_info = """
+Generated at {0} by datacontract-cli version {1}
+for datacontract {2} ({3}) version {4}
+Using {5} Types for the field types
+""".format(
+        formatted_date, datacontract_cli_version, contract.info.title, contract.id, contract.info.version, dialect
+    )
+
+    comment = """/*
+{0}
+*/
+""".format(generated_info)
+
+    note = """Note project_info {{
+'''
+{0}
+'''
+}}
+""".format(generated_info)
+
+    return """{0}
+{1}
+""".format(comment, note)
+
+
+def get_version() -> str:
+    try:
+        return version("datacontract_cli")
+    except Exception:
+        return ""
+
+
+def generate_project_info(contract: spec.DataContractSpecification) -> str:
+    return """Project "{0}" {{
+Note: "{1}"
+}}\n
+""".format(contract.info.title, " ".join(contract.info.description.splitlines()))
+
+
+def generate_table(model_name: str, model: spec.Model, server: spec.Server) -> str:
+    result = """Table "{0}" {{
+Note: "{1}"
+""".format(model_name, " ".join(model.description.splitlines()))
+
+    references = []
+
+    # Add all the fields
+    for field_name, field in model.fields.items():
+        ref, field_string = generate_field(field_name, field, model_name, server)
+        if ref is not None:
+            references.append(ref)
+        result += "{0}\n".format(field_string)
+
+    result += "}\n"
+
+    # and if any: add the references
+    if len(references) > 0:
+        for ref in references:
+            result += "Ref: {0}\n".format(ref)
+
+        result += "\n"
+
+    return result
+
+
+def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]:
+    field_attrs = []
+    if field.primary:
+        field_attrs.append("pk")
+
+    if field.unique:
+        field_attrs.append("unique")
+
+    if field.required:
+        field_attrs.append("not null")
+    else:
+        field_attrs.append("null")
+
+    if field.description:
+        field_attrs.append('Note: "{0}"'.format(" ".join(field.description.splitlines())))
+
+    field_type = field.type if server is None else convert_to_sql_type(field, server.type)
+
+    field_str = '"{0}" "{1}" [{2}]'.format(field_name, field_type, ",".join(field_attrs))
+    ref_str = None
+    if (field.references) is not None:
+        # we always assume many to one, as datacontract doesn't really give us more info
+        ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references)
+    return (ref_str, field_str)
```
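For orientation, a minimal sketch of driving the new DBML exporter (not part of the diff; the contract, model, and field names are hypothetical, and it assumes the pydantic classes in `datacontract.model.data_contract_specification` accept these keyword arguments):

```python
import datacontract.model.data_contract_specification as spec
from datacontract.export.dbml_converter import to_dbml_diagram

# Hypothetical contract: one model with a primary key and an optional field.
contract = spec.DataContractSpecification(
    id="orders-v1",
    info=spec.Info(title="Orders", version="1.0.0", description="Order data"),
    models={
        "orders": spec.Model(
            description="All orders",
            fields={
                "order_id": spec.Field(type="text", required=True, primary=True, unique=True),
                "amount": spec.Field(type="decimal", required=False),
            },
        )
    },
)

# server=None renders the contract's own (logical) field types;
# passing a Server would route them through convert_to_sql_type instead.
print(to_dbml_diagram(contract, server=None))
# Emits a /* ... */ header comment and a project_info Note, then e.g.:
#   Project "Orders" { ... }
#   Table "orders" { "order_id" "text" [pk,unique,not null] ... }
```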
datacontract/export/go_converter.py (new file)

```diff
@@ -0,0 +1,98 @@
+import datacontract.model.data_contract_specification as spec
+from typing import List
+import re
+
+
+def to_go_types(contract: spec.DataContractSpecification) -> str:
+    result = "package main\n\n"
+
+    for key in contract.models.keys():
+        go_types = generate_go_type(contract.models[key], key)
+        for go_type in go_types:
+            # print(go_type + "\n\n")
+            result += f"\n{go_type}\n"
+
+    return result
+
+
+def python_type_to_go_type(py_type) -> str:
+    match py_type:
+        case "text":
+            return "string"
+        case "timestamp":
+            return "time.Time"
+        case "long":
+            return "int64"
+        case "int":
+            return "int"
+        case "float":
+            return "float64"
+        case "boolean":
+            return "bool"
+        case _:
+            return "interface{}"
+
+
+def to_camel_case(snake_str) -> str:
+    return "".join(word.capitalize() for word in re.split(r"_|(?<!^)(?=[A-Z])", snake_str))
+
+
+def get_subtype(field_info, nested_types, type_name, camel_case_name) -> str:
+    go_type = "interface{}"
+    if field_info.fields:
+        nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}")
+        nested_types[nested_type_name] = field_info.fields
+        go_type = nested_type_name
+
+    match field_info.type:
+        case "array":
+            if field_info.items:
+                item_type = get_subtype(field_info.items, nested_types, type_name, camel_case_name + "Item")
+                go_type = f"[]{item_type}"
+            else:
+                go_type = "[]interface{}"
+        case "record":
+            if field_info.fields:
+                nested_type_name = to_camel_case(f"{type_name}_{camel_case_name}")
+                nested_types[nested_type_name] = field_info.fields
+                go_type = nested_type_name
+            else:
+                go_type = "interface{}"
+        case "object":
+            pass
+        case _:
+            go_type = field_info.type
+
+    return go_type
+
+
+def generate_go_type(model, model_name) -> List[str]:
+    go_types = []
+    type_name = to_camel_case(model_name)
+    lines = [f"type {type_name} struct {{"]
+
+    nested_types = {}
+
+    for field_name, field_info in model.fields.items():
+        go_type = python_type_to_go_type(field_info.type)
+        camel_case_name = to_camel_case(field_name)
+        json_tag = field_name if field_info.required else f"{field_name},omitempty"
+        avro_tag = field_name
+
+        if go_type == "interface{}":
+            go_type = get_subtype(field_info, nested_types, type_name, camel_case_name)
+
+        go_type = go_type if field_info.required else f"*{go_type}"
+
+        lines.append(
+            f' {camel_case_name} {go_type} `json:"{json_tag}" avro:"{avro_tag}"` // {field_info.description}'
+        )
+    lines.append("}")
+    go_types.append("\n".join(lines))
+
+    for nested_type_name, nested_fields in nested_types.items():
+        nested_model = spec.Model(fields=nested_fields)
+        nested_go_types = generate_go_type(nested_model, nested_type_name)
+        go_types.extend(nested_go_types)
+
+    return go_types
```
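A similar sketch for the Go exporter (hypothetical names, same assumptions about the spec classes). A required `text` field maps to `string`; an optional `timestamp` becomes a pointer, `*time.Time`, with an `omitempty` JSON tag:

```python
import datacontract.model.data_contract_specification as spec
from datacontract.export.go_converter import to_go_types

contract = spec.DataContractSpecification(
    models={
        "orders": spec.Model(
            fields={
                "id": spec.Field(type="text", required=True, description="Order id"),
                "created": spec.Field(type="timestamp", required=False, description="Created at"),
            }
        )
    }
)
print(to_go_types(contract))
# Roughly:
# package main
#
# type Orders struct {
#  Id string `json:"id" avro:"id"` // Order id
#  Created *time.Time `json:"created,omitempty" avro:"created"` // Created at
# }
```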
datacontract/export/html_export.py

```diff
@@ -2,12 +2,12 @@ import datetime
 import logging
 from importlib.metadata import version
 
+import jinja_partials
 import pytz
 import yaml
 from jinja2 import Environment, PackageLoader, select_autoescape
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification
 
 
 def to_html(data_contract_spec: DataContractSpecification) -> str:
@@ -20,6 +20,8 @@ def to_html(data_contract_spec: DataContractSpecification) -> str:
             default_for_string=True,
         ),
     )
+    # Set up for partials
+    jinja_partials.register_environment(env)
 
     # Load the required template
     # needs to be included in /MANIFEST.in
```
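The `jinja_partials.register_environment(env)` call makes a `render_partial()` global available inside the templates, which is what the new `partials/*.html` files listed above rely on. A minimal sketch (illustrative template string; the partial path is taken from the file list):

```python
import jinja_partials
from jinja2 import Environment, PackageLoader, select_autoescape

env = Environment(
    loader=PackageLoader("datacontract", "templates"),
    autoescape=select_autoescape(default_for_string=True),
)
jinja_partials.register_environment(env)

# Inside a template, a partial is pulled in like this:
template = env.from_string("{{ render_partial('partials/example.html', example=example) }}")
# template.render(example=...) would render the partial in place.
```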
datacontract/export/jsonschema_converter.py

```diff
@@ -18,12 +18,18 @@ def to_jsonschema_json(model_key, model_value: Model) -> str:
 
 
 def to_jsonschema(model_key, model_value: Model) -> dict:
-    return {
+    model = {
         "$schema": "http://json-schema.org/draft-07/schema#",
         "type": "object",
         "properties": to_properties(model_value.fields),
         "required": to_required(model_value.fields),
     }
+    if model_value.title:
+        model["title"] = model_value.title
+    if model_value.description:
+        model["description"] = model_value.description
+
+    return model
 
 
 def to_properties(fields: Dict[str, Field]) -> dict:
@@ -46,8 +52,41 @@ def to_property(field: Field) -> dict:
     if field.unique:
         property["unique"] = True
     if json_type == "object":
-        property["properties"] = to_properties(field.fields)
+        # TODO: any better idea to distinguish between properties and patternProperties?
+        if field.fields.keys() and next(iter(field.fields.keys())).startswith("^"):
+            property["patternProperties"] = to_properties(field.fields)
+        else:
+            property["properties"] = to_properties(field.fields)
         property["required"] = to_required(field.fields)
+    if json_type == "array":
+        property["items"] = to_property(field.items)
+
+    if field.pattern:
+        property["pattern"] = field.pattern
+    if field.enum:
+        property["enum"] = field.enum
+    if field.minLength:
+        property["minLength"] = field.minLength
+    if field.maxLength:
+        property["maxLength"] = field.maxLength
+    if field.title:
+        property["title"] = field.title
+    if field.description:
+        property["description"] = field.description
+    if field.exclusiveMinimum:
+        property["exclusiveMinimum"] = field.exclusiveMinimum
+    if field.exclusiveMaximum:
+        property["exclusiveMaximum"] = field.exclusiveMaximum
+    if field.minimum:
+        property["minimum"] = field.minimum
+    if field.maximum:
+        property["maximum"] = field.maximum
+    if field.tags:
+        property["tags"] = field.tags
+    if field.pii:
+        property["pii"] = field.pii
+    if field.classification:
+        property["classification"] = field.classification
 
     # TODO: all constraints
     return property
```
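The effect of these changes on the exported schema, as a rough sketch (hypothetical model; assumes the `Model`/`Field` pydantic classes accept these keywords):

```python
from datacontract.export.jsonschema_converter import to_jsonschema
from datacontract.model.data_contract_specification import Field, Model

model = Model(
    title="Order",
    description="A single order",
    fields={"order_id": Field(type="string", required=True, minLength=8, maxLength=10)},
)
print(to_jsonschema("orders", model))
# Roughly:
# {'$schema': 'http://json-schema.org/draft-07/schema#', 'type': 'object',
#  'properties': {'order_id': {'type': 'string', 'minLength': 8, 'maxLength': 10}},
#  'required': ['order_id'], 'title': 'Order', 'description': 'A single order'}
```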
datacontract/export/rdf_converter.py

```diff
@@ -1,8 +1,7 @@
 from pydantic import BaseModel
 from rdflib import Graph, Literal, BNode, RDF, URIRef, Namespace
 
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification
 
 
 def is_literal(property_name):
```
datacontract/export/sql_converter.py

```diff
@@ -63,6 +63,7 @@ def to_sql_ddl(data_contract_spec: DataContractSpecification, server_type: str =
     result = ""
     result += f"-- Data Contract: {data_contract_spec.id}\n"
     result += f"-- SQL Dialect: {server_type}\n"
+
     for model_name, model in iter(data_contract_spec.models.items()):
         result += _to_sql_table(table_prefix + model_name, model, server_type)
 
```
datacontract/export/sql_type_converter.py

```diff
@@ -1,19 +1,29 @@
+from datacontract.export.bigquery_converter import map_type_to_bigquery
 from datacontract.model.data_contract_specification import Field
 
 
 def convert_to_sql_type(field: Field, server_type: str) -> str:
     if server_type == "snowflake":
         return convert_to_snowflake(field)
-    if server_type == "postgres":
+    elif server_type == "postgres":
         return convert_type_to_postgres(field)
-    if server_type == "databricks":
+    elif server_type == "databricks":
         return convert_to_databricks(field)
+    elif server_type == "local" or server_type == "s3":
+        return convert_to_duckdb(field)
+    elif server_type == "sqlserver":
+        return convert_type_to_sqlserver(field)
+    elif server_type == "bigquery":
+        return convert_type_to_bigquery(field)
     return field.type
 
 
 # snowflake data types:
 # https://docs.snowflake.com/en/sql-reference/data-types.html
-def convert_to_snowflake(field) -> None | str:
+def convert_to_snowflake(field: Field) -> None | str:
+    if field.config and field.config["snowflakeType"] is not None:
+        return field.config["snowflakeType"]
+
     type = field.type
     # currently optimized for snowflake
     # LEARNING: data contract has no direct support for CHAR,CHARACTER
@@ -54,6 +64,9 @@ def convert_to_snowflake(field) -> None | str:
 # https://www.postgresql.org/docs/current/datatype.html
 # Using the name whenever possible
 def convert_type_to_postgres(field: Field) -> None | str:
+    if field.config and field.config["postgresType"] is not None:
+        return field.config["postgresType"]
+
     type = field.type
     if type is None:
         return None
@@ -95,7 +108,9 @@ def convert_type_to_postgres(field: Field) -> None | str:
 
 # databricks data types:
 # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
-def convert_to_databricks(field) -> None | str:
+def convert_to_databricks(field: Field) -> None | str:
+    if field.config and field.config["databricksType"] is not None:
+        return field.config["databricksType"]
     type = field.type
     if type is None:
         return None
@@ -129,3 +144,109 @@ def convert_to_databricks(field) -> None | str:
     if type.lower() in ["array"]:
         return "ARRAY"
     return None
+
+
+def convert_to_duckdb(field: Field) -> None | str:
+    type = field.type
+    if type is None:
+        return None
+    if type.lower() in ["string", "varchar", "text"]:
+        return "VARCHAR"  # aliases: VARCHAR, CHAR, BPCHAR, STRING, TEXT, VARCHAR(n) STRING(n), TEXT(n)
+    if type.lower() in ["timestamp", "timestamp_tz"]:
+        return "TIMESTAMP WITH TIME ZONE"  # aliases: TIMESTAMPTZ
+    if type.lower() in ["timestamp_ntz"]:
+        return "DATETIME"  # timestamp with microsecond precision (ignores time zone), aliases: TIMESTAMP
+    if type.lower() in ["date"]:
+        return "DATE"
+    if type.lower() in ["time"]:
+        return "TIME"  # TIME WITHOUT TIME ZONE
+    if type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "DECIMAL"
+    if type.lower() in ["float"]:
+        return "FLOAT"
+    if type.lower() in ["double"]:
+        return "DOUBLE"
+    if type.lower() in ["integer", "int"]:
+        return "INT"
+    if type.lower() in ["long", "bigint"]:
+        return "BIGINT"
+    if type.lower() in ["boolean"]:
+        return "BOOLEAN"
+    if type.lower() in ["object", "record", "struct"]:
+        return "STRUCT"
+    if type.lower() in ["bytes"]:
+        return "BLOB"
+    if type.lower() in ["array"]:
+        return "ARRAY"
+    return None
+
+
+def convert_type_to_sqlserver(field: Field) -> None | str:
+    """Convert from supported datacontract types to equivalent sqlserver types"""
+    field_type = field.type
+    if not field_type:
+        return None
+
+    # If provided sql-server config type, prefer it over default mapping
+    if sqlserver_type := get_type_config(field, "sqlserverType"):
+        return sqlserver_type
+
+    field_type = field_type.lower()
+    if field_type in ["string", "varchar", "text"]:
+        if field.format == "uuid":
+            return "uniqueidentifier"
+        return "varchar"
+    if field_type in ["timestamp", "timestamp_tz"]:
+        return "datetimeoffset"
+    if field_type in ["timestamp_ntz"]:
+        if field.format == "datetime":
+            return "datetime"
+        return "datetime2"
+    if field_type in ["date"]:
+        return "date"
+    if field_type in ["time"]:
+        return "time"
+    if field_type in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        if field_type == "number":
+            return "numeric"
+        return field_type
+    if field_type in ["float"]:
+        return "float"
+    if field_type in ["double"]:
+        return "double precision"
+    if field_type in ["integer", "int", "bigint"]:
+        return field_type
+    if field_type in ["long"]:
+        return "bigint"
+    if field_type in ["boolean"]:
+        return "bit"
+    if field_type in ["object", "record", "struct"]:
+        return "jsonb"
+    if field_type in ["bytes"]:
+        return "binary"
+    if field_type in ["array"]:
+        raise NotImplementedError("SQLServer does not support array types.")
+    return None
+
+
+def convert_type_to_bigquery(field: Field) -> None | str:
+    """Convert from supported datacontract types to equivalent bigquery types"""
+    field_type = field.type
+    if not field_type:
+        return None
+
+    # If provided sql-server config type, prefer it over default mapping
+    if bigquery_type := get_type_config(field, "bigqueryType"):
+        return bigquery_type
+
+    field_type = field_type.lower()
+    return map_type_to_bigquery(field_type, field.title)
+
+
+def get_type_config(field: Field, config_attr: str) -> dict[str, str] | None:
+    """Retrieve type configuration if provided in datacontract."""
+    if not field.config:
+        return None
+    return field.config.get(config_attr, None)
```
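Taken together, a rough sketch of the new dialect routing (hypothetical fields):

```python
from datacontract.export.sql_type_converter import convert_to_sql_type
from datacontract.model.data_contract_specification import Field

field = Field(type="timestamp_ntz", format="datetime")
print(convert_to_sql_type(field, "sqlserver"))  # datetime
print(convert_to_sql_type(field, "local"))      # DATETIME (DuckDB, also used for s3)

# A per-dialect override in field.config wins over the default mapping:
override = Field(type="text", config={"sqlserverType": "nvarchar(max)"})
print(convert_to_sql_type(override, "sqlserver"))  # nvarchar(max)
```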
datacontract/imports/avro_importer.py

```diff
@@ -37,30 +37,48 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
+def handle_config_avro_custom_properties(field, imported_field):
+    if field.get_prop("logicalType") is not None:
+        if imported_field.config is None:
+            imported_field.config = {}
+        imported_field.config["avroLogicalType"] = field.get_prop("logicalType")
+
+    if field.default is not None:
+        if imported_field.config is None:
+            imported_field.config = {}
+        imported_field.config["avroDefault"] = field.default
+
+
 def import_record_fields(record_fields):
     imported_fields = {}
     for field in record_fields:
-        imported_fields[field.name] = Field()
-        imported_fields[field.name].required = True
-        imported_fields[field.name].description = field.doc
-        for prop in field.other_props:
-            imported_fields[field.name].__setattr__(prop, field.other_props[prop])
+        imported_field = Field()
+        imported_field.required = True
+        imported_field.description = field.doc
 
+        handle_config_avro_custom_properties(field, imported_field)
+
+        # Determine field type and handle nested structures
         if field.type.type == "record":
-            imported_fields[field.name].type = "object"
-            imported_fields[field.name].description = field.type.doc
-            imported_fields[field.name].fields = import_record_fields(field.type.fields)
+            imported_field.type = "object"
+            imported_field.description = field.type.doc
+            imported_field.fields = import_record_fields(field.type.fields)
         elif field.type.type == "union":
-            imported_fields[field.name].required = False
+            imported_field.required = False
             type = import_type_of_optional_field(field)
-            imported_fields[field.name].type = type
+            imported_field.type = type
             if type == "record":
-                imported_fields[field.name].fields = import_record_fields(get_record_from_union_field(field).fields)
+                imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
+            elif type == "array":
+                imported_field.type = "array"
+                imported_field.items = import_avro_array_items(get_array_from_union_field(field))
         elif field.type.type == "array":
-            imported_fields[field.name].type = "array"
-            imported_fields[field.name].items = import_avro_array_items(field.type)
+            imported_field.type = "array"
+            imported_field.items = import_avro_array_items(field.type)
         else:  # primitive type
-            imported_fields[field.name].type = map_type_from_avro(field.type.type)
+            imported_field.type = map_type_from_avro(field.type.type)
+
+        imported_fields[field.name] = imported_field
 
     return imported_fields
 
@@ -102,6 +120,13 @@ def get_record_from_union_field(field):
     return None
 
 
+def get_array_from_union_field(field):
+    for field_type in field.type.schemas:
+        if field_type.type == "array":
+            return field_type
+    return None
+
+
 def map_type_from_avro(avro_type_str: str):
     # TODO: ambiguous mapping in the export
     if avro_type_str == "null":
@@ -120,6 +145,8 @@ def map_type_from_avro(avro_type_str: str):
         return "boolean"
     elif avro_type_str == "record":
         return "record"
+    elif avro_type_str == "array":
+        return "array"
     else:
         raise DataContractException(
             type="schema",
```