PyPI - datacontract-cli - Versions diffs - 0.10.28__py3-none-any.whl → 0.10.29__py3-none-any.whl - Mend

datacontract-cli 0.10.28__py3-none-any.whl → 0.10.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datacontract-cli might be problematic. Click here for more details.

Files changed (21)

datacontract/api.py CHANGED Viewed

@@ -10,7 +10,7 @@ from fastapi.security.api_key import APIKeyHeader
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.model.run import Run
-DATA_CONTRACT_EXAMPLE_PAYLOAD = """dataContractSpecification: 1.1.0
+DATA_CONTRACT_EXAMPLE_PAYLOAD = """dataContractSpecification: 1.2.0
 id: urn:datacontract:checkout:orders-latest
 info:
   title: Orders Latest

datacontract/cli.py CHANGED Viewed

@@ -469,8 +469,26 @@ def diff(
     console.print(result.changelog_str())
-@app.command()
+def _get_uvicorn_arguments(port: int, host: str, context: typer.Context) -> dict:
+    """
+    Take the default datacontract uvicorn arguments and merge them with the
+    extra arguments passed to the command to start the API.
+    """
+    default_args = {
+        "app": "datacontract.api:app",
+        "port": port,
+        "host": host,
+        "reload": True,
+    }
+    # Create a list of the extra arguments, remove the leading -- from the cli arguments
+    trimmed_keys = list(map(lambda x : str(x).replace("--", ""),context.args[::2]))
+    # Merge the two dicts and return them as one dict
+    return default_args | dict(zip(trimmed_keys, context.args[1::2]))
+@app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
 def api(
+    ctx: Annotated[typer.Context, typer.Option(help="Extra arguments to pass to uvicorn.run().")],
     port: Annotated[int, typer.Option(help="Bind socket to this port.")] = 4242,
     host: Annotated[
         str, typer.Option(help="Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0")
@@ -488,6 +506,9 @@ def api(
     To connect to servers (such as a Snowflake data source), set the credentials as environment variables as documented in
     https://cli.datacontract.com/#test
+    It is possible to run the API with extra arguments for `uvicorn.run()` as keyword arguments, e.g.:
+    `datacontract api --port 1234 --root_path /datacontract`.
     """
     import uvicorn
     from uvicorn.config import LOGGING_CONFIG
@@ -495,7 +516,11 @@ def api(
     log_config = LOGGING_CONFIG
     log_config["root"] = {"level": "INFO"}
-    uvicorn.run(app="datacontract.api:app", port=port, host=host, reload=True, log_config=LOGGING_CONFIG)
+    uvicorn_args = _get_uvicorn_arguments(port, host, ctx)
+    # Add the log config
+    uvicorn_args["log_config"] = log_config
+    # Run uvicorn
+    uvicorn.run(**uvicorn_args)
 def _print_logs(run):

datacontract/engines/soda/connections/duckdb_connection.py CHANGED Viewed

@@ -132,10 +132,10 @@ def setup_s3_connection(con, server):
     use_ssl = "true"
     url_style = "vhost"
     if server.endpointUrl is not None:
+        url_style = "path"
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
             use_ssl = "false"
-            url_style = "path"
     if s3_access_key_id is not None:
         if s3_session_token is not None:

datacontract/export/rdf_converter.py CHANGED Viewed

@@ -57,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))
-    dc = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/")
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/Extension/")
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.0/Extension/")
     g.bind("dc", dc)
     g.bind("dcx", dcx)

datacontract/export/sql_type_converter.py CHANGED Viewed

@@ -194,8 +194,8 @@ def convert_to_databricks(field: Field) -> None | str:
         nested_fields = []
         for nested_field_name, nested_field in field.fields.items():
             nested_field_type = convert_to_databricks(nested_field)
-            nested_fields.append(f"{nested_field_name} {nested_field_type}")
-        return f"STRUCT<{', '.join(nested_fields)}>"
+            nested_fields.append(f"{nested_field_name}:{nested_field_type}")
+        return f"STRUCT<{','.join(nested_fields)}>"
     if type.lower() in ["bytes"]:
         return "BINARY"
     if type.lower() in ["array"]:

datacontract/imports/excel_importer.py CHANGED Viewed

@@ -568,6 +568,8 @@ def import_roles(workbook: Workbook) -> Optional[List[Role]]:
         roles_list = []
         for row_idx in range(roles_range[0], roles_range[1]):
+            if len(list(roles_sheet.rows)) < row_idx + 1:
+                break
             row = list(roles_sheet.rows)[row_idx]
             role_name = get_cell_value(row, headers.get("role"))

datacontract/imports/importer.py CHANGED Viewed

@@ -26,6 +26,7 @@ class ImportFormat(str, Enum):
     dbml = "dbml"
     glue = "glue"
     jsonschema = "jsonschema"
+    json = "json"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"

datacontract/imports/importer_factory.py CHANGED Viewed

@@ -119,3 +119,10 @@ importer_factory.register_lazy_importer(
     module_path="datacontract.imports.excel_importer",
     class_name="ExcelImporter",
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.json,
+    module_path="datacontract.imports.json_importer",
+    class_name="JsonImporter",
+)

datacontract/imports/json_importer.py ADDED Viewed

@@ -0,0 +1,325 @@
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server
+class JsonImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_json(data_contract_specification, source)
+def is_ndjson(file_path: str) -> bool:
+    """Check if a file contains newline-delimited JSON."""
+    with open(file_path, "r", encoding="utf-8") as file:
+        for _ in range(5):
+            line = file.readline().strip()
+            if not line:
+                continue
+            try:
+                json.loads(line)
+                return True
+            except json.JSONDecodeError:
+                break
+    return False
+def import_json(
+    data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False
+) -> DataContractSpecification:
+    # use the file name as base model name
+    base_model_name = os.path.splitext(os.path.basename(source))[0]
+    # check if file is newline-delimited JSON
+    if is_ndjson(source):
+        # load NDJSON data
+        json_data = []
+        with open(source, "r", encoding="utf-8") as file:
+            for line in file:
+                line = line.strip()
+                if line:
+                    try:
+                        json_data.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        continue
+    else:
+        # load regular JSON data
+        with open(source, "r", encoding="utf-8") as file:
+            json_data = json.load(file)
+    if data_contract_specification.servers is None:
+        data_contract_specification.servers = {}
+    data_contract_specification.servers["production"] = Server(type="local", path=source, format="json")
+    # initialisation
+    models = {}
+    if isinstance(json_data, list) and json_data:
+        # Array of items
+        if all(isinstance(item, dict) for item in json_data[:5]):
+            # Array of objects, as table
+            fields = {}
+            for item in json_data[:20]:
+                for key, value in item.items():
+                    field_def = generate_field_definition(value, key, base_model_name, models)
+                    if key in fields:
+                        fields[key] = merge_field_definitions(fields[key], field_def)
+                    else:
+                        fields[key] = field_def
+            models[base_model_name] = {
+                "type": "table",
+                "description": f"Generated from JSON array in {source}",
+                "fields": fields,
+                "examples": json_data[:3] if include_examples else None,
+            }
+        else:
+            # Simple array
+            item_type, item_format = infer_array_type(json_data[:20])
+            models[base_model_name] = {
+                "type": "array",
+                "description": f"Generated from JSON array in {source}",
+                "items": {"type": item_type, "format": item_format} if item_format else {"type": item_type},
+                "examples": [json_data[:5]] if include_examples else None,
+            }
+    elif isinstance(json_data, dict):
+        # Single object
+        fields = {}
+        for key, value in json_data.items():
+            fields[key] = generate_field_definition(value, key, base_model_name, models)
+        models[base_model_name] = {
+            "type": "object",
+            "description": f"Generated from JSON object in {source}",
+            "fields": fields,
+            "examples": [json_data] if include_examples else None,
+        }
+    else:
+        # Primitive value
+        field_type, field_format = determine_type_and_format(json_data)
+        models[base_model_name] = {
+            "type": field_type,
+            "description": f"Generated from JSON primitive in {source}",
+            "format": field_format,
+            "examples": [json_data] if include_examples and field_type != "boolean" else None,
+        }
+    for model_name, model_def in models.items():
+        model_type = model_def.pop("type")
+        data_contract_specification.models[model_name] = Model(type=model_type, **model_def)
+    return data_contract_specification
+def generate_field_definition(
+    value: Any, field_name: str, parent_model: str, models: Dict[str, Dict[str, Any]]
+) -> Dict[str, Any]:
+    """Generate a field definition for a JSON value, creating nested models."""
+    if isinstance(value, dict):
+        # Handle object fields
+        fields = {}
+        for key, nested_value in value.items():
+            fields[key] = generate_field_definition(nested_value, key, parent_model, models)
+        return {"type": "object", "fields": fields}
+    elif isinstance(value, list):
+        # Handle array fields
+        if not value:
+            return {"type": "array", "items": {"type": "string"}}
+        if all(isinstance(item, dict) for item in value):
+            # Array of objects
+            fields = {}
+            for item in value:
+                for key, nested_value in item.items():
+                    field_def = generate_field_definition(nested_value, key, parent_model, models)
+                    if key in fields:
+                        fields[key] = merge_field_definitions(fields[key], field_def)
+                    else:
+                        fields[key] = field_def
+            return {"type": "array", "items": {"type": "object", "fields": fields}}
+        elif all(isinstance(item, list) for item in value):
+            # Array of arrays
+            inner_type, inner_format = infer_array_type(value[0])
+            return {
+                "type": "array",
+                "items": {
+                    "type": "array",
+                    "items": {"type": inner_type, "format": inner_format} if inner_format else {"type": inner_type},
+                },
+                "examples": value[:5],  # Include examples for nested arrays
+            }
+        else:
+            # Array of simple or mixed types
+            item_type, item_format = infer_array_type(value)
+            items_def = {"type": item_type}
+            if item_format:
+                items_def["format"] = item_format
+            field_def = {"type": "array", "items": items_def}
+            # Add examples if appropriate
+            sample_values = [item for item in value[:5] if item is not None]
+            if sample_values:
+                field_def["examples"] = sample_values
+            return field_def
+    else:
+        # Handle primitive types
+        field_type, field_format = determine_type_and_format(value)
+        field_def = {"type": field_type}
+        if field_format:
+            field_def["format"] = field_format
+        # Add examples
+        if value is not None and field_type != "boolean":
+            field_def["examples"] = [value]
+        return field_def
+def infer_array_type(array: List) -> Tuple[str, Optional[str]]:
+    """Infer the common type of items in an array."""
+    if not array:
+        return "string", None
+    # if all items are dictionaries with the same structure
+    if all(isinstance(item, dict) for item in array):
+        return "object", None
+    # if all items are of the same primitive type
+    non_null_items = [item for item in array if item is not None]
+    if not non_null_items:
+        return "null", None
+    types_and_formats = [determine_type_and_format(item) for item in non_null_items]
+    types = {t for t, _ in types_and_formats}
+    formats = {f for _, f in types_and_formats if f is not None}
+    # simplify type combinations
+    if types == {"integer", "number"}:
+        return "number", None
+    if len(types) == 1:
+        type_name = next(iter(types))
+        format_name = next(iter(formats)) if len(formats) == 1 else None
+        return type_name, format_name
+    if all(t in {"string", "integer", "number", "boolean", "null"} for t in types):
+        # If all string values have the same format, keep it
+        if len(formats) == 1 and "string" in types:
+            return "string", next(iter(formats))
+        return "string", None
+    # Mixed types
+    return "string", None
+def determine_type_and_format(value: Any) -> Tuple[str, Optional[str]]:
+    """determine the datacontract type and format for a JSON value."""
+    if value is None:
+        return "null", None
+    elif isinstance(value, bool):
+        return "boolean", None
+    elif isinstance(value, int):
+        return "integer", None
+    elif isinstance(value, float):
+        return "number", None
+    elif isinstance(value, str):
+        try:
+            if re.match(r"^\d{4}-\d{2}-\d{2}$", value):
+                return "string", "date"
+            elif re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})?$", value):
+                return "string", "date-time"
+            elif re.match(r"^[\w\.-]+@([\w-]+\.)+[\w-]{2,4}$", value):
+                return "string", "email"
+            elif re.match(r"^[a-f0-9]{8}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[a-f0-9]{12}$", value.lower()):
+                return "string", "uuid"
+            else:
+                return "string", None
+        except re.error:
+            return "string", None
+    elif isinstance(value, dict):
+        return "object", None
+    elif isinstance(value, list):
+        return "array", None
+    else:
+        return "string", None
+def merge_field_definitions(field1: Dict[str, Any], field2: Dict[str, Any]) -> Dict[str, Any]:
+    """Merge two field definitions."""
+    result = field1.copy()
+    if field1.get("type") == "object" and field2.get("type") != "object":
+        return field1
+    if field2.get("type") == "object" and field1.get("type") != "object":
+        return field2
+    # Handle type differences
+    if field1.get("type") != field2.get("type"):
+        type1, _ = field1.get("type", "string"), field1.get("format")
+        type2, _ = field2.get("type", "string"), field2.get("format")
+        if type1 == "integer" and type2 == "number" or type1 == "number" and type2 == "integer":
+            common_type = "number"
+            common_format = None
+        elif "string" in [type1, type2]:
+            common_type = "string"
+            common_format = None
+        elif all(t in ["string", "integer", "number", "boolean", "null"] for t in [type1, type2]):
+            common_type = "string"
+            common_format = None
+        elif type1 == "array" and type2 == "array":
+            # Handle mixed array types
+            items1 = field1.get("items", {})
+            items2 = field2.get("items", {})
+            if items1.get("type") == "object" or items2.get("type") == "object":
+                if items1.get("type") == "object" and items2.get("type") == "object":
+                    merged_items = merge_field_definitions(items1, items2)
+                else:
+                    merged_items = items1 if items1.get("type") == "object" else items2
+                return {"type": "array", "items": merged_items}
+            else:
+                merged_items = merge_field_definitions(items1, items2)
+                return {"type": "array", "items": merged_items}
+        else:
+            common_type = "array" if "array" in [type1, type2] else "object"
+            common_format = None
+        result["type"] = common_type
+        if common_format:
+            result["format"] = common_format
+        elif "format" in result:
+            del result["format"]
+    # Merge examples
+    if "examples" in field2:
+        if "examples" in result:
+            combined = result["examples"] + [ex for ex in field2["examples"] if ex not in result["examples"]]
+            result["examples"] = combined[:5]  # Limit to 5 examples
+        else:
+            result["examples"] = field2["examples"]
+    # Handle nested structures
+    if result.get("type") == "array" and "items" in field1 and "items" in field2:
+        result["items"] = merge_field_definitions(field1["items"], field2["items"])
+    elif result.get("type") == "object" and "fields" in field1 and "fields" in field2:
+        # Merge fields from both objects
+        merged_fields = field1["fields"].copy()
+        for key, field_def in field2["fields"].items():
+            if key in merged_fields:
+                merged_fields[key] = merge_field_definitions(merged_fields[key], field_def)
+            else:
+                merged_fields[key] = field_def
+        result["fields"] = merged_fields
+    return result

datacontract/init/init_template.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 import requests
-DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.1.0.init.yaml"
+DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.0.init.yaml"
 def get_init_template(location: str = None) -> str:

datacontract/lint/resolve.py CHANGED Viewed

@@ -303,7 +303,7 @@ def _resolve_data_contract_from_str(
         # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
         odcs = parse_odcs_v3_from_str(data_contract_str)
-        data_contract_specification = DataContractSpecification(dataContractSpecification="1.1.0")
+        data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.0")
         return import_from_odcs(data_contract_specification, odcs)
     logging.info("Importing DCS")

datacontract/lint/schema.py CHANGED Viewed

@@ -8,7 +8,7 @@ import requests
 from datacontract.model.exceptions import DataContractException
-DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.1.0.schema.json"
+DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.0.schema.json"
 def fetch_schema(location: str = None) -> Dict[str, Any]:

datacontract/schemas/datacontract-1.1.0.init.yaml CHANGED Viewed

@@ -1,4 +1,4 @@
-dataContractSpecification: 1.1.0
+dataContractSpecification: 1.2.0
 id: my-data-contract-id
 info:
   title: My Data Contract

datacontract/schemas/datacontract-1.2.0.init.yaml ADDED Viewed

@@ -0,0 +1,91 @@
+dataContractSpecification: 1.2.0
+id: my-data-contract-id
+info:
+  title: My Data Contract
+  version: 0.0.1
+#  description:
+#  owner:
+#  contact:
+#    name:
+#    url:
+#    email:
+### servers
+#servers:
+#  production:
+#    type: s3
+#    location: s3://
+#    format: parquet
+#    delimiter: new_line
+### terms
+#terms:
+#  usage:
+#  limitations:
+#  billing:
+#  noticePeriod:
+### models
+# models:
+#   my_model:
+#     description:
+#     type:
+#     fields:
+#       my_field:
+#         type:
+#         description:
+### definitions
+# definitions:
+#   my_field:
+#     domain:
+#     name:
+#     title:
+#     type:
+#     description:
+#     example:
+#     pii:
+#     classification:
+### servicelevels
+#servicelevels:
+#  availability:
+#    description: The server is available during support hours
+#    percentage: 99.9%
+#  retention:
+#    description: Data is retained for one year because!
+#    period: P1Y
+#    unlimited: false
+#  latency:
+#    description: Data is available within 25 hours after the order was placed
+#    threshold: 25h
+#    sourceTimestampField: orders.order_timestamp
+#    processedTimestampField: orders.processed_timestamp
+#  freshness:
+#    description: The age of the youngest row in a table.
+#    threshold: 25h
+#    timestampField: orders.order_timestamp
+#  frequency:
+#    description: Data is delivered once a day
+#    type: batch # or streaming
+#    interval: daily # for batch, either or cron
+#    cron: 0 0 * * * # for batch, either or interval
+#  support:
+#    description: The data is available during typical business hours at headquarters
+#    time: 9am to 5pm in EST on business days
+#    responseTime: 1h
+#  backup:
+#    description: Data is backed up once a week, every Sunday at 0:00 UTC.
+#    interval: weekly
+#    cron: 0 0 * * 0
+#    recoveryTime: 24 hours
+#    recoveryPoint: 1 week

datacontract-cli 0.10.28__py3-none-any.whl → 0.10.29__py3-none-any.whl

Potentially problematic release.

datacontract-cli 0.10.28__py3-none-any.whl → 0.10.29__py3-none-any.whl