datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +12 -5
- datacontract/catalog/catalog.py +5 -3
- datacontract/cli.py +119 -13
- datacontract/data_contract.py +145 -67
- datacontract/engines/data_contract_checks.py +366 -60
- datacontract/engines/data_contract_test.py +50 -4
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +27 -3
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +65 -6
- datacontract/engines/soda/connections/kafka.py +4 -2
- datacontract/engines/soda/connections/oracle.py +50 -0
- datacontract/export/avro_converter.py +20 -3
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/dbt_converter.py +36 -7
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +3 -0
- datacontract/export/exporter_factory.py +17 -1
- datacontract/export/great_expectations_converter.py +55 -5
- datacontract/export/{html_export.py → html_exporter.py} +31 -20
- datacontract/export/markdown_converter.py +134 -5
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +193 -149
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/sodacl_converter.py +9 -1
- datacontract/export/spark_converter.py +31 -4
- datacontract/export/sql_converter.py +6 -2
- datacontract/export/sql_type_converter.py +124 -8
- datacontract/imports/avro_importer.py +63 -12
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +1112 -0
- datacontract/imports/importer.py +16 -3
- datacontract/imports/importer_factory.py +17 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/odcs_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +367 -151
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +117 -13
- datacontract/imports/sql_importer.py +32 -16
- datacontract/imports/unity_importer.py +84 -38
- datacontract/init/init_template.py +1 -1
- datacontract/integration/entropy_data.py +126 -0
- datacontract/lint/resolve.py +112 -23
- datacontract/lint/schema.py +24 -15
- datacontract/lint/urls.py +17 -3
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/odcs.py +13 -0
- datacontract/model/run.py +3 -0
- datacontract/output/junit_test_results.py +3 -3
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
- datacontract/templates/datacontract.html +54 -3
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +5 -2
- datacontract/templates/partials/server.html +2 -0
- datacontract/templates/style/output.css +319 -145
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
- datacontract_cli-0.10.40.dist-info/RECORD +121 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/integration/datamesh_manager.py +0 -72
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/description_linter.py +0 -35
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -48
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- datacontract/model/data_contract_specification.py +0 -327
- datacontract_cli-0.10.23.dist-info/RECORD +0 -113
- /datacontract/{lint/linters → output}/__init__.py +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
datacontract/engines/data_contract_test.py

@@ -1,5 +1,11 @@
+import atexit
+import os
+import tempfile
 import typing

+import requests
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.data_contract_checks import create_checks

 if typing.TYPE_CHECKING:
@@ -10,7 +16,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.model.data_contract_specification import DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import ResultEnum, Run

@@ -20,6 +26,7 @@ def execute_data_contract_test(
     run: Run,
     server_name: str = None,
     spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
         raise DataContractException(
@@ -29,6 +36,12 @@ def execute_data_contract_test(
             reason="Models block is missing. Skip executing tests.",
             engine="datacontract",
         )
+    if (
+        server_name is None
+        and data_contract_specification.servers is not None
+        and len(data_contract_specification.servers) > 0
+    ):
+        server_name = list(data_contract_specification.servers.keys())[0]
     server = get_server(data_contract_specification, server_name)
     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
     run.dataContractId = data_contract_specification.id
@@ -37,16 +50,19 @@ def execute_data_contract_test(
     run.outputPortId = server.outputPortId
     run.server = server_name

+    if server.type == "api":
+        server = process_api_response(run, server)
+
     run.checks.extend(create_checks(data_contract_specification, server))

     # TODO check server is supported type for nicer error messages
     # TODO check server credentials are complete for nicer error messages
     if server.format == "json" and server.type != "kafka":
         check_jsonschema(run, data_contract_specification, server)
-    check_soda_execute(run, data_contract_specification, server, spark)
+    check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)


-def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
     """Get the server configuration from the data contract specification.

     Args:
@@ -59,9 +75,39 @@ def get_server(data_contract_specification: DataContractSpecification, server_na

     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)

-    if server_name:
+    if server_name is not None:
         server = data_contract_specification.servers.get(server_name)
     else:
         server_name = list(data_contract_specification.servers.keys())[0]
         server = data_contract_specification.servers.get(server_name)
     return server
+
+
+def process_api_response(run, server):
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
+    atexit.register(tmp_dir.cleanup)
+    headers = {}
+    if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
+        headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
+    try:
+        response = requests.get(server.location, headers=headers)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise DataContractException(
+            type="connection",
+            name="API server connection error",
+            result=ResultEnum.error,
+            reason=f"Failed to fetch API response from {server.location}: {e}",
+            engine="datacontract",
+        )
+    with open(f"{tmp_dir.name}/api_response.json", "w") as f:
+        f.write(response.text)
+    run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
+    server = Server(
+        type="local",
+        format="json",
+        path=f"{tmp_dir.name}/api_response.json",
+        dataProductId=server.dataProductId,
+        outputPortId=server.outputPortId,
+    )
+    return server
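The `api` server type added above boils down to a fetch-and-rewrite step. A minimal standalone sketch of that flow (not the package's own code; the URL is a placeholder and the temp-file handling is simplified):

```python
import os
import tempfile

import requests

# Sketch of the "api" server flow: fetch the endpoint, optionally sending an
# Authorization header taken from DATACONTRACT_API_HEADER_AUTHORIZATION, and
# persist the payload so the rest of the run can treat it as a local JSON server.
url = "https://example.com/orders.json"  # placeholder endpoint
headers = {}
if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
    headers["Authorization"] = os.environ["DATACONTRACT_API_HEADER_AUTHORIZATION"]

response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

tmp_dir = tempfile.mkdtemp(prefix="datacontract_cli_api_")
local_path = os.path.join(tmp_dir, "api_response.json")
with open(local_path, "w") as f:
    f.write(response.text)

# The engine then swaps the server entry for one equivalent to
# Server(type="local", format="json", path=local_path) and continues as usual.
print(f"API response written to {local_path}")
```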
datacontract/engines/fastjsonschema/check_jsonschema.py

@@ -1,8 +1,9 @@
+import glob
 import json
 import logging
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Callable, Generator, List, Optional

 import fastjsonschema
 from fastjsonschema import JsonSchemaValueException
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):


 def validate_json_stream(
-    schema: dict, model_name: str, validate:
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
 ) -> List[DataContractException]:
     logging.info(f"Validating JSON stream for model: '{model_name}'.")
     exceptions: List[DataContractException] = []
@@ -99,7 +100,7 @@ def validate_json_stream(
             DataContractException(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.failed,
                 reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
                 model=model_name,
                 engine="jsonschema",
@@ -159,27 +160,44 @@ def process_json_file(run, schema, model_name, validate, file, delimiter):

 def process_local_file(run, server, schema, model_name, validate):
     path = server.path
+    if not path:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason="For server with type 'local', a 'path' must be defined.",
+            engine="datacontract",
+        )
     if "{model}" in path:
         path = path.format(model=model_name)

+    all_files = []
     if os.path.isdir(path):
-
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-
-
-
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)

+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )

-
-
-
-
-        file_path = os.path.join(path, filename)
-        with open(file_path, "r") as file:
-            if not process_json_file(run, model_name, validate, file, server.delimiter):
-                success = False
-                break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)


 def process_s3_file(run, server, schema, model_name, validate):
@@ -201,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
             raise DataContractException(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Cannot find any file in {s3_location}",
                 engine="datacontract",
             )
@@ -222,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                 reason="Server format is not 'json'. Skip validating jsonschema.",
                 engine="jsonschema",
             )
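The reworked `process_local_file` follows two selection rules: a directory is walked recursively for `*.json` files, and any other path is treated as a (possibly recursive) glob pattern. A standalone sketch of just that collection step, with an assumed helper name and placeholder path:

```python
import glob
import os

# Sketch of the new file-collection rules (helper name and path are illustrative):
# - a directory is walked recursively and every *.json file is picked up
# - anything else is treated as a glob pattern
def collect_json_files(path: str) -> list[str]:
    all_files: list[str] = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            all_files += [os.path.join(root, f) for f in files if f.endswith(".json")]
    else:
        all_files += [p for p in glob.glob(path, recursive=True) if os.path.isfile(p) and p.endswith(".json")]
    return all_files

print(collect_json_files("data/**/*.json"))  # prints [] if nothing matches
```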
datacontract/engines/fastjsonschema/s3/s3_read_files.py

@@ -2,6 +2,7 @@ import logging
 import os

 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="s3 extra missing",
-            reason="Install the extra
+            reason="Install the extra s3 to use s3",
             engine="datacontract",
             original_exception=e,
         )
datacontract/engines/soda/check_soda_execute.py

@@ -1,6 +1,15 @@
 import logging
+import typing
 import uuid

+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+from datacontract.engines.soda.connections.oracle import initialize_client_and_create_soda_configuration
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
@@ -14,7 +23,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check, Log, ResultEnum, Run


-def check_soda_execute(
+def check_soda_execute(
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
+):
     from soda.common.config_helper import ConfigHelper

     ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +45,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-            con = get_duckdb_connection(data_contract, server, run)
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
@@ -62,7 +77,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
             run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
@@ -89,10 +105,18 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         soda_configuration_str = to_sqlserver_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "oracle":
+        soda_configuration_str = initialize_client_and_create_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
     elif server.type == "trino":
         soda_configuration_str = to_trino_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)

     else:
         run.checks.append(
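The new `duckdb_connection` parameter threads a caller-provided DuckDB connection through `execute_data_contract_test` to `check_soda_execute` and on to `get_duckdb_connection`. A minimal sketch of preparing such a connection (the table is illustrative; how the caller reaches the entry point is not shown here):

```python
import duckdb

# Sketch only: prepare a reusable in-memory DuckDB connection, then hand it down
# the call chain shown in the hunks above instead of letting the engine open its own:
#   execute_data_contract_test(..., duckdb_connection=con)
#     -> check_soda_execute(..., duckdb_connection=con)
#       -> get_duckdb_connection(data_contract, server, run, con)
con = duckdb.connect(database=":memory:")
con.sql("CREATE TABLE orders AS SELECT 1 AS order_id, 'shipped' AS status")
print(con.sql("SELECT * FROM orders").fetchall())
```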
datacontract/engines/soda/connections/athena.py (new file)

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
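For reference, this is roughly the Soda data-source block the new Athena connection emits, with placeholder values instead of real credentials (a sketch, not package code):

```python
import yaml

# Illustration of the data-source block assembled by to_athena_soda_configuration()
# from the DATACONTRACT_S3_* environment variables and the server entry.
# All values below are placeholders.
data_source = {
    "type": "athena",
    "access_key_id": "<DATACONTRACT_S3_ACCESS_KEY_ID>",
    "secret_access_key": "<DATACONTRACT_S3_SECRET_ACCESS_KEY>",
    "schema": "my_schema",
    "staging_dir": "s3://my-bucket/athena-results/",
    "region_name": "eu-central-1",  # optional, falls back to the server's region
}
print(yaml.dump({"data_source athena": data_source}))
```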
datacontract/engines/soda/connections/duckdb_connection.py

@@ -1,14 +1,24 @@
 import os
-from typing import Any
+from typing import Any, Dict

 import duckdb

-from datacontract.export.
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
 from datacontract.model.run import Run


-def get_duckdb_connection(
-
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
     path: str = ""
     if server.type == "local":
         path = server.path
@@ -33,9 +43,16 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 json_format = "newline_delimited"
             elif server.delimiter == "array":
                 json_format = "array"
-
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
                 """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
         elif server.format == "parquet":
             con.sql(f"""
             CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -54,6 +71,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
         elif server.format == "delta":
             con.sql("update extensions;")  # Make sure we have the latest delta extension
             con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
     return con


@@ -67,6 +87,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None:
     return columns


+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
+            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+        """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
 def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
@@ -76,10 +135,10 @@ def setup_s3_connection(con, server):
     use_ssl = "true"
     url_style = "vhost"
     if server.endpointUrl is not None:
+        url_style = "path"
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
             use_ssl = "false"
-            url_style = "path"

     if s3_access_key_id is not None:
         if s3_session_token is not None:
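`add_nested_views` materializes one extra view per array or object field, named `<model>__<field>`, by unnesting that column. A standalone DuckDB sketch of the same pattern with illustrative model and field names:

```python
import duckdb

# Illustration of the nested-view pattern used by add_nested_views():
# an array-of-objects column gets a companion view "<model>__<field>"
# created by unnesting the column.
con = duckdb.connect(database=":memory:")
con.sql("""
    CREATE TABLE orders AS
    SELECT 1 AS order_id,
           [{'sku': 'A-1', 'qty': 2}, {'sku': 'B-9', 'qty': 1}] AS line_items
""")
con.sql("""
    CREATE VIEW "orders__line_items" AS
    SELECT unnest(line_items, max_depth := 2) AS line_items FROM "orders"
""")
print(con.sql('DESCRIBE "orders__line_items"').fetchall())
```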
datacontract/engines/soda/connections/kafka.py

@@ -6,6 +6,7 @@ import tempfile
 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="pyspark is missing",
             reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
@@ -26,6 +27,7 @@ def create_spark_session():
     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
     atexit.register(tmp_dir.cleanup)

+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -33,7 +35,7 @@ def create_spark_session():
         .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
         )
         .getOrCreate()
     )
datacontract/engines/soda/connections/oracle.py (new file)

@@ -0,0 +1,50 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def initialize_client_and_create_soda_configuration(server: Server) -> str:
+    import oracledb
+    soda_config = to_oracle_soda_configuration(server)
+
+    oracle_client_dir = os.getenv("DATACONTRACT_ORACLE_CLIENT_DIR")
+    if oracle_client_dir is not None:
+        # Soda Core currently does not support thick mode natively, see https://github.com/sodadata/soda-core/issues/2036
+        # but the oracledb client can be configured accordingly before Soda initializes as a work-around
+        oracledb.init_oracle_client(lib_dir=oracle_client_dir)
+
+    return soda_config
+
+
+def to_oracle_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: oracle
+    host: database-1.us-east-1.rds.amazonaws.com
+    port: '1521'
+    username: simple
+    password: simple_pass
+    connectstring: database-1.us-east-1.rds.amazonaws.com:1521/ORCL (database is equal to service name at oracle)
+    schema: SYSTEM
+    """
+
+    service_name = server.service_name or server.database
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "oracle",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""),
+            "connectstring": f"{server.host}:{server.port}/{service_name}",
+            "schema": server.schema_,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
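And the corresponding Soda configuration shape for the new Oracle connection, again with placeholder host and service values; credentials come from the documented environment variables (a sketch, not package code):

```python
import os

import yaml

# Illustration of the configuration produced by to_oracle_soda_configuration().
# Host, port, service name, and schema are placeholders; credentials are read from
# DATACONTRACT_ORACLE_USERNAME / DATACONTRACT_ORACLE_PASSWORD.
host, port, service_name, schema = "database-1.us-east-1.rds.amazonaws.com", 1521, "ORCL", "SYSTEM"
config = {
    "data_source oracle": {
        "type": "oracle",
        "host": host,
        "port": str(port),
        "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""),
        "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""),
        "connectstring": f"{host}:{port}/{service_name}",
        "schema": schema,
    }
}
print(yaml.dump(config))
```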
datacontract/export/avro_converter.py

@@ -44,12 +44,18 @@ def to_avro_field(field, field_name):
     avro_type = to_avro_type(field, field_name)
     avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]

-
-
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
             "type": "enum",
             "name": field.title,
             "symbols": field.enum,
         }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]

     if field.config:
         if "avroDefault" in field.config:
@@ -77,6 +83,10 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if "avroType" in field.config:
             return field.config["avroType"]

+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -91,7 +101,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.precision is not None:
             typeVal["precision"] = field.precision
         return typeVal
-    elif field.type in ["float"
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -107,6 +119,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
         if field.config is not None and "namespace" in field.config:
             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
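The enum handling above yields either a bare Avro enum (required field) or a `["null", enum]` union (optional field). A small sketch of the expected output shape, with an illustrative enum name and symbols:

```python
import json

# Expected Avro shapes for a data contract field that declares enum values and
# config.avroType = "enum" (enum name and symbols are illustrative).
enum_def = {"type": "enum", "name": "OrderStatus", "symbols": ["placed", "shipped", "delivered"]}

required_field = {"name": "status", "type": enum_def}            # required: bare enum
optional_field = {"name": "status", "type": ["null", enum_def]}  # optional: union with null

print(json.dumps(required_field, indent=2))
print(json.dumps(optional_field, indent=2))
```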
datacontract/export/bigquery_converter.py

@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
     elif field_type.lower() == "date":
         return "DATE"
     elif field_type.lower() == "timestamp_ntz":
-        return "
+        return "DATETIME"
     elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
     elif field_type.lower() == "double":