datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
+import atexit
+import logging
 import os
-
-import pyspark.sql.functions as fn
-from pyspark.sql import SparkSession
-from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.functions import from_json, col
-from pyspark.sql.types import (
-    StructType,
-    DataType,
-    NullType,
-    ArrayType,
-    BinaryType,
-    DateType,
-    TimestampNTZType,
-    TimestampType,
-    BooleanType,
-    LongType,
-    IntegerType,
-    DoubleType,
-    DecimalType,
-    StringType,
-    StructField,
-)
+import tempfile
 
 from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
+
+
+def create_spark_session():
+    """Create and configure a Spark session."""
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result=ResultEnum.failed,
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
 
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+    atexit.register(tmp_dir.cleanup)
 
-def create_spark_session(tmp_dir) -> SparkSession:
-    # TODO: Update dependency versions when updating pyspark
-    # TODO: add protobuf library
+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
-        .config("spark.sql.warehouse.dir", tmp_dir + "/spark-warehouse")
-        .config("spark.streaming.stopGracefullyOnShutdown", True)
+        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", "true")
+        .config("spark.ui.enabled", "false")
         .config(
            "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
        )
         .getOrCreate()
     )
@@ -46,107 +44,202 @@ def create_spark_session(tmp_dir) -> SparkSession:
     return spark
 
 
-def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecification, server: Server, tmp_dir):
-    host = server.host
-    topic = server.topic
-    auth_options = get_auth_options()
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
+    """Read and process data from a Kafka topic based on the server configuration."""
 
-    # read full kafka topic
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
     df = (
         spark.read.format("kafka")
-        .options(**auth_options)
-        .option("kafka.bootstrap.servers", host)
-        .option("subscribe", topic)
+        .options(**get_auth_options())
+        .option("kafka.bootstrap.servers", server.host)
+        .option("subscribe", server.topic)
         .option("startingOffsets", "earliest")
         .load()
     )
-    # TODO a warning if none or multiple models
+
     model_name, model = next(iter(data_contract.models.items()))
-    if server.format == "avro":
-        avro_schema = to_avro_schema_json(model_name, model)
-
-        # Parse out the extra bytes from the Avro data
-        # A Kafka message contains a key and a value. Data going through a Kafka topic in Confluent Cloud has five bytes added to the beginning of every Avro value. If you are using Avro format keys, then five bytes will be added to the beginning of those as well. For this example, we’re assuming string keys. These bytes consist of one magic byte and four bytes representing the schema ID of the schema in the registry that is needed to decode that data. The bytes need to be removed so that the schema ID can be determined and the Avro data can be parsed. To manipulate the data, we need a couple of imports:
-        df2 = df.withColumn("fixedValue", fn.expr("substring(value, 6, length(value)-5)"))
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(col("avro.*"))
-    elif server.format == "json":
-        # TODO A good warning when the conversion to json fails
-        struct_type = to_struct_type(model.fields)
-        df2 = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
-
-        options = {"mode": "PERMISSIVE"}
-        df3 = df2.select(from_json(df2.value, struct_type, options).alias("json")).select(col("json.*"))
-    else:
+
+    match server.format:
+        case "avro":
+            process_avro_format(df, model_name, model)
+        case "json":
+            process_json_format(df, model_name, model)
+        case _:
+            raise DataContractException(
+                type="test",
+                name="Configuring Kafka checks",
+                result="warning",
+                reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
+                engine="datacontract",
+            )
+
+
+def process_avro_format(df, model_name, model):
+    try:
+        from pyspark.sql.avro.functions import from_avro
+        from pyspark.sql.functions import col, expr
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    avro_schema = to_avro_schema_json(model_name, model)
+    df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
+    options = {"mode": "PERMISSIVE"}
+    df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(
+        col("avro.*")
+    ).createOrReplaceTempView(model_name)
+
+
+def process_json_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, from_json
+    except ImportError as e:
         raise DataContractException(
-            type="test",
-            name="Configuring Kafka checks",
-            result="warning",
-            reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
+            original_exception=e,
         )
 
-    # df3.writeStream.toTable(model_name, checkpointLocation=tmp_dir + "/checkpoint")
-    df3.createOrReplaceTempView(model_name)
-    # print(spark.sql(f"select * from {model_name}").show())
+    struct_type = to_struct_type(model.fields)
+    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
+        from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
+    ).select(col("json.*")).createOrReplaceTempView(model_name)
 
 
 def get_auth_options():
+    """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
-    if kafka_sasl_username is None:
-        auth_options = {}
-    else:
-        kafka_sasl_jaas_config = f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
-        auth_options = {
-            "kafka.sasl.mechanism": "PLAIN",
-            "kafka.security.protocol": "SASL_SSL",
-            "kafka.sasl.jaas.config": kafka_sasl_jaas_config,
-        }
-    return auth_options
+    kafka_sasl_mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()
+
+    # Skip authentication if credentials are not provided
+    if not kafka_sasl_username or not kafka_sasl_password:
+        return {}
+
+    # SASL mechanisms supported by Kafka
+    jaas_config = {
+        "PLAIN": (
+            f"org.apache.kafka.common.security.plain.PlainLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-256": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-512": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        # Add more mechanisms as needed
+    }
+
+    # Validate SASL mechanism
+    if kafka_sasl_mechanism not in jaas_config:
+        raise ValueError(f"Unsupported SASL mechanism: {kafka_sasl_mechanism}")
+
+    # Return config
+    return {
+        "kafka.sasl.mechanism": kafka_sasl_mechanism,
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": jaas_config[kafka_sasl_mechanism],
+    }
 
 
 def to_struct_type(fields):
-    struct_fields = []
-    for field_name, field in fields.items():
-        struct_fields.append(to_struct_field(field_name, field))
-    return StructType(struct_fields)
-
-
-def to_struct_field(field_name: str, field: Field) -> StructField:
-    if field.type is None:
-        data_type = DataType()
-    if field.type in ["string", "varchar", "text"]:
-        data_type = StringType()
-    elif field.type in ["number", "decimal", "numeric"]:
-        data_type = DecimalType()
-    elif field.type in ["float", "double"]:
-        data_type = DoubleType()
-    elif field.type in ["integer", "int"]:
-        data_type = IntegerType()
-    elif field.type in ["long", "bigint"]:
-        data_type = LongType()
-    elif field.type in ["boolean"]:
-        data_type = BooleanType()
-    elif field.type in ["timestamp", "timestamp_tz"]:
-        data_type = TimestampType()
-    elif field.type in ["timestamp_ntz"]:
-        data_type = TimestampNTZType()
-    elif field.type in ["date"]:
-        data_type = DateType()
-    elif field.type in ["time"]:
-        data_type = DataType()
-    elif field.type in ["object", "record", "struct"]:
-        data_type = to_struct_type(field.fields)
-    elif field.type in ["binary"]:
-        data_type = BinaryType()
-    elif field.type in ["array"]:
-        # TODO support array structs
-        data_type = ArrayType()
-    elif field.type in ["null"]:
-        data_type = NullType()
-    else:
-        data_type = DataType()
+    try:
+        from pyspark.sql.types import StructType
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Convert field definitions to Spark StructType."""
+    return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])
+
+
+def to_struct_field(field_name: str, field: Field):
+    try:
+        from pyspark.sql.types import (
+            ArrayType,
+            BinaryType,
+            BooleanType,
+            DataType,
+            DateType,
+            DecimalType,
+            DoubleType,
+            IntegerType,
+            LongType,
+            NullType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampNTZType,
+            TimestampType,
+        )
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Map field definitions to Spark StructField using match-case."""
+    match field.type:
+        case "string" | "varchar" | "text":
+            data_type = StringType()
+        case "number" | "decimal" | "numeric":
+            data_type = DecimalType()
+        case "float" | "double":
+            data_type = DoubleType()
+        case "integer" | "int":
+            data_type = IntegerType()
+        case "long" | "bigint":
+            data_type = LongType()
+        case "boolean":
+            data_type = BooleanType()
+        case "timestamp" | "timestamp_tz":
+            data_type = TimestampType()
+        case "timestamp_ntz":
+            data_type = TimestampNTZType()
+        case "date":
+            data_type = DateType()
+        case "time":
+            data_type = DataType()  # Specific handling for time type
+        case "object" | "record" | "struct":
+            data_type = StructType(
+                [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+            )
+        case "binary":
+            data_type = BinaryType()
+        case "array":
+            element_type = (
+                StructType(
+                    [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+                )
+                if field.fields
+                else DataType()
+            )
+            data_type = ArrayType(element_type)
+        case "null":
+            data_type = NullType()
+        case _:
+            data_type = DataType()  # Fallback generic DataType
 
     return StructField(field_name, data_type, nullable=not field.required)
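
Note: with the reworked `get_auth_options()` above, the SASL mechanism is now chosen via `DATACONTRACT_KAFKA_SASL_MECHANISM` (PLAIN, SCRAM-SHA-256, SCRAM-SHA-512). The following is a minimal sketch of what the resolved options look like, assuming datacontract-cli 0.10.37 is installed; the credential values are placeholders.

```python
import os

from datacontract.engines.soda.connections.kafka import get_auth_options

# Placeholder credentials; in practice these come from the real environment, not code.
os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "my-api-key"
os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "my-api-secret"
os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "SCRAM-SHA-512"  # default is PLAIN

options = get_auth_options()
print(options["kafka.sasl.mechanism"])     # SCRAM-SHA-512
print(options["kafka.security.protocol"])  # SASL_SSL
print(options["kafka.sasl.jaas.config"])   # ScramLoginModule JAAS line with the credentials
```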
@@ -4,17 +4,20 @@ import yaml
 
 
 def to_snowflake_soda_configuration(server):
+    prefix = "DATACONTRACT_SNOWFLAKE_"
+    snowflake_soda_params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
+
+    # backward compatibility
+    if "connection_timeout" not in snowflake_soda_params:
+        snowflake_soda_params["connection_timeout"] = "5"  # minutes
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
-            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
-            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-            "warehouse": os.getenv("DATACONTRACT_SNOWFLAKE_WAREHOUSE"),
-            "connection_timeout": 5,  # minutes
+            **snowflake_soda_params,
         }
     }
     soda_configuration_str = yaml.dump(soda_configuration)
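
Note: the Snowflake connection above now forwards every `DATACONTRACT_SNOWFLAKE_*` environment variable into the Soda data source configuration (key lowercased) instead of a fixed set of keys. A minimal sketch of the resulting parameter dict, with placeholder values; whether Soda accepts a particular key is up to its Snowflake data source.

```python
import os

# Placeholder values; set these in the real environment instead of in code.
os.environ["DATACONTRACT_SNOWFLAKE_USERNAME"] = "svc_user"
os.environ["DATACONTRACT_SNOWFLAKE_PASSWORD"] = "***"
os.environ["DATACONTRACT_SNOWFLAKE_WAREHOUSE"] = "COMPUTE_WH"
os.environ["DATACONTRACT_SNOWFLAKE_ROLE"] = "ANALYST"

# Mirrors the comprehension introduced in the diff above.
prefix = "DATACONTRACT_SNOWFLAKE_"
params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
print(params)
# e.g. {'username': 'svc_user', 'password': '***', 'warehouse': 'COMPUTE_WH', 'role': 'ANALYST'}
```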
@@ -0,0 +1,43 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def to_sqlserver_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: sqlserver
+    host: host
+    port: '1433'
+    username: simple
+    password: simple_pass
+    database: database
+    schema: dbo
+    trusted_connection: false
+    encrypt: false
+    trust_server_certificate: false
+    driver: ODBC Driver 18 for SQL Server
+    """
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "sqlserver",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_SQLSERVER_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_SQLSERVER_PASSWORD", ""),
+            "database": server.database,
+            "schema": server.schema_,
+            "trusted_connection": os.getenv("DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION", False),
+            "trust_server_certificate": os.getenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", False),
+            "encrypt": os.getenv("DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION", True),
+            "driver": server.driver,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
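
Note: a minimal sketch of rendering the new SQL Server Soda configuration. The `Server` keyword arguments are assumptions about the pydantic model (the attribute names themselves appear in the code above); credentials are placeholders, and remaining attributes such as `schema` come from the contract's `servers` section.

```python
import os

from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration
from datacontract.model.data_contract_specification import Server

os.environ["DATACONTRACT_SQLSERVER_USERNAME"] = "sa"         # placeholder
os.environ["DATACONTRACT_SQLSERVER_PASSWORD"] = "change-me"  # placeholder

# Assumed constructor keywords matching the attributes read above (server.host, server.port, ...).
server = Server(
    type="sqlserver",
    host="localhost",
    port=1433,
    database="tempdb",
    driver="ODBC Driver 18 for SQL Server",
)
print(to_sqlserver_soda_configuration(server))  # YAML for the "data_source sqlserver" block
```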
@@ -0,0 +1,26 @@
+import os
+
+import yaml
+
+
+def to_trino_soda_configuration(server):
+    password = os.getenv("DATACONTRACT_TRINO_PASSWORD")
+    username = os.getenv("DATACONTRACT_TRINO_USERNAME")
+
+    data_source = {
+        "type": "trino",
+        "host": server.host,
+        "port": str(server.port),
+        "username": username,
+        "password": password,
+        "catalog": server.catalog,
+        "schema": server.schema_,
+    }
+
+    if password is None or password == "":
+        data_source["auth_type"] = "NoAuthentication"  # default is BasicAuthentication
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
@@ -1,8 +1,15 @@
 import json
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.model.data_contract_specification import Field
 
 
+class AvroExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_avro_schema_json(model_name, model_value)
+
+
 def to_avro_schema(model_name, model) -> dict:
     return to_avro_record(model_name, model.fields, model.description, model.namespace)
 
@@ -33,19 +40,70 @@ def to_avro_field(field, field_name):
     avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
-    avro_field["type"] = to_avro_type(field, field_name)
+    is_required_avro = field.required if field.required is not None else True
+    avro_type = to_avro_type(field, field_name)
+    avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
+            "type": "enum",
+            "name": field.title,
+            "symbols": field.enum,
+        }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]
+
+    if field.config:
+        if "avroDefault" in field.config:
+            if field.config.get("avroType") != "enum":
+                avro_field["default"] = field.config["avroDefault"]
+
     return avro_field
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in [
+                "timestamp-millis",
+                "timestamp-micros",
+                "local-timestamp-millis",
+                "local-timestamp-micros",
+                "time-micros",
+            ]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroType"]
+
+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
         return "string"
-    elif field.type in ["number", "decimal", "numeric"]:
+    elif field.type in ["number", "numeric"]:
         # https://avro.apache.org/docs/1.11.1/specification/#decimal
         return "bytes"
-    elif field.type in ["float", "double"]:
+    elif field.type in ["decimal"]:
+        typeVal = {"type": "bytes", "logicalType": "decimal"}
+        if field.scale is not None:
+            typeVal["scale"] = field.scale
+        if field.precision is not None:
+            typeVal["precision"] = field.precision
+        return typeVal
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -54,20 +112,26 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["boolean"]:
         return "boolean"
     elif field.type in ["timestamp", "timestamp_tz"]:
-        return "string"
+        return {"type": "long", "logicalType": "timestamp-millis"}
     elif field.type in ["timestamp_ntz"]:
-        return "string"
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
     elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
+        if field.config is not None and "namespace" in field.config:
+            return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
         return to_avro_record(field_name, field.fields, field.description, None)
     elif field.type in ["binary"]:
         return "bytes"
     elif field.type in ["array"]:
-        # TODO support array structs
-        return "array"
+        return {"type": "array", "items": to_avro_type(field.items, field_name)}
     elif field.type in ["null"]:
         return "null"
     else:
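
Note: the reworked `to_avro_type` above now emits Avro logical types for decimals, dates, and timestamps, and resolves array item types. A minimal sketch of the mapping, assuming datacontract-cli 0.10.37; the `Field` keyword arguments follow the attributes referenced in the code (type, precision, scale, required) and the values are placeholders.

```python
from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

# Decimal fields now carry the "decimal" logical type plus scale/precision when set.
price = Field(type="decimal", precision=10, scale=2, required=True)
print(to_avro_type(price, "price"))
# {'type': 'bytes', 'logicalType': 'decimal', 'scale': 2, 'precision': 10}

# Timestamps map to long with timestamp-millis instead of plain strings.
created_at = Field(type="timestamp", required=True)
print(to_avro_type(created_at, "created_at"))
# {'type': 'long', 'logicalType': 'timestamp-millis'}
```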
@@ -3,35 +3,12 @@ from dataclasses import dataclass
 from enum import Enum
 from io import StringIO
 
+from datacontract.export.exporter import Exporter
 from datacontract.lint.resolve import inline_definitions_into_data_contract
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
 
-def to_avro_idl(contract: DataContractSpecification) -> str:
-    """Serialize the provided data contract specification into an Avro IDL string.
-
-    The data contract will be serialized as a protocol, with one record type
-    for each contained model. Model fields are mapped one-to-one to Avro IDL
-    record fields.
-    """
-    stream = StringIO()
-    to_avro_idl_stream(contract, stream)
-    return stream.getvalue()
-
-
-def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
-    """Serialize the provided data contract specification into Avro IDL."""
-    ir = _contract_to_avro_idl_ir(contract)
-    if ir.description:
-        stream.write(f"/** {contract.info.description} */\n")
-    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
-    for model_type in ir.model_types:
-        _write_model_type(model_type, stream)
-    stream.write("}\n")
-
-
 class AvroPrimitiveType(Enum):
     int = "int"
     long = "long"
@@ -86,6 +63,7 @@ class AvroIDLProtocol:
     model_types: list[AvroModelType]
 
 
+# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py
 avro_primitive_types = set(
     [
         "string",
@@ -108,6 +86,34 @@ avro_primitive_types = set(
 )
 
 
+class AvroIdlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_avro_idl(data_contract)
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
     result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
     match field.type:
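
Note: `to_avro_idl` and `to_avro_idl_stream` were relocated below the new `AvroIdlExporter`, which plugs them into the exporter factory. Calling the converter directly still works; the following is a minimal sketch with a hand-built specification, where the constructor keywords are assumptions about the pydantic models and the model/field values are placeholders.

```python
from datacontract.export.avro_idl_converter import to_avro_idl
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Tiny placeholder contract with a single model and field.
contract = DataContractSpecification(
    id="orders",
    info=Info(title="Orders", version="1.0.0", description="Example contract"),
    models={"orders": Model(fields={"order_id": Field(type="string", required=True)})},
)

print(to_avro_idl(contract))  # emits a protocol with one record per model
```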