datacontract-cli 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datacontract-cli might be problematic.

Files changed (36)
  1. datacontract/breaking/breaking_rules.py +4 -0
  2. datacontract/catalog/catalog.py +76 -0
  3. datacontract/cli.py +39 -3
  4. datacontract/data_contract.py +12 -1
  5. datacontract/engines/fastjsonschema/check_jsonschema.py +1 -2
  6. datacontract/engines/soda/check_soda_execute.py +9 -15
  7. datacontract/engines/soda/connections/duckdb.py +83 -14
  8. datacontract/engines/soda/connections/kafka.py +108 -105
  9. datacontract/export/avro_idl_converter.py +1 -2
  10. datacontract/export/dbt_converter.py +1 -2
  11. datacontract/export/great_expectations_converter.py +1 -2
  12. datacontract/export/html_export.py +3 -2
  13. datacontract/export/jsonschema_converter.py +1 -2
  14. datacontract/export/odcs_converter.py +1 -2
  15. datacontract/export/rdf_converter.py +1 -1
  16. datacontract/export/sodacl_converter.py +1 -2
  17. datacontract/export/terraform_converter.py +1 -2
  18. datacontract/imports/avro_importer.py +1 -2
  19. datacontract/imports/glue_importer.py +183 -0
  20. datacontract/imports/sql_importer.py +20 -9
  21. datacontract/integration/publish_opentelemetry.py +3 -6
  22. datacontract/lint/linters/example_model_linter.py +1 -2
  23. datacontract/lint/linters/field_pattern_linter.py +1 -2
  24. datacontract/lint/linters/notice_period_linter.py +1 -2
  25. datacontract/lint/linters/quality_schema_linter.py +1 -2
  26. datacontract/lint/resolve.py +9 -6
  27. datacontract/model/data_contract_specification.py +2 -0
  28. datacontract/templates/datacontract.html +76 -21
  29. datacontract/templates/index.html +168 -0
  30. datacontract/templates/style/output.css +113 -4
  31. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/METADATA +180 -102
  32. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/RECORD +36 -33
  33. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/LICENSE +0 -0
  34. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/WHEEL +0 -0
  35. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/entry_points.txt +0 -0
  36. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/top_level.txt +0 -0
datacontract/breaking/breaking_rules.py CHANGED
@@ -20,6 +20,10 @@ class BreakingRules:
     field_ref_removed = Severity.WARNING
     field_ref_updated = Severity.WARNING
 
+    field_title_added = Severity.INFO
+    field_title_removed = Severity.INFO
+    field_title_updated = Severity.INFO
+
     field_type_added = Severity.WARNING
     field_type_removed = Severity.WARNING
     field_type_updated = Severity.ERROR
datacontract/catalog/catalog.py ADDED
@@ -0,0 +1,76 @@
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+
+import pytz
+from jinja2 import PackageLoader, Environment, select_autoescape
+
+from datacontract.data_contract import DataContract
+from datacontract.export.html_export import get_version
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification
+
+
+def create_data_contract_html(contracts, file: Path, path: Path):
+    data_contract = DataContract(data_contract_file=f"{file.absolute()}", inline_definitions=True, inline_quality=True)
+    html = data_contract.export(export_format="html")
+    spec = data_contract.get_data_contract_specification()
+    file_without_suffix = file.with_suffix(".html")
+    html_filepath = path / file_without_suffix
+    html_filepath.parent.mkdir(parents=True, exist_ok=True)
+    with open(html_filepath, "w") as f:
+        f.write(html)
+    contracts.append(
+        DataContractView(
+            html_filepath=html_filepath,
+            html_link=file_without_suffix,
+            spec=spec,
+        )
+    )
+    print(f"Created {html_filepath}")
+
+
+@dataclass
+class DataContractView:
+    """Class for keeping track of an item in inventory."""
+
+    html_filepath: Path
+    html_link: Path
+    spec: DataContractSpecification
+
+
+def create_index_html(contracts, path):
+    index_filepath = path / "index.html"
+    with open(index_filepath, "w") as f:
+        # Load templates from templates folder
+        package_loader = PackageLoader("datacontract", "templates")
+        env = Environment(
+            loader=package_loader,
+            autoescape=select_autoescape(
+                enabled_extensions="html",
+                default_for_string=True,
+            ),
+        )
+
+        # Load the required template
+        # needs to be included in /MANIFEST.in
+        template = env.get_template("index.html")
+
+        # needs to be included in /MANIFEST.in
+        style_content, _, _ = package_loader.get_source(env, "style/output.css")
+
+        tz = pytz.timezone("UTC")
+        now = datetime.now(tz)
+        formatted_date = now.strftime("%d %b %Y %H:%M:%S UTC")
+        datacontract_cli_version = get_version()
+
+        # Render the template with necessary data
+        html_string = template.render(
+            style=style_content,
+            formatted_date=formatted_date,
+            datacontract_cli_version=datacontract_cli_version,
+            contracts=contracts,
+            contracts_size=len(contracts),
+        )
+        f.write(html_string)
+    print(f"Created {index_filepath}")
datacontract/cli.py CHANGED
@@ -1,5 +1,6 @@
 from enum import Enum
 from importlib import metadata
+from pathlib import Path
 from typing import Iterable, Optional
 
 import typer
@@ -10,6 +11,8 @@ from rich.table import Table
 from typer.core import TyperGroup
 from typing_extensions import Annotated
 
+from datacontract.catalog.catalog import create_index_html, \
+    create_data_contract_html
 from datacontract.data_contract import DataContract
 from datacontract.init.download_datacontract_file import \
     download_datacontract_file, FileExistsException
@@ -160,6 +163,7 @@ class ExportFormat(str, Enum):
 @app.command()
 def export(
     format: Annotated[ExportFormat, typer.Option(help="The export format.")],
+    output: Annotated[Path, typer.Option(help="Specify the file path where the exported data will be saved. If no path is provided, the output will be printed to stdout.")] = None,
     server: Annotated[str, typer.Option(help="The server name to export.")] = None,
     model: Annotated[
         str,
@@ -169,10 +173,12 @@
             "models (default)."
         ),
     ] = "all",
+    # TODO: this should be a subcommand
     rdf_base: Annotated[
         Optional[str],
         typer.Option(help="[rdf] The base URI used to generate the RDF graph.", rich_help_panel="RDF Options"),
     ] = None,
+    # TODO: this should be a subcommand
     sql_server_type: Annotated[
         Optional[str],
         typer.Option(
@@ -195,26 +201,56 @@
         sql_server_type=sql_server_type,
     )
     # Don't interpret console markup in output.
-    console.print(result, markup=False)
+    if output is None:
+        console.print(result, markup=False)
+    else:
+        with output.open('w') as f:
+            f.write(result)
+        console.print(f"Written result to {output}")
 
 
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    glue = "glue"
 
 
 @app.command(name="import")
 def import_(
     format: Annotated[ImportFormat, typer.Option(help="The format of the source file.")],
-    source: Annotated[str, typer.Option(help="The path to the file that should be imported.")],
+    source: Annotated[str, typer.Option(help="The path to the file or Glue Database that should be imported.")],
 ):
     """
-    Create a data contract from the given source file. Prints to stdout.
+    Create a data contract from the given source location. Prints to stdout.
     """
     result = DataContract().import_from_source(format, source)
     console.print(result.to_yaml())
 
 
+@app.command(name="catalog")
+def catalog(
+    files: Annotated[
+        Optional[str], typer.Option(help="Glob pattern for the data contract files to include in the catalog.")
+    ] = "*.yaml",
+    output: Annotated[Optional[str], typer.Option(help="Output directory for the catalog html files.")] = "catalog/",
+):
+    """
+    Create an html catalog of data contracts.
+    """
+    path = Path(output)
+    path.mkdir(parents=True, exist_ok=True)
+    console.print(f"Created {output}")
+
+    contracts = []
+    for file in Path().glob(files):
+        try:
+            create_data_contract_html(contracts, file, path)
+        except Exception as e:
+            console.print(f"Skipped {file} due to error: {e}")
+
+    create_index_html(contracts, path)
+
+
 @app.command()
 def breaking(
     location_old: Annotated[str, typer.Argument(help="The location (url or path) of the old data contract yaml.")],
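
Taken together, the CLI changes add an --output option to export, a glue import format, and the new catalog command shown above. The import path is a thin wrapper around DataContract.import_from_source; a hedged sketch of calling it programmatically ("my_glue_database" is a placeholder name, and AWS credentials are assumed to be available in the environment):

    from datacontract.data_contract import DataContract

    # "glue" routes to import_glue() in data_contract.py (next section);
    # "my_glue_database" is a hypothetical AWS Glue database name
    spec = DataContract().import_from_source("glue", "my_glue_database")
    print(spec.to_yaml())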
datacontract/data_contract.py CHANGED
@@ -30,6 +30,7 @@ from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.export.sql_converter import to_sql_ddl, to_sql_query
 from datacontract.export.terraform_converter import to_terraform
 from datacontract.imports.avro_importer import import_avro
+from datacontract.imports.glue_importer import import_glue
 from datacontract.imports.sql_importer import import_sql
 from datacontract.integration.publish_datamesh_manager import \
     publish_datamesh_manager
@@ -66,6 +67,7 @@ class DataContract:
         publish_to_opentelemetry: bool = False,
         spark: SparkSession = None,
         inline_definitions: bool = False,
+        inline_quality: bool = False,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str
@@ -77,6 +79,7 @@
         self._publish_to_opentelemetry = publish_to_opentelemetry
         self._spark = spark
         self._inline_definitions = inline_definitions
+        self._inline_quality = inline_quality
         self.all_linters = {
             ExampleModelLinter(),
             QualityUsesSchemaLinter(),
@@ -105,6 +108,7 @@
             self._data_contract,
             self._schema_location,
             inline_definitions=True,
+            inline_quality=True,
         )
         run.checks.append(
             Check(type="lint", result="passed", name="Data contract is syntactically valid", engine="datacontract")
@@ -273,11 +277,16 @@
             data_contract=self._data_contract,
             schema_location=self._schema_location,
             inline_definitions=self._inline_definitions,
+            inline_quality=self._inline_quality,
         )
 
     def export(self, export_format, model: str = "all", rdf_base: str = None, sql_server_type: str = "auto") -> str:
         data_contract = resolve.resolve_data_contract(
-            self._data_contract_file, self._data_contract_str, self._data_contract, inline_definitions=True
+            self._data_contract_file,
+            self._data_contract_str,
+            self._data_contract,
+            inline_definitions=True,
+            inline_quality=True,
         )
         if export_format == "jsonschema":
             if data_contract.models is None:
@@ -482,6 +491,8 @@
             data_contract_specification = import_sql(data_contract_specification, format, source)
         elif format == "avro":
             data_contract_specification = import_avro(data_contract_specification, source)
+        elif format == "glue":
+            data_contract_specification = import_glue(data_contract_specification, source)
         else:
             print(f"Import format {format} not supported.")
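
inline_quality follows the same route as inline_definitions throughout: accepted by the constructor, forwarded to resolve.resolve_data_contract, and forced on for lint and export. A short sketch of the flag, in the same form the new catalog module uses it ("datacontract.yaml" is a placeholder path):

    from datacontract.data_contract import DataContract

    # "datacontract.yaml" is a placeholder; inline_quality resolves referenced
    # quality definitions into the specification before it is processed
    dc = DataContract(data_contract_file="datacontract.yaml", inline_definitions=True, inline_quality=True)
    print(dc.export(export_format="html"))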
 
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -6,8 +6,7 @@ import fastjsonschema
 from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Run, Check
 
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -3,20 +3,14 @@ import logging
 from pyspark.sql import SparkSession
 from soda.scan import Scan
 
-from datacontract.engines.soda.connections.bigquery import \
-    to_bigquery_soda_configuration
-from datacontract.engines.soda.connections.databricks import \
-    to_databricks_soda_configuration
+from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
-from datacontract.engines.soda.connections.kafka import create_spark_session, \
-    read_kafka_topic
-from datacontract.engines.soda.connections.postgres import \
-    to_postgres_soda_configuration
-from datacontract.engines.soda.connections.snowflake import \
-    to_snowflake_soda_configuration
+from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
+from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
+from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.run import Run, Check, Log
 
 
@@ -30,9 +24,9 @@ def check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()
 
-    if server.type == "s3" or server.type == "local":
-        if server.format in ["json", "parquet", "csv"]:
-            con = get_duckdb_connection(data_contract, server)
+    if server.type in ["s3", "azure", "local"]:
+        if server.format in ["json", "parquet", "csv", "delta"]:
+            con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
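
With this change, azure servers and the delta format take the same DuckDB route as s3 and local, and the Run is passed down so connection setup can log into the test run. A hedged sketch of exercising the branch ("datacontract.yaml" is a placeholder and is assumed to define a server of type s3, azure, or local with format json, parquet, csv, or delta):

    from datacontract.data_contract import DataContract

    # placeholder contract file with a matching server entry
    run = DataContract(data_contract_file="datacontract.yaml").test()
    print(run.result)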
datacontract/engines/soda/connections/duckdb.py CHANGED
@@ -1,23 +1,28 @@
-import logging
 import os
 
+from deltalake import DeltaTable
+
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
+from datacontract.model.run import Run
 
 
-def get_duckdb_connection(data_contract, server):
+def get_duckdb_connection(data_contract, server, run: Run):
     con = duckdb.connect(database=":memory:")
     path: str = ""
     if server.type == "local":
         path = server.path
     if server.type == "s3":
         path = server.location
-        setup_s3_connection(con, server)
+        setup_s3_connection(con, server)
+    if server.type == "azure":
+        path = server.location
+        setup_azure_connection(con, server)
     for model_name, model in data_contract.models.items():
         model_path = path
         if "{model}" in model_path:
             model_path = model_path.format(model=model_name)
-        logging.info(f"Creating table {model_name} for {model_path}")
+        run.log_info(f"Creating table {model_name} for {model_path}")
 
         if server.format == "json":
             format = "auto"
@@ -34,6 +39,7 @@ def get_duckdb_connection(data_contract, server):
             """)
         elif server.format == "csv":
             columns = to_csv_types(model)
+            run.log_info("Using columns: " + str(columns))
             if columns is None:
                 con.sql(
                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
@@ -42,6 +48,21 @@ def get_duckdb_connection(data_contract, server):
                 con.sql(
                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
                 )
+        elif server.format == "delta":
+            if server.type == "azure":
+                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
+
+            storage_options = {
+                "AWS_ENDPOINT_URL": server.endpointUrl,
+                "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
+                "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
+                "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
+                "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
+            }
+
+            delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
+
+            con.register(model_name, delta_table_arrow)
     return con
 
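The delta branch bypasses DuckDB's file readers entirely: deltalake loads the table and hands it to DuckDB as a PyArrow dataset. The same pattern in isolation (the table URI and credentials are placeholders):

    import duckdb
    from deltalake import DeltaTable

    # placeholder URI and credentials, mirroring the storage_options built above
    storage_options = {
        "AWS_ACCESS_KEY_ID": "<key-id>",
        "AWS_SECRET_ACCESS_KEY": "<secret>",
        "AWS_REGION": "us-east-1",
    }
    con = duckdb.connect(database=":memory:")
    dataset = DeltaTable("s3://my-bucket/my-table", storage_options=storage_options).to_pyarrow_dataset()
    con.register("my_model", dataset)  # queryable like a view named "my_model"
    print(con.sql('SELECT COUNT(*) FROM "my_model"').fetchall())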
 
@@ -59,18 +80,66 @@ def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
-    # con.install_extension("httpfs")
-    # con.load_extension("httpfs")
+    s3_endpoint = "s3.amazonaws.com"
+    use_ssl = "true"
+    url_style = "vhost"
     if server.endpointUrl is not None:
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
-            con.sql("SET s3_use_ssl = 0; SET s3_url_style = 'path';")
-    con.sql(f"""
-        SET s3_endpoint = '{s3_endpoint}';
-    """)
+            use_ssl = "false"
+            url_style = 'path'
+
+
     if s3_access_key_id is not None:
         con.sql(f"""
-            SET s3_region = '{s3_region}';
-            SET s3_access_key_id = '{s3_access_key_id}';
-            SET s3_secret_access_key = '{s3_secret_access_key}';
-        """)
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+        """)
+
+    # con.sql(f"""
+    #     SET s3_region = '{s3_region}';
+    #     SET s3_access_key_id = '{s3_access_key_id}';
+    #     SET s3_secret_access_key = '{s3_secret_access_key}';
+    # """)
+    # else:
+    #     con.sql("""
+    #         RESET s3_region;
+    #         RESET s3_access_key_id;
+    #         RESET s3_secret_access_key;
+    #     """)
+    # con.sql("RESET s3_session_token")
+    # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))
+
+
+def setup_azure_connection(con, server):
+    tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
+    client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
+    client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+
+    if tenant_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")
+    if client_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_ID is not set")
+    if client_secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_SECRET is not set")
+
+    con.install_extension("azure")
+    con.load_extension("azure")
+
+    con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+    """)