datacontract-cli 0.10.21__py3-none-any.whl → 0.10.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. datacontract/breaking/breaking.py +1 -1
  2. datacontract/breaking/breaking_rules.py +1 -1
  3. datacontract/cli.py +25 -77
  4. datacontract/data_contract.py +14 -100
  5. datacontract/engines/data_contract_checks.py +735 -0
  6. datacontract/engines/data_contract_test.py +67 -0
  7. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  8. datacontract/engines/soda/check_soda_execute.py +37 -31
  9. datacontract/engines/soda/connections/{duckdb.py → duckdb_connection.py} +6 -5
  10. datacontract/engines/soda/connections/kafka.py +8 -3
  11. datacontract/export/avro_converter.py +2 -0
  12. datacontract/export/dbt_converter.py +13 -10
  13. datacontract/export/exporter.py +0 -2
  14. datacontract/export/exporter_factory.py +0 -12
  15. datacontract/export/odcs_v3_exporter.py +22 -3
  16. datacontract/export/sodacl_converter.py +22 -294
  17. datacontract/export/sql_type_converter.py +7 -2
  18. datacontract/imports/odcs_importer.py +6 -3
  19. datacontract/imports/odcs_v3_importer.py +3 -1
  20. datacontract/imports/sql_importer.py +229 -29
  21. datacontract/lint/resolve.py +17 -4
  22. datacontract/model/exceptions.py +4 -1
  23. datacontract/model/run.py +11 -4
  24. datacontract/output/junit_test_results.py +135 -0
  25. datacontract/output/output_format.py +10 -0
  26. datacontract/output/test_results_writer.py +79 -0
  27. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/METADATA +192 -215
  28. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/RECORD +33 -32
  29. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/WHEEL +1 -1
  30. datacontract/engines/soda/connections/dask.py +0 -28
  31. datacontract/export/odcs_v2_exporter.py +0 -124
  32. datacontract/imports/odcs_v2_importer.py +0 -177
  33. datacontract/lint/linters/example_model_linter.py +0 -91
  34. /datacontract/{model → breaking}/breaking_change.py +0 -0
  35. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/LICENSE +0 -0
  36. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/entry_points.txt +0 -0
  37. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.23.dist-info}/top_level.txt +0 -0
datacontract/engines/data_contract_test.py (+67 -0, new file)
@@ -0,0 +1,67 @@
+ import typing
+
+ from datacontract.engines.data_contract_checks import create_checks
+
+ if typing.TYPE_CHECKING:
+     from pyspark.sql import SparkSession
+
+ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
+     check_that_datacontract_contains_valid_server_configuration,
+ )
+ from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
+ from datacontract.engines.soda.check_soda_execute import check_soda_execute
+ from datacontract.model.data_contract_specification import DataContractSpecification
+ from datacontract.model.exceptions import DataContractException
+ from datacontract.model.run import ResultEnum, Run
+
+
+ def execute_data_contract_test(
+     data_contract_specification: DataContractSpecification,
+     run: Run,
+     server_name: str = None,
+     spark: "SparkSession" = None,
+ ):
+     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
+         raise DataContractException(
+             type="lint",
+             name="Check that data contract contains models",
+             result=ResultEnum.warning,
+             reason="Models block is missing. Skip executing tests.",
+             engine="datacontract",
+         )
+     server = get_server(data_contract_specification, server_name)
+     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
+     run.dataContractId = data_contract_specification.id
+     run.dataContractVersion = data_contract_specification.info.version
+     run.dataProductId = server.dataProductId
+     run.outputPortId = server.outputPortId
+     run.server = server_name
+
+     run.checks.extend(create_checks(data_contract_specification, server))
+
+     # TODO check server is supported type for nicer error messages
+     # TODO check server credentials are complete for nicer error messages
+     if server.format == "json" and server.type != "kafka":
+         check_jsonschema(run, data_contract_specification, server)
+     check_soda_execute(run, data_contract_specification, server, spark)
+
+
+ def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+     """Get the server configuration from the data contract specification.
+
+     Args:
+         data_contract_specification: The data contract specification
+         server_name: Optional name of the server to use. If not provided, uses the first server.
+
+     Returns:
+         The selected server configuration
+     """
+
+     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)
+
+     if server_name:
+         server = data_contract_specification.servers.get(server_name)
+     else:
+         server_name = list(data_contract_specification.servers.keys())[0]
+         server = data_contract_specification.servers.get(server_name)
+     return server
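The new engines/data_contract_test.py module centralizes the test run: it verifies that models and a usable server exist, pre-registers checks via create_checks(), and then dispatches to the JSON Schema and Soda engines. A minimal sketch of how this path is typically reached through the package's public facade — assuming the DataContract class keeps its documented constructor and that test() delegates to execute_data_contract_test(), which this diff does not show directly:

from datacontract.data_contract import DataContract

# Sketch only: resolve the contract, run the tests, inspect the resulting Run object.
data_contract = DataContract(data_contract_file="datacontract.yaml", server="production")
run = data_contract.test()

if not run.has_passed():
    for check in run.checks:
        print(f"{check.result}: {check.name}")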
datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py (+2 -3)
@@ -1,12 +1,11 @@
  from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.exceptions import DataContractException
- from datacontract.model.run import Run


  def check_that_datacontract_contains_valid_server_configuration(
-     run: Run, data_contract: DataContractSpecification, server_name: str
+     data_contract: DataContractSpecification, server_name: str | None
  ):
-     if data_contract.servers is None:
+     if data_contract.servers is None or len(data_contract.servers) == 0:
          raise DataContractException(
              type="lint",
              name="Check that data contract contains valid server configuration",
datacontract/engines/soda/check_soda_execute.py (+37 -31)
@@ -1,8 +1,9 @@
  import logging
+ import uuid

  from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
  from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
- from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
+ from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
  from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
  from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
  from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
@@ -13,7 +14,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  from datacontract.model.run import Check, Log, ResultEnum, Run


- def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
+ def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
      from soda.common.config_helper import ConfigHelper

      ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -80,8 +81,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          scan.set_data_source_name("datacontract-cli")
      elif server.type == "kafka":
          if spark is None:
-             spark = create_spark_session(tmp_dir)
-         read_kafka_topic(spark, data_contract, server, tmp_dir)
+             spark = create_spark_session()
+         read_kafka_topic(spark, data_contract, server)
          scan.add_spark_session(spark, data_source_name=server.type)
          scan.set_data_source_name(server.type)
      elif server.type == "sqlserver":
@@ -106,37 +107,34 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
          return

-     # Don't check types for json format, as they are checked with json schema
-     # Don't check types for avro format, as they are checked with avro schema
-     # Don't check types for csv format, as they are hard to detect
-     server_type = server.type
-     check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
-     sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
+     sodacl_yaml_str = to_sodacl_yaml(run)
      # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
      scan.add_sodacl_yaml_str(sodacl_yaml_str)

      # Execute the scan
-     logging.info("Starting soda scan")
+     logging.info("Starting soda scan with checks:\n" + sodacl_yaml_str)
      scan.execute()
      logging.info("Finished soda scan")

      # pprint.PrettyPrinter(indent=2).pprint(scan.build_scan_results())

      scan_results = scan.get_scan_results()
-     for c in scan_results.get("checks"):
-         check = Check(
-             type="schema",
-             result=to_result(c),
-             reason=", ".join(c.get("outcomeReasons")),
-             name=c.get("name"),
-             model=c.get("table"),
-             field=c.get("column"),
-             engine="soda-core",
-             diagnostics=c.get("diagnostics"),
-         )
-         update_reason(check, c)
-         run.checks.append(check)
+     for scan_result in scan_results.get("checks"):
+         name = scan_result.get("name")
+         check = get_check(run, scan_result)
+         if check is None:
+             check = Check(
+                 id=str(uuid.uuid4()),
+                 category="custom",
+                 type="custom",
+                 name=name,
+                 engine="soda-core",
+             )
+             run.checks.append(check)
+         check.result = to_result(scan_result)
+         check.reason = ", ".join(scan_result.get("outcomeReasons"))
+         check.diagnostics = scan_result.get("diagnostics")
+         update_reason(check, scan_result)

      for log in scan_results.get("logs"):
          run.logs.append(
@@ -152,8 +150,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          run.checks.append(
              Check(
                  type="general",
-                 name="Execute quality checks",
-                 result="warning",
+                 name="Data Contract Tests",
+                 result=ResultEnum.warning,
                  reason="Engine soda-core has errors. See the logs for details.",
                  engine="soda-core",
              )
@@ -161,14 +159,22 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
          return


- def to_result(c) -> str:
+ def get_check(run, scan_result) -> Check | None:
+     check_by_name = next((c for c in run.checks if c.key == scan_result.get("name")), None)
+     if check_by_name is not None:
+         return check_by_name
+
+     return None
+
+
+ def to_result(c) -> ResultEnum:
      soda_outcome = c.get("outcome")
      if soda_outcome == "pass":
-         return "passed"
+         return ResultEnum.passed
      elif soda_outcome == "fail":
-         return "failed"
+         return ResultEnum.failed
      else:
-         return soda_outcome
+         return ResultEnum.unknown


  def update_reason(check, c):
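Checks are now created up front (run.checks is filled by create_checks before the Soda scan) and scan results are matched back to them by key; only unmatched results fall through to an ad-hoc "custom" check. Conceptually the lookup is a first-match search, illustrated here with plain dicts rather than the real Check model:

# Conceptual stand-in for get_check()/to_result(); the real objects are datacontract.model.run.Check
checks = [{"key": "orders_row_count", "result": None}]
scan_result = {"name": "orders_row_count", "outcome": "pass"}

check = next((c for c in checks if c["key"] == scan_result["name"]), None)
if check is not None:
    check["result"] = "passed" if scan_result["outcome"] == "pass" else "failed"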
datacontract/engines/soda/connections/{duckdb.py → duckdb_connection.py} (+6 -5)
@@ -1,4 +1,5 @@
  import os
+ from typing import Any

  import duckdb

@@ -27,13 +28,13 @@ def get_duckdb_connection(data_contract, server, run: Run):
          run.log_info(f"Creating table {model_name} for {model_path}")

          if server.format == "json":
-             format = "auto"
+             json_format = "auto"
              if server.delimiter == "new_line":
-                 format = "newline_delimited"
+                 json_format = "newline_delimited"
              elif server.delimiter == "array":
-                 format = "array"
+                 json_format = "array"
              con.sql(f"""
-                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{format}', hive_partitioning=1);
+                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
              """)
          elif server.format == "parquet":
              con.sql(f"""
@@ -56,7 +57,7 @@ def get_duckdb_connection(data_contract, server, run: Run):
      return con


- def to_csv_types(model) -> dict:
+ def to_csv_types(model) -> dict[Any, str | None] | None:
      if model is None:
          return None
      columns = {}
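Renaming format to json_format avoids shadowing the Python builtin; the DuckDB view itself is unchanged. A rough, self-contained illustration of the resulting statement for a newline-delimited JSON server (model name and path are invented, and the file is assumed to exist):

import duckdb

con = duckdb.connect()
model_name, model_path, json_format = "orders", "data/orders.jsonl", "newline_delimited"
con.sql(f"""
    CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
""")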
datacontract/engines/soda/connections/kafka.py (+8 -3)
@@ -1,12 +1,14 @@
+ import atexit
  import logging
  import os
+ import tempfile

  from datacontract.export.avro_converter import to_avro_schema_json
  from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
  from datacontract.model.exceptions import DataContractException


- def create_spark_session(tmp_dir: str):
+ def create_spark_session():
      """Create and configure a Spark session."""

      try:
@@ -21,6 +23,9 @@ def create_spark_session(tmp_dir: str):
              original_exception=e,
          )

+     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+     atexit.register(tmp_dir.cleanup)
+
      spark = (
          SparkSession.builder.appName("datacontract")
          .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -37,7 +42,7 @@ def create_spark_session(tmp_dir: str):
      return spark


- def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server, tmp_dir):
+ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
      """Read and process data from a Kafka topic based on the server configuration."""

      logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -62,7 +67,7 @@ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Se
              type="test",
              name="Configuring Kafka checks",
              result="warning",
-             reason=f"Kafka format '{server.format}' is not supported. " f"Skip executing tests.",
+             reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
              engine="datacontract",
          )

datacontract/export/avro_converter.py (+2 -0)
@@ -108,6 +108,8 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
      elif field.type in ["time"]:
          return "long"
      elif field.type in ["object", "record", "struct"]:
+         if field.config is not None and "namespace" in field.config:
+             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
          return to_avro_record(field_name, field.fields, field.description, None)
      elif field.type in ["binary"]:
          return "bytes"
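With this change, a nested object/record/struct field can set its own Avro namespace through its config block. A hedged sketch of a field definition that would take the new branch (field names and namespace value are invented for illustration):

from datacontract.model.data_contract_specification import Field

# Hypothetical nested field; "namespace" in config is what to_avro_type() now picks up
address = Field(
    type="object",
    description="Shipping address",
    config={"namespace": "com.example.checkout"},
    fields={"street": Field(type="string"), "city": Field(type="string")},
)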
datacontract/export/dbt_converter.py (+13 -10)
@@ -9,7 +9,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica

  class DbtExporter(Exporter):
      def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-         return to_dbt_models_yaml(data_contract)
+         return to_dbt_models_yaml(data_contract, server)


  class DbtSourceExporter(Exporter):
@@ -27,15 +27,16 @@ class DbtStageExporter(Exporter):
          )


- def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
+ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None):
      dbt = {
          "version": 2,
          "models": [],
      }
+
      for model_key, model_value in data_contract_spec.models.items():
-         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec)
+         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec, adapter_type=server)
          dbt["models"].append(dbt_model)
-     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
+     return yaml.safe_dump(dbt, indent=2, sort_keys=False, allow_unicode=True)


  def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
@@ -60,7 +61,7 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
      if data_contract_spec.info.owner is not None:
          source["meta"] = {"owner": data_contract_spec.info.owner}
      if data_contract_spec.info.description is not None:
-         source["description"] = data_contract_spec.info.description
+         source["description"] = data_contract_spec.info.description.strip().replace("\n", " ")
      found_server = data_contract_spec.servers.get(server)
      adapter_type = None
      if found_server is not None:
@@ -87,14 +88,16 @@ def _to_dbt_source_table(
      }

      if model_value.description is not None:
-         dbt_model["description"] = model_value.description
+         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
      columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type)
      if columns:
          dbt_model["columns"] = columns
      return dbt_model


- def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContractSpecification) -> dict:
+ def _to_dbt_model(
+     model_key, model_value: Model, data_contract_spec: DataContractSpecification, adapter_type: Optional[str]
+ ) -> dict:
      dbt_model = {
          "name": model_key,
      }
@@ -108,8 +111,8 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
      if _supports_constraints(model_type):
          dbt_model["config"]["contract"] = {"enforced": True}
      if model_value.description is not None:
-         dbt_model["description"] = model_value.description
-     columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), None)
+         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
+     columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type)
      if columns:
          dbt_model["columns"] = columns
      return dbt_model
@@ -171,7 +174,7 @@ def _to_column(
          {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
      )
      if field.description is not None:
-         column["description"] = field.description
+         column["description"] = field.description.strip().replace("\n", " ")
      if field.required:
          if supports_constraints:
              column.setdefault("constraints", []).append({"type": "not_null"})
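Descriptions are now normalized before being written, and safe_dump is used, so multi-line contract descriptions collapse to a single line in the generated dbt YAML. Roughly, with an invented description:

import yaml

description = "Orders placed in the webshop.\nOne row per order."
dbt_model = {"name": "orders", "description": description.strip().replace("\n", " ")}
print(yaml.safe_dump({"version": 2, "models": [dbt_model]}, indent=2, sort_keys=False, allow_unicode=True))
# ... description: Orders placed in the webshop. One row per order.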
datacontract/export/exporter.py (+0 -2)
@@ -25,8 +25,6 @@ class ExportFormat(str, Enum):
      dbt_sources = "dbt-sources"
      dbt_staging_sql = "dbt-staging-sql"
      odcs = "odcs"
-     odcs_v2 = "odcs_v2"
-     odcs_v3 = "odcs_v3"
      rdf = "rdf"
      avro = "avro"
      protobuf = "protobuf"
datacontract/export/exporter_factory.py (+0 -12)
@@ -107,18 +107,6 @@ exporter_factory.register_lazy_exporter(
      class_name="JsonSchemaExporter",
  )

- exporter_factory.register_lazy_exporter(
-     name=ExportFormat.odcs_v2,
-     module_path="datacontract.export.odcs_v2_exporter",
-     class_name="OdcsV2Exporter",
- )
-
- exporter_factory.register_lazy_exporter(
-     name=ExportFormat.odcs_v3,
-     module_path="datacontract.export.odcs_v3_exporter",
-     class_name="OdcsV3Exporter",
- )
-
  exporter_factory.register_lazy_exporter(
      name=ExportFormat.odcs,
      module_path="datacontract.export.odcs_v3_exporter",
datacontract/export/odcs_v3_exporter.py (+22 -3)
@@ -19,7 +19,7 @@ def to_odcs_v3_yaml(data_contract_spec: DataContractSpecification) -> str:
          "name": data_contract_spec.info.title,
          "version": data_contract_spec.info.version,
          "domain": data_contract_spec.info.owner,
-         "status": data_contract_spec.info.status,
+         "status": to_status(data_contract_spec.info.status),
      }

      if data_contract_spec.terms is not None:
@@ -217,9 +217,9 @@ def to_property(field_name: str, field: Field) -> dict:
      if field.description is not None:
          property["description"] = field.description
      if field.required is not None:
-         property["isNullable"] = not field.required
+         property["nullable"] = not field.required
      if field.unique is not None:
-         property["isUnique"] = field.unique
+         property["unique"] = field.unique
      if field.classification is not None:
          property["classification"] = field.classification
      if field.examples is not None:
@@ -312,3 +312,22 @@ def to_odcs_quality(quality):
      if quality.implementation is not None:
          quality_dict["implementation"] = quality.implementation
      return quality_dict
+
+
+ def to_status(status):
+     """Convert the data contract status to ODCS v3 format."""
+     if status is None:
+         return "draft"  # Default to draft if no status is provided
+
+     # Valid status values according to ODCS v3.0.1 spec
+     valid_statuses = ["proposed", "draft", "active", "deprecated", "retired"]
+
+     # Convert to lowercase for comparison
+     status_lower = status.lower()
+
+     # If status is already valid, return it as is
+     if status_lower in valid_statuses:
+         return status_lower
+
+     # Default to "draft" for any non-standard status
+     return "draft"
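In effect, statuses are case-normalized and anything outside the ODCS v3 vocabulary falls back to draft. Exercising the to_status function shown above:

print(to_status("Active"))          # "active"  (case-normalized)
print(to_status("in development"))  # "draft"   (not in the ODCS v3 list)
print(to_status(None))              # "draft"   (default)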