datacontract-cli 0.10.20__py3-none-any.whl → 0.10.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: the registry flags this release of datacontract-cli as potentially problematic.
- datacontract/{web.py → api.py} +55 -3
- datacontract/breaking/breaking.py +1 -1
- datacontract/breaking/breaking_rules.py +1 -1
- datacontract/cli.py +32 -10
- datacontract/data_contract.py +14 -100
- datacontract/engines/data_contract_checks.py +735 -0
- datacontract/engines/data_contract_test.py +51 -0
- datacontract/engines/soda/check_soda_execute.py +36 -30
- datacontract/engines/soda/connections/kafka.py +8 -3
- datacontract/export/avro_converter.py +2 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/exporter.py +1 -2
- datacontract/export/exporter_factory.py +4 -12
- datacontract/export/sodacl_converter.py +22 -294
- datacontract/export/sql_type_converter.py +7 -2
- datacontract/imports/odcs_importer.py +6 -3
- datacontract/imports/odcs_v3_importer.py +2 -0
- datacontract/imports/sql_importer.py +229 -29
- datacontract/lint/urls.py +4 -4
- datacontract/model/data_contract_specification.py +130 -129
- datacontract/model/exceptions.py +4 -1
- datacontract/model/run.py +25 -18
- datacontract/templates/datacontract.html +16 -2
- datacontract/templates/partials/definition.html +3 -95
- datacontract/templates/partials/model_field.html +13 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/style/output.css +151 -152
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +238 -184
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +34 -34
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/export/odcs_v2_exporter.py +0 -124
- datacontract/imports/odcs_v2_importer.py +0 -177
- datacontract/lint/linters/example_model_linter.py +0 -91
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0
datacontract/engines/data_contract_test.py ADDED
@@ -0,0 +1,51 @@
+import typing
+
+from datacontract.engines.data_contract_checks import create_checks
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
+    check_that_datacontract_contains_valid_server_configuration,
+)
+from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
+from datacontract.engines.soda.check_soda_execute import check_soda_execute
+from datacontract.model.data_contract_specification import DataContractSpecification
+from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum, Run
+
+
+def execute_data_contract_test(
+    data_contract_specification: DataContractSpecification,
+    run: Run,
+    server_name: str = None,
+    spark: "SparkSession" = None,
+):
+    if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
+        raise DataContractException(
+            type="lint",
+            name="Check that data contract contains models",
+            result=ResultEnum.warning,
+            reason="Models block is missing. Skip executing tests.",
+            engine="datacontract",
+        )
+    check_that_datacontract_contains_valid_server_configuration(run, data_contract_specification, server_name)
+    if server_name:
+        server = data_contract_specification.servers.get(server_name)
+    else:
+        server_name = list(data_contract_specification.servers.keys())[0]
+        server = data_contract_specification.servers.get(server_name)
+    run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
+    run.dataContractId = data_contract_specification.id
+    run.dataContractVersion = data_contract_specification.info.version
+    run.dataProductId = server.dataProductId
+    run.outputPortId = server.outputPortId
+    run.server = server_name
+
+    run.checks.extend(create_checks(data_contract_specification, server))
+
+    # TODO check server is supported type for nicer error messages
+    # TODO check server credentials are complete for nicer error messages
+    if server.format == "json" and server.type != "kafka":
+        check_jsonschema(run, data_contract_specification, server)
+    check_soda_execute(run, data_contract_specification, server, spark)
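A minimal usage sketch of the new entry point (not part of the diff; it assumes the hunk above is the new datacontract/engines/data_contract_test.py module and that `spec` is a DataContractSpecification parsed elsewhere that defines a server named "production"):

    # Hypothetical call site; `spec` is assumed to be loaded elsewhere.
    from datacontract.engines.data_contract_test import execute_data_contract_test
    from datacontract.model.run import Run

    run = Run.create_run()  # Run.create_run() appears in the sodacl_converter hunk further down
    execute_data_contract_test(spec, run, server_name="production")
    for check in run.checks:
        print(check.name, check.result)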
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import uuid

 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
@@ -13,7 +14,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check, Log, ResultEnum, Run


-def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark
+def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark):
     from soda.common.config_helper import ConfigHelper

     ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -80,8 +81,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         scan.set_data_source_name("datacontract-cli")
     elif server.type == "kafka":
         if spark is None:
-            spark = create_spark_session(
-        read_kafka_topic(spark, data_contract, server
+            spark = create_spark_session()
+        read_kafka_topic(spark, data_contract, server)
         scan.add_spark_session(spark, data_source_name=server.type)
         scan.set_data_source_name(server.type)
     elif server.type == "sqlserver":
@@ -106,37 +107,34 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
         return

-
-    # Don't check types for avro format, as they are checked with avro schema
-    # Don't check types for csv format, as they are hard to detect
-    server_type = server.type
-    check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
-    sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
+    sodacl_yaml_str = to_sodacl_yaml(run)
     # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
     scan.add_sodacl_yaml_str(sodacl_yaml_str)

     # Execute the scan
-    logging.info("Starting soda scan")
+    logging.info("Starting soda scan with checks:\n" + sodacl_yaml_str)
     scan.execute()
     logging.info("Finished soda scan")

     # pprint.PrettyPrinter(indent=2).pprint(scan.build_scan_results())

     scan_results = scan.get_scan_results()
-    for
+    for scan_result in scan_results.get("checks"):
+        name = scan_result.get("name")
+        check = get_check(run, scan_result)
+        if check is None:
+            check = Check(
+                id=str(uuid.uuid4()),
+                category="custom",
+                type="custom",
+                name=name,
+                engine="soda-core",
+            )
+            run.checks.append(check)
+        check.result = to_result(scan_result)
+        check.reason = ", ".join(scan_result.get("outcomeReasons"))
+        check.diagnostics = scan_result.get("diagnostics")
+        update_reason(check, scan_result)

     for log in scan_results.get("logs"):
         run.logs.append(
@@ -152,8 +150,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         run.checks.append(
             Check(
                 type="general",
-                name="
-                result=
+                name="Data Contract Tests",
+                result=ResultEnum.warning,
                 reason="Engine soda-core has errors. See the logs for details.",
                 engine="soda-core",
            )
@@ -161,14 +159,22 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
        return


-def
+def get_check(run, scan_result) -> Check | None:
+    check_by_name = next((c for c in run.checks if c.key == scan_result.get("name")), None)
+    if check_by_name is not None:
+        return check_by_name
+
+    return None
+
+
+def to_result(c) -> ResultEnum:
     soda_outcome = c.get("outcome")
     if soda_outcome == "pass":
-        return
+        return ResultEnum.passed
     elif soda_outcome == "fail":
-        return
+        return ResultEnum.failed
     else:
-        return
+        return ResultEnum.unknown


 def update_reason(check, c):
datacontract/engines/soda/connections/kafka.py CHANGED
@@ -1,12 +1,14 @@
+import atexit
 import logging
 import os
+import tempfile

 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException


-def create_spark_session(
+def create_spark_session():
     """Create and configure a Spark session."""

     try:
@@ -21,6 +23,9 @@ def create_spark_session(tmp_dir: str):
             original_exception=e,
         )

+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+    atexit.register(tmp_dir.cleanup)
+
     spark = (
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -37,7 +42,7 @@ def create_spark_session(tmp_dir: str):
     return spark


-def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
     """Read and process data from a Kafka topic based on the server configuration."""

     logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -62,7 +67,7 @@ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Se
             type="test",
             name="Configuring Kafka checks",
             result="warning",
-            reason=f"Kafka format '{server.format}' is not supported.
+            reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
             engine="datacontract",
         )
datacontract/export/avro_converter.py CHANGED
@@ -108,6 +108,8 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["time"]:
         return "long"
     elif field.type in ["object", "record", "struct"]:
+        if field.config is not None and 'namespace' in field.config:
+            return to_avro_record(field_name ,field.fields ,field.description ,field.config['namespace'])
         return to_avro_record(field_name, field.fields, field.description, None)
     elif field.type in ["binary"]:
         return "bytes"
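The two added lines let a nested record type carry its own Avro namespace through the field-level config block. An illustrative sketch (the Field construction and the namespace value are assumptions; only to_avro_type and the 'namespace' config key come from the diff):

    from datacontract.export.avro_converter import to_avro_type
    from datacontract.model.data_contract_specification import Field

    address = Field(
        type="record",
        description="Shipping address",
        config={"namespace": "com.example.shipping"},  # picked up by the new branch
        fields={"city": Field(type="string")},
    )
    # Expected: a record schema dict whose namespace comes from config
    # (the exact shape depends on to_avro_record).
    print(to_avro_type(address, "address"))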
datacontract/export/custom_converter.py ADDED
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from jinja2 import Environment, FileSystemLoader
+
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+)
+
+
+class CustomExporter(Exporter):
+    """Exporter implementation for converting data contracts to Markdown."""
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model: Model,
+        server: str,
+        sql_server_type: str,
+        export_args: dict,
+    ) -> str:
+        """Exports a data contract to custom format with Jinja."""
+        template = export_args.get("template")
+        if template is None:
+            raise RuntimeError("Export to custom requires template argument.")
+
+        return to_custom(data_contract, template)
+
+
+def to_custom(data_contract: DataContractSpecification, template_path: Path) -> str:
+    template = get_template(template_path)
+    rendered_sql = template.render(data_contract=data_contract)
+    return rendered_sql
+
+
+def get_template(path: Path):
+    abosolute_path = Path(path).resolve()
+    env = Environment(loader=FileSystemLoader(str(abosolute_path.parent)))
+    return env.get_template(path.name)
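The new custom exporter renders the whole contract through a user-supplied Jinja template, which receives it as the data_contract variable. A small usage sketch (the template file name and content are invented for illustration; to_custom and the data_contract variable come from the code above, and `spec` is assumed to be a parsed DataContractSpecification):

    # contract_summary.jinja (hypothetical template):
    #   Contract {{ data_contract.id }} has {{ data_contract.models | length }} model(s).
    from pathlib import Path

    from datacontract.export.custom_converter import to_custom

    print(to_custom(spec, Path("contract_summary.jinja")))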
datacontract/export/exporter.py CHANGED
@@ -25,8 +25,6 @@ class ExportFormat(str, Enum):
     dbt_sources = "dbt-sources"
     dbt_staging_sql = "dbt-staging-sql"
     odcs = "odcs"
-    odcs_v2 = "odcs_v2"
-    odcs_v3 = "odcs_v3"
     rdf = "rdf"
     avro = "avro"
     protobuf = "protobuf"
@@ -45,6 +43,7 @@
     dcs = "dcs"
     markdown = "markdown"
     iceberg = "iceberg"
+    custom = "custom"

     @classmethod
     def get_supported_formats(cls):
datacontract/export/exporter_factory.py CHANGED
@@ -107,18 +107,6 @@ exporter_factory.register_lazy_exporter(
     class_name="JsonSchemaExporter",
 )

-exporter_factory.register_lazy_exporter(
-    name=ExportFormat.odcs_v2,
-    module_path="datacontract.export.odcs_v2_exporter",
-    class_name="OdcsV2Exporter",
-)
-
-exporter_factory.register_lazy_exporter(
-    name=ExportFormat.odcs_v3,
-    module_path="datacontract.export.odcs_v3_exporter",
-    class_name="OdcsV3Exporter",
-)
-
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.odcs,
     module_path="datacontract.export.odcs_v3_exporter",
@@ -206,3 +194,7 @@ exporter_factory.register_lazy_exporter(
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter"
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.custom, module_path="datacontract.export.custom_converter", class_name="CustomExporter"
+)
datacontract/export/sodacl_converter.py CHANGED
@@ -1,302 +1,30 @@
-from typing import List
-from venv import logger
-
 import yaml

+from datacontract.engines.data_contract_checks import create_checks
 from datacontract.export.exporter import Exporter
-from datacontract.
-from datacontract.model.data_contract_specification import DataContractSpecification, Quality
+from datacontract.model.run import Run


 class SodaExporter(Exporter):
     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
-) -> str:
-def to_checks(model_key, model_value, server_type: str, check_types: bool):
-    checks = []
-    model_name = to_model_name(model_key, model_value, server_type)
-    fields = model_value.fields
-
-    quote_field_name = server_type in ["postgres", "sqlserver"]
-
-    for field_name, field in fields.items():
-        checks.append(check_field_is_present(field_name))
-        if check_types and field.type is not None:
-            sql_type = convert_to_sql_type(field, server_type)
-            checks.append(check_field_type(field_name, sql_type))
-        if field.required:
-            checks.append(check_field_required(field_name, quote_field_name))
-        if field.unique:
-            checks.append(check_field_unique(field_name, quote_field_name))
-        if field.minLength is not None:
-            checks.append(check_field_min_length(field_name, field.minLength, quote_field_name))
-        if field.maxLength is not None:
-            checks.append(check_field_max_length(field_name, field.maxLength, quote_field_name))
-        if field.minimum is not None:
-            checks.append(check_field_minimum(field_name, field.minimum, quote_field_name))
-        if field.maximum is not None:
-            checks.append(check_field_maximum(field_name, field.maximum, quote_field_name))
-        if field.exclusiveMinimum is not None:
-            checks.append(check_field_minimum(field_name, field.exclusiveMinimum, quote_field_name))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMinimum, quote_field_name))
-        if field.exclusiveMaximum is not None:
-            checks.append(check_field_maximum(field_name, field.exclusiveMaximum, quote_field_name))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMaximum, quote_field_name))
-        if field.pattern is not None:
-            checks.append(check_field_regex(field_name, field.pattern, quote_field_name))
-        if field.enum is not None and len(field.enum) > 0:
-            checks.append(check_field_enum(field_name, field.enum, quote_field_name))
-        if field.quality is not None and len(field.quality) > 0:
-            quality_list = check_quality_list(model_name, field_name, field.quality)
-            if (quality_list is not None) and len(quality_list) > 0:
-                checks.append(quality_list)
-        # TODO references: str = None
-        # TODO format
-
-    if model_value.quality is not None and len(model_value.quality) > 0:
-        quality_list = check_quality_list(model_name, None, model_value.quality)
-        if (quality_list is not None) and len(quality_list) > 0:
-            checks.append(quality_list)
-
-    checks_for_model_key = f"checks for {model_name}"
-
-    if quote_field_name:
-        checks_for_model_key = f'checks for "{model_name}"'
-
-    return checks_for_model_key, checks
-
-
-def to_model_name(model_key, model_value, server_type):
-    if server_type == "databricks":
-        if model_value.config is not None and "databricksTable" in model_value.config:
-            return model_value.config["databricksTable"]
-    if server_type == "snowflake":
-        if model_value.config is not None and "snowflakeTable" in model_value.config:
-            return model_value.config["snowflakeTable"]
-    if server_type == "sqlserver":
-        if model_value.config is not None and "sqlserverTable" in model_value.config:
-            return model_value.config["sqlserverTable"]
-    if server_type == "postgres" or server_type == "postgresql":
-        if model_value.config is not None and "postgresTable" in model_value.config:
-            return model_value.config["postgresTable"]
-    return model_key
-
-
-def check_field_is_present(field_name):
-    return {
-        "schema": {
-            "name": f"Check that field {field_name} is present",
-            "fail": {
-                "when required column missing": [field_name],
-            },
-        }
-    }
-
-
-def check_field_type(field_name: str, type: str):
-    return {
-        "schema": {
-            "name": f"Check that field {field_name} has type {type}",
-            "fail": {"when wrong column type": {field_name: type}},
-        }
-    }
-
-
-def check_field_required(field_name: str, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-
-    return {f"missing_count({field_name}) = 0": {"name": f"Check that required field {field_name} has no null values"}}
-
-
-def check_field_unique(field_name, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"duplicate_count({field_name}) = 0": {"name": f"Check that unique field {field_name} has no duplicate values"}
-    }
-
-
-def check_field_min_length(field_name, min_length, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a min length of {min_length}",
-            "valid min length": min_length,
-        }
-    }
-
-
-def check_field_max_length(field_name, max_length, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a max length of {max_length}",
-            "valid max length": max_length,
-        }
-    }
-
-
-def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a minimum of {minimum}",
-            "valid min": minimum,
-        }
-    }
-
-
-def check_field_maximum(field_name, maximum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a maximum of {maximum}",
-            "valid max": maximum,
-        }
-    }
-
-
-def check_field_not_equal(field_name, value, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} is not equal to {value}",
-            "invalid values": [value],
-        }
-    }
-
-
-def check_field_enum(field_name, enum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} only contains enum values {enum}",
-            "valid values": enum,
-        }
-    }
-
-
-def check_field_regex(field_name, pattern, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} matches regex pattern {pattern}",
-            "valid regex": pattern,
-        }
-    }
-
-
-def check_quality_list(model_name, field_name, quality_list: List[Quality]):
-    checks = {}
-
-    count = 0
-    for quality in quality_list:
-        if quality.type == "sql":
-            if field_name is None:
-                metric_name = f"{model_name}_quality_sql_{count}"
+        run = Run.create_run()
+        run.checks.extend(create_checks(data_contract, server))
+        return to_sodacl_yaml(run)
+
+
+def to_sodacl_yaml(run: Run) -> str:
+    sodacl_dict = {}
+    for run_check in run.checks:
+        if run_check.engine != "soda" or run_check.language != "sodacl":
+            continue
+        check_yaml_str = run_check.implementation
+        check_yaml_dict = yaml.safe_load(check_yaml_str)
+        for key, value in check_yaml_dict.items():
+            if key in sodacl_dict:
+                if isinstance(sodacl_dict[key], list) and isinstance(value, list):
+                    sodacl_dict[key].extend(value)
+                else:
+                    sodacl_dict[key].update(value)
             else:
-            query = prepare_query(quality, model_name, field_name)
-            if query is None:
-                logger.warning(f"Quality check {metric_name} has no query")
-                continue
-            if threshold is None:
-                logger.warning(f"Quality check {metric_name} has no valid threshold")
-                continue
-            checks[f"{metric_name} {threshold}"] = {f"{metric_name} query": query}
-            count += 1
-
-    return checks
-
-
-def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> str | None:
-    if quality.query is None:
-        return None
-    if quality.query == "":
-        return None
-
-    query = quality.query
-
-    query = query.replace("{model}", model_name)
-    query = query.replace("{table}", model_name)
-
-    if field_name is not None:
-        query = query.replace("{field}", field_name)
-        query = query.replace("{column}", field_name)
-
-    return query
-
-
-def to_sodacl_threshold(quality: Quality) -> str | None:
-    if quality.mustBe is not None:
-        return f"= {quality.mustBe}"
-    if quality.mustNotBe is not None:
-        return f"!= {quality.mustNotBe}"
-    if quality.mustBeGreaterThan is not None:
-        return f"> {quality.mustBeGreaterThan}"
-    if quality.mustBeGreaterThanOrEqualTo is not None:
-        return f">= {quality.mustBeGreaterThanOrEqualTo}"
-    if quality.mustBeLessThan is not None:
-        return f"< {quality.mustBeLessThan}"
-    if quality.mustBeLessThanOrEqualTo is not None:
-        return f"<= {quality.mustBeLessThanOrEqualTo}"
-    if quality.mustBeBetween is not None:
-        if len(quality.mustBeBetween) != 2:
-            logger.warning(
-                f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {quality.mustBeBetween}"
-            )
-            return None
-        return f"between {quality.mustBeBetween[0]} and {quality.mustBeBetween[1]}"
-    if quality.mustNotBeBetween is not None:
-        if len(quality.mustNotBeBetween) != 2:
-            logger.warning(
-                f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {quality.mustNotBeBetween}"
-            )
-            return None
-        return f"not between {quality.mustNotBeBetween[0]} and {quality.mustNotBeBetween[1]}"
-    return None
-
-
-# These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
-def add_quality_checks(sodacl, data_contract_spec):
-    if data_contract_spec.quality is None:
-        return
-    if data_contract_spec.quality.type is None:
-        return
-    if data_contract_spec.quality.type.lower() != "sodacl":
-        return
-    if isinstance(data_contract_spec.quality.specification, str):
-        quality_specification = yaml.safe_load(data_contract_spec.quality.specification)
-    else:
-        quality_specification = data_contract_spec.quality.specification
-    for key, checks in quality_specification.items():
-        if key in sodacl:
-            for check in checks:
-                sodacl[key].append(check)
-        else:
-            sodacl[key] = checks
+                sodacl_dict[key] = value
+    return yaml.dump(sodacl_dict)
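to_sodacl_yaml no longer walks the contract itself; it merges the SodaCL fragments that data_contract_checks attached to the run (checks with engine "soda" and language "sodacl"), extending lists and updating dicts when two fragments share a top-level key. A standalone sketch of that merge rule, using two hand-written fragments (the fragments are illustrative, not taken from the diff):

    import yaml

    # Two SodaCL fragments that target the same model key.
    snippet_a = yaml.safe_load("checks for orders:\n  - row_count > 0")
    snippet_b = yaml.safe_load(
        "checks for orders:\n"
        "  - missing_count(order_id) = 0:\n"
        "      name: order_id has no null values\n"
    )

    merged = dict(snippet_a)
    for key, value in snippet_b.items():
        if key in merged and isinstance(merged[key], list) and isinstance(value, list):
            merged[key].extend(value)  # same key, both lists: concatenate the checks
        else:
            merged[key] = value
    print(yaml.dump(merged))  # one "checks for orders" block containing both checks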
datacontract/export/sql_type_converter.py CHANGED
@@ -142,11 +142,16 @@ def convert_to_dataframe(field: Field) -> None | str:
     if type.lower() in ["boolean"]:
         return "BOOLEAN"
     if type.lower() in ["object", "record", "struct"]:
+        nested_fields = []
+        for nested_field_name, nested_field in field.fields.items():
+            nested_field_type = convert_to_dataframe(nested_field)
+            nested_fields.append(f"{nested_field_name}:{nested_field_type}")
+        return f"STRUCT<{','.join(nested_fields)}>"
     if type.lower() in ["bytes"]:
         return "BINARY"
     if type.lower() in ["array"]:
+        item_type = convert_to_dataframe(field.items)
+        return f"ARRAY<{item_type}>"
     return None
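With the added branches, convert_to_dataframe recurses into nested records and array items to build Spark SQL type strings. An illustrative call (the Field construction is assumed, and the mapping of "string" to STRING is inferred from context rather than shown in this hunk):

    from datacontract.export.sql_type_converter import convert_to_dataframe
    from datacontract.model.data_contract_specification import Field

    address = Field(
        type="struct",
        fields={"city": Field(type="string"), "zip": Field(type="string")},
    )
    # Expected to be something like: STRUCT<city:STRING,zip:STRING>
    print(convert_to_dataframe(address))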
datacontract/imports/odcs_importer.py CHANGED
@@ -41,9 +41,12 @@ def import_odcs(data_contract_specification: DataContractSpecification, source:
     )

     if odcs_api_version.startswith("v2."):
+        raise DataContractException(
+            type="schema",
+            name="Importing ODCS contract",
+            reason=f"Unsupported ODCS API version: {odcs_api_version}",
+            engine="datacontract",
+        )
     elif odcs_api_version.startswith("v3."):
         from datacontract.imports.odcs_v3_importer import import_odcs_v3
