datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/export/dbt_converter.py

@@ -1,42 +1,59 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import yaml
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import
-    DataContractSpecification, Model, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 
 
-
+class DbtExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_dbt_models_yaml(data_contract, server)
+
+
+class DbtSourceExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_dbt_sources_yaml(data_contract, server)
+
+
+class DbtStageExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_dbt_staging_sql(
+            data_contract,
+            model_name,
+            model_value,
+        )
+
+
+def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None) -> str:
     dbt = {
         "version": 2,
         "models": [],
     }
+
     for model_key, model_value in data_contract_spec.models.items():
-        dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec)
+        dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec, adapter_type=server)
         dbt["models"].append(dbt_model)
-    return yaml.
+    return yaml.safe_dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
 def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
-    if data_contract_spec.models is None or len(data_contract_spec.models.items()) != 1:
-        print("Export to dbt-staging-sql currently only works with exactly one model in the data contract.")
-        return ""
-
     id = data_contract_spec.id
     columns = []
     for field_name, field in model_value.fields.items():
         # TODO escape SQL reserved key words, probably dependent on server type
         columns.append(field_name)
     return f"""
-    select
+    select 
    {", ".join(columns)}
    from {{{{ source('{id}', '{model_name}') }}}}
    """
 
 
 def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
-    source = {"name": data_contract_spec.id
+    source = {"name": data_contract_spec.id}
     dbt = {
         "version": 2,
         "sources": [source],
@@ -44,38 +61,52 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
     if data_contract_spec.info.owner is not None:
         source["meta"] = {"owner": data_contract_spec.info.owner}
     if data_contract_spec.info.description is not None:
-        source["description"] = data_contract_spec.info.description
+        source["description"] = data_contract_spec.info.description.strip().replace("\n", " ")
     found_server = data_contract_spec.servers.get(server)
+    adapter_type = None
     if found_server is not None:
-
-
+        adapter_type = found_server.type
+        if adapter_type == "bigquery":
+            source["database"] = found_server.project
+            source["schema"] = found_server.dataset
+        else:
+            source["database"] = found_server.database
+            source["schema"] = found_server.schema_
 
+    source["tables"] = []
     for model_key, model_value in data_contract_spec.models.items():
-        dbt_model = _to_dbt_source_table(model_key, model_value)
+        dbt_model = _to_dbt_source_table(data_contract_spec, model_key, model_value, adapter_type)
         source["tables"].append(dbt_model)
     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
-def _to_dbt_source_table(
+def _to_dbt_source_table(
+    data_contract_spec: DataContractSpecification, model_key, model_value: Model, adapter_type: Optional[str]
+) -> dict:
     dbt_model = {
         "name": model_key,
     }
 
     if model_value.description is not None:
-        dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, False,
+        dbt_model["description"] = model_value.description.strip().replace("\n", " ")
+    columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model
 
 
-def _to_dbt_model(
+def _to_dbt_model(
+    model_key, model_value: Model, data_contract_spec: DataContractSpecification, adapter_type: Optional[str]
+) -> dict:
     dbt_model = {
         "name": model_key,
     }
     model_type = _to_dbt_model_type(model_value.type)
+
     dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}}
-
+
+    if model_type:
+        dbt_model["config"]["materialized"] = model_type
 
     if data_contract_spec.info.owner is not None:
         dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner
@@ -83,10 +114,29 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
     if _supports_constraints(model_type):
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
-        dbt_model["description"] = model_value.description
-
+        dbt_model["description"] = model_value.description.strip().replace("\n", " ")
+
+    # Handle model-level primaryKey (before columns for better YAML ordering)
+    primary_key_columns = []
+    if hasattr(model_value, "primaryKey") and model_value.primaryKey:
+        if isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) > 1:
+            # Multiple columns: use dbt_utils.unique_combination_of_columns
+            dbt_model["data_tests"] = [
+                {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": model_value.primaryKey}}
+            ]
+        elif isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) == 1:
+            # Single column: handle at column level (pass to _to_columns)
+            primary_key_columns = model_value.primaryKey
+        elif isinstance(model_value.primaryKey, str):
+            # Single column as string: handle at column level
+            primary_key_columns = [model_value.primaryKey]
+
+    columns = _to_columns(
+        data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type, primary_key_columns
+    )
     if columns:
         dbt_model["columns"] = columns
+
     return dbt_model
 
 
@@ -95,7 +145,7 @@ def _to_dbt_model_type(model_type):
     # Allowed values: table, view, incremental, ephemeral, materialized view
     # Custom values also possible
     if model_type is None:
-        return
+        return None
     if model_type.lower() == "table":
         return "table"
     if model_type.lower() == "view":
@@ -107,48 +157,72 @@ def _supports_constraints(model_type):
     return model_type == "table" or model_type == "incremental"
 
 
-def _to_columns(
+def _to_columns(
+    data_contract_spec: DataContractSpecification,
+    fields: Dict[str, Field],
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+    primary_key_columns: Optional[list] = None,
+) -> list:
     columns = []
+    primary_key_columns = primary_key_columns or []
     for field_name, field in fields.items():
-
-        column
+        is_primary_key = field_name in primary_key_columns
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type, is_primary_key)
         columns.append(column)
     return columns
 
 
-def
-
-
+def get_table_name_and_column_name(references: str) -> tuple[Optional[str], str]:
+    parts = references.split(".")
+    if len(parts) < 2:
+        return None, parts[0]
+    return parts[-2], parts[-1]
+
+
+def _to_column(
+    data_contract_spec: DataContractSpecification,
+    field_name: str,
+    field: Field,
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+    is_primary_key: bool = False,
+) -> dict:
+    column = {"name": field_name}
+    adapter_type = adapter_type or "snowflake"
+    dbt_type = convert_to_sql_type(field, adapter_type)
+
+    column["data_tests"] = []
     if dbt_type is not None:
-
-
-
-
-
-        )
+        column["data_type"] = dbt_type
+    else:
+        column["data_tests"].append(
+            {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
+        )
     if field.description is not None:
-        column["description"] = field.description
-
+        column["description"] = field.description.strip().replace("\n", " ")
+    # Handle required/not_null constraint
+    if field.required or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
-            column
-
+            column["data_tests"].append("not_null")
+
+    # Handle unique constraint
+    if field.unique or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
-            column
+            column["data_tests"].append("unique")
     if field.enum is not None and len(field.enum) > 0:
-        column
+        column["data_tests"].append({"accepted_values": {"values": field.enum}})
     if field.minLength is not None or field.maxLength is not None:
         length_test = {}
         if field.minLength is not None:
            length_test["min_value"] = field.minLength
        if field.maxLength is not None:
            length_test["max_value"] = field.maxLength
-        column.
-            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test})
     if field.pii is not None:
         column.setdefault("meta", {})["pii"] = field.pii
     if field.classification is not None:
@@ -157,9 +231,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         column.setdefault("tags", []).extend(field.tags)
     if field.pattern is not None:
         # Beware, the data contract pattern is a regex, not a like pattern
-        column.
-            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}})
     if (
         field.minimum is not None
         or field.maximum is not None
@@ -171,7 +243,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
             range_test["min_value"] = field.minimum
         if field.maximum is not None:
             range_test["max_value"] = field.maximum
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     elif (
         field.exclusiveMinimum is not None
         or field.exclusiveMaximum is not None
@@ -184,18 +256,18 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         if field.exclusiveMaximum is not None:
             range_test["max_value"] = field.exclusiveMaximum
         range_test["strictly"] = True
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     else:
         if field.minimum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
             )
         if field.maximum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
             )
         if field.exclusiveMinimum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "min_value": field.exclusiveMinimum,
@@ -204,7 +276,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                 }
             )
         if field.exclusiveMaximum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "max_value": field.exclusiveMaximum,
@@ -212,6 +284,21 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                     }
                 }
             )
+    if field.references is not None:
+        ref_source_name = data_contract_spec.id
+        table_name, column_name = get_table_name_and_column_name(field.references)
+        if table_name is not None and column_name is not None:
+            column["data_tests"].append(
+                {
+                    "relationships": {
+                        "to": f"""source("{ref_source_name}", "{table_name}")""",
+                        "field": f"{column_name}",
+                    }
+                }
+            )
+
+    if not column["data_tests"]:
+        column.pop("data_tests")
 
     # TODO: all constraints
     return column
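For orientation, a minimal usage sketch of the reworked dbt export (not part of the diff): the attribute names come from the converter code above, but it assumes the DataContractSpecification, Info, Model, and Field pydantic models accept the keyword arguments shown.

from datacontract.export.dbt_converter import to_dbt_models_yaml, to_dbt_sources_yaml
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Hypothetical contract built in code; field/model attribute names follow the converter above.
spec = DataContractSpecification(
    id="orders",
    info=Info(title="Orders", version="1.0.0", owner="checkout-team"),
    models={
        "orders": Model(
            type="table",
            description="All orders",
            fields={
                "order_id": Field(type="string", required=True, unique=True),
                "amount": Field(type="decimal", minimum=0),
            },
        )
    },
)

# With server=None, _to_column falls back to the "snowflake" adapter for data_type mapping.
print(to_dbt_models_yaml(spec))
print(to_dbt_sources_yaml(spec))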
datacontract/export/dqx_converter.py (new file)

@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Union
+
+import yaml
+
+from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Quality
+
+
+class DqxKeys:
+    CHECK = "check"
+    ARGUMENTS = "arguments"
+    SPECIFICATION = "specification"
+    COL_NAME = "column"
+    COL_NAMES = "for_each_column"
+    COLUMNS = "columns"
+    FUNCTION = "function"
+
+
+class DqxExporter(Exporter):
+    """Exporter implementation for converting data contracts to DQX YAML file."""
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model: Model,
+        server: str,
+        sql_server_type: str,
+        export_args: Dict[str, Any],
+    ) -> str:
+        """Exports a data contract to DQX format."""
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_dqx_yaml(model_value)
+
+
+def to_dqx_yaml(model_value: Model) -> str:
+    """
+    Converts the data contract's quality checks to DQX YAML format.
+
+    Args:
+        model_value (Model): The data contract to convert.
+
+    Returns:
+        str: YAML representation of the data contract's quality checks.
+    """
+    extracted_rules = extract_quality_rules(model_value)
+    return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False)
+
+
+def process_quality_rule(rule: Quality, column_name: str) -> Dict[str, Any]:
+    """
+    Processes a single quality rule by injecting the column path into its arguments if absent.
+
+    Args:
+        rule (Quality): The quality rule to process.
+        column_name (str): The full path to the current column.
+
+    Returns:
+        dict: The processed quality rule specification.
+    """
+    rule_data = rule.model_extra
+    specification = rule_data[DqxKeys.SPECIFICATION]
+    check = specification[DqxKeys.CHECK]
+
+    if column_name:
+        arguments = check.setdefault(DqxKeys.ARGUMENTS, {})
+
+        if (
+            DqxKeys.COL_NAME not in arguments
+            and DqxKeys.COL_NAMES not in arguments
+            and DqxKeys.COLUMNS not in arguments
+        ):
+            if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"):
+                arguments[DqxKeys.COL_NAME] = column_name
+            else:
+                arguments[DqxKeys.COLUMNS] = [column_name]
+
+    return specification
+
+
+def extract_quality_rules(data: Union[Model, Field, Quality], column_path: str = "") -> List[Dict[str, Any]]:
+    """
+    Recursively extracts all quality rules from a data contract structure.
+
+    Args:
+        data (Union[Model, Field, Quality]): The data contract model, field, or quality rule.
+        column_path (str, optional): The current path in the schema hierarchy. Defaults to "".
+
+    Returns:
+        List[Dict[str, Any]]: A list of quality rule specifications.
+    """
+    quality_rules = []
+
+    if isinstance(data, Quality):
+        return [process_quality_rule(data, column_path)]
+
+    if isinstance(data, (Model, Field)):
+        for key, field in data.fields.items():
+            current_path = build_column_path(column_path, key)
+
+            if field.fields:
+                # Field is a struct-like object, recurse deeper
+                quality_rules.extend(extract_quality_rules(field, current_path))
+            else:
+                # Process quality rules at leaf fields
+                for rule in field.quality:
+                    quality_rules.append(process_quality_rule(rule, current_path))
+
+        # Process any quality rules attached directly to this level
+        for rule in data.quality:
+            quality_rules.append(process_quality_rule(rule, column_path))
+
+    return quality_rules
+
+
+def build_column_path(current_path: str, key: str) -> str:
+    """
+    Builds the full column path by concatenating parent path with current key.
+
+    Args:
+        current_path (str): The current path prefix.
+        key (str): The current field's key.
+
+    Returns:
+        str: The full path.
+    """
+    return f"{current_path}.{key}" if current_path else key
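The column-injection rule in process_quality_rule is the heart of this exporter: a check without an explicit column argument gets the field path injected, under "columns" for is_unique/foreign_key and under "column" otherwise. A standalone sketch of that rule on plain dicts (illustration only, not package code; the "criticality" key is just an example DQX attribute):

# Standalone illustration of the injection logic above, using plain dicts instead of Quality objects.
def inject_column(specification: dict, column_path: str) -> dict:
    check = specification["check"]
    arguments = check.setdefault("arguments", {})
    # Only inject when no explicit column/for_each_column/columns argument is present.
    if not {"column", "for_each_column", "columns"} & arguments.keys():
        if check["function"] in ("is_unique", "foreign_key"):
            arguments["columns"] = [column_path]   # set-style checks take a column list
        else:
            arguments["column"] = column_path      # scalar checks take a single column path
    return specification


print(inject_column({"criticality": "error", "check": {"function": "is_not_null"}}, "address.zip"))
# {'criticality': 'error', 'check': {'function': 'is_not_null', 'arguments': {'column': 'address.zip'}}}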
datacontract/export/duckdb_type_converter.py (new file)

@@ -0,0 +1,57 @@
+from typing import Dict
+
+from datacontract.model.data_contract_specification import Field
+
+
+# https://duckdb.org/docs/data/csv/overview.html
+# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+def convert_to_duckdb_csv_type(field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["string", "varchar", "text"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["timestamp", "timestamp_tz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["timestamp_ntz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["date"]:
+        return "DATE"
+    if datacontract_type.lower() in ["time"]:
+        return "TIME"
+    if datacontract_type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "VARCHAR"
+    if datacontract_type.lower() in ["float", "double"]:
+        return "DOUBLE"
+    if datacontract_type.lower() in ["integer", "int", "long", "bigint"]:
+        return "BIGINT"
+    if datacontract_type.lower() in ["boolean"]:
+        return "BOOLEAN"
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["bytes"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["null"]:
+        return "SQLNULL"
+    return "VARCHAR"
+
+
+def convert_to_duckdb_json_type(field: Field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return convert_to_duckdb_json_type(field.items) + "[]"  # type: ignore
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        return convert_to_duckdb_object(field.fields)
+    return convert_to_duckdb_csv_type(field)
+
+
+def convert_to_duckdb_object(fields: Dict[str, Field]):
+    columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()]
+    return f"STRUCT({', '.join(columns)})"