datacontract-cli 0.10.15__py3-none-any.whl → 0.10.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/breaking/breaking.py +3 -3
- datacontract/breaking/breaking_rules.py +4 -0
- datacontract/cli.py +33 -9
- datacontract/data_contract.py +14 -10
- datacontract/engines/fastjsonschema/check_jsonschema.py +15 -4
- datacontract/engines/soda/check_soda_execute.py +13 -8
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/export/dbml_converter.py +2 -2
- datacontract/export/dbt_converter.py +75 -43
- datacontract/export/exporter.py +7 -2
- datacontract/export/exporter_factory.py +52 -14
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/markdown_converter.py +208 -0
- datacontract/export/odcs_v3_exporter.py +49 -29
- datacontract/export/sodacl_converter.py +4 -3
- datacontract/export/sql_converter.py +1 -1
- datacontract/export/sql_type_converter.py +21 -0
- datacontract/export/sqlalchemy_converter.py +3 -1
- datacontract/imports/dbml_importer.py +1 -1
- datacontract/imports/dbt_importer.py +163 -17
- datacontract/imports/iceberg_importer.py +12 -1
- datacontract/imports/odcs_v2_importer.py +1 -1
- datacontract/imports/odcs_v3_importer.py +6 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/integration/datamesh_manager.py +14 -3
- datacontract/lint/resolve.py +32 -15
- datacontract/model/data_contract_specification.py +14 -6
- datacontract/model/run.py +1 -0
- datacontract/templates/partials/model_field.html +1 -1
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/METADATA +117 -75
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/RECORD +35 -34
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/WHEEL +1 -1
- datacontract/integration/opentelemetry.py +0 -103
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.15.dist-info → datacontract_cli-0.10.18.dist-info}/top_level.txt +0 -0
datacontract/breaking/breaking.py CHANGED
@@ -1,6 +1,6 @@
 from datacontract.breaking.breaking_rules import BreakingRules
 from datacontract.model.breaking_change import BreakingChange, Location, Severity
-from datacontract.model.data_contract_specification import Contact, Field, Info, Model,
+from datacontract.model.data_contract_specification import Contact, DeprecatedQuality, Field, Info, Model, Terms
 
 
 def info_breaking_changes(
@@ -216,8 +216,8 @@ def terms_breaking_changes(
 
 
 def quality_breaking_changes(
-    old_quality:
-    new_quality:
+    old_quality: DeprecatedQuality,
+    new_quality: DeprecatedQuality,
     new_path: str,
     include_severities: [Severity],
 ) -> list[BreakingChange]:
datacontract/breaking/breaking_rules.py CHANGED
@@ -42,6 +42,10 @@ class BreakingRules:
     field_primary_removed = Severity.WARNING
     field_primary_updated = Severity.WARNING
 
+    field_primary_key_added = Severity.WARNING
+    field_primary_key_removed = Severity.WARNING
+    field_primary_key_updated = Severity.WARNING
+
     field_references_added = Severity.WARNING
     field_references_removed = Severity.WARNING
     field_references_updated = Severity.WARNING
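As a quick illustration (not part of the release itself), the rule names above map directly to severities on BreakingRules; a minimal lookup sketch, assuming plain attribute access as used by the breaking-change checks:

# Sketch only: resolve a breaking-change rule name to its severity.
# The attribute names come from the diff above; the helper itself is hypothetical.
from datacontract.breaking.breaking_rules import BreakingRules
from datacontract.model.breaking_change import Severity


def rule_severity(rule_name: str):
    # e.g. "field_primary_key_added" -> Severity.WARNING, None if the rule is unknown
    return getattr(BreakingRules, rule_name, None)


assert rule_severity("field_primary_key_added") == Severity.WARNING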
datacontract/cli.py CHANGED
@@ -132,13 +132,11 @@ def test(
         typer.Option(help="Run the schema and quality tests on the example data within the data contract."),
     ] = None,
     publish: Annotated[str, typer.Option(help="The url to publish the results after the test")] = None,
-    publish_to_opentelemetry: Annotated[
-        bool,
-        typer.Option(
-            help="Publish the results to opentelemetry. Use environment variables to configure the OTLP endpoint, headers, etc."
-        ),
-    ] = False,
     logs: Annotated[bool, typer.Option(help="Print logs")] = False,
+    ssl_verification: Annotated[
+        bool,
+        typer.Option(help="SSL verification when publishing the test results."),
+    ] = True,
 ):
     """
     Run schema and quality tests on configured servers.
@@ -150,7 +148,6 @@ def test(
         data_contract_file=location,
         schema_location=schema,
         publish_url=publish,
-        publish_to_opentelemetry=publish_to_opentelemetry,
         server=server,
         examples=examples,
     ).test()
@@ -221,7 +218,7 @@ def export(
     )
     # Don't interpret console markup in output.
     if output is None:
-        console.print(result, markup=False)
+        console.print(result, markup=False, soft_wrap=True)
     else:
         with output.open("w") as f:
             f.write(result)
@@ -280,6 +277,14 @@ def import_(
         Optional[str],
         typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
     ] = None,
+    template: Annotated[
+        Optional[str],
+        typer.Option(help="The location (url or path) of the Data Contract Specification Template"),
+    ] = None,
+    schema: Annotated[
+        str,
+        typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
+    ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
 ):
     """
     Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout.
@@ -287,6 +292,8 @@ def import_(
     result = DataContract().import_from_source(
         format=format,
         source=source,
+        template=template,
+        schema=schema,
         glue_table=glue_table,
         bigquery_table=bigquery_table,
         bigquery_project=bigquery_project,
@@ -298,7 +305,7 @@ def import_(
         iceberg_table=iceberg_table,
     )
     if output is None:
-        console.print(result.to_yaml())
+        console.print(result.to_yaml(), markup=False, soft_wrap=True)
     else:
         with output.open("w") as f:
             f.write(result.to_yaml())
@@ -315,6 +322,10 @@ def publish(
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
     ] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
+    ssl_verification: Annotated[
+        bool,
+        typer.Option(help="SSL verification when publishing the data contract."),
+    ] = True,
 ):
     """
     Publish the data contract to the Data Mesh Manager.
@@ -323,6 +334,7 @@ def publish(
         data_contract_specification=DataContract(
             data_contract_file=location, schema_location=schema
         ).get_data_contract_specification(),
+        ssl_verification=ssl_verification,
     )
 
 
@@ -447,6 +459,18 @@ def _handle_result(run):
         console.print(
             f"🟢 data contract is valid. Run {len(run.checks)} checks. Took {(run.timestampEnd - run.timestampStart).total_seconds()} seconds."
         )
+    elif run.result == "warning":
+        console.print("🟠 data contract has warnings. Found the following warnings:")
+        i = 1
+        for check in run.checks:
+            if check.result != "passed":
+                field = to_field(run, check)
+                if field:
+                    field = field + " "
+                else:
+                    field = ""
+                console.print(f"{i}) {field}{check.name}: {check.reason}")
+                i += 1
     else:
         console.print("🔴 data contract is invalid, found the following errors:")
         i = 1
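A hedged usage sketch (not taken from the release): Typer normally exposes a bool option such as `ssl_verification` defaulting to True as `--ssl-verification/--no-ssl-verification`. The snippet assumes that `datacontract.cli` exposes its Typer application as `app` and that a contract file exists at the given path.

# Sketch, assuming datacontract.cli exposes the Typer application as `app`.
from typer.testing import CliRunner

from datacontract.cli import app

runner = CliRunner()
# Hypothetical invocation: run tests and skip SSL verification when publishing results.
result = runner.invoke(app, ["test", "datacontract.yaml", "--no-ssl-verification"])
print(result.exit_code)
print(result.output)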
datacontract/data_contract.py CHANGED
@@ -23,7 +23,6 @@ from datacontract.export.exporter import ExportFormat
 from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.importer_factory import importer_factory
 from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
-from datacontract.integration.opentelemetry import publish_test_results_to_opentelemetry
 from datacontract.lint import resolve
 from datacontract.lint.linters.description_linter import DescriptionLinter
 from datacontract.lint.linters.example_model_linter import ExampleModelLinter
@@ -37,6 +36,8 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Check, Run
 
+DEFAULT_DATA_CONTRACT_TEMPLATE_URL = "https://datacontract.com/datacontract.init.yaml"
+
 
 class DataContract:
     def __init__(
@@ -48,7 +49,6 @@ class DataContract:
         server: str = None,
         examples: bool = False,
         publish_url: str = None,
-        publish_to_opentelemetry: bool = False,
         spark: "SparkSession" = None,
         inline_definitions: bool = True,
         inline_quality: bool = True,
@@ -60,7 +60,6 @@ class DataContract:
         self._server = server
         self._examples = examples
         self._publish_url = publish_url
-        self._publish_to_opentelemetry = publish_to_opentelemetry
         self._spark = spark
         self._inline_definitions = inline_definitions
         self._inline_quality = inline_quality
@@ -75,8 +74,10 @@ class DataContract:
         }
 
     @classmethod
-    def init(
-
+    def init(
+        cls, template: str = DEFAULT_DATA_CONTRACT_TEMPLATE_URL, schema: typing.Optional[str] = None
+    ) -> DataContractSpecification:
+        return resolve.resolve_data_contract(data_contract_location=template, schema_location=schema)
 
     def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run:
         """Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters.
@@ -232,9 +233,6 @@ class DataContract:
         if self._publish_url is not None:
             publish_test_results_to_datamesh_manager(run, self._publish_url)
 
-        if self._publish_to_opentelemetry:
-            publish_test_results_to_opentelemetry(run)
-
         return run
 
     def _get_examples_server(self, data_contract, run, tmp_dir):
@@ -347,9 +345,15 @@ class DataContract:
         )
 
     def import_from_source(
-        self,
+        self,
+        format: str,
+        source: typing.Optional[str] = None,
+        template: typing.Optional[str] = None,
+        schema: typing.Optional[str] = None,
+        **kwargs,
     ) -> DataContractSpecification:
-
+        template = DEFAULT_DATA_CONTRACT_TEMPLATE_URL if template is None else template
+        data_contract_specification_initial = DataContract.init(template=template, schema=schema)
 
         return importer_factory.create(format).import_source(
             data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs
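A short sketch of the new init/import flow described above; the template URL is the default from the diff, while the SQL file path is a hypothetical example.

# Sketch: initialize a specification from the default template, then import a source.
from datacontract.data_contract import DataContract

# DataContract.init() now resolves the template (datacontract.init.yaml by default) itself.
spec = DataContract.init(template="https://datacontract.com/datacontract.init.yaml")

# import_from_source() seeds the result with that initial specification.
imported = DataContract().import_from_source(
    format="sql",
    source="examples/orders.sql",  # hypothetical path
)
print(imported.to_yaml())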
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -11,7 +11,7 @@ from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
 from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
-from datacontract.model.run import Check, Run
+from datacontract.model.run import Check, ResultEnum, Run
 
 # Thread-safe cache for primaryKey fields.
 _primary_key_cache = {}
@@ -256,18 +256,29 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.info,
                 reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
                 engine="jsonschema",
             )
         )
+    elif server.type == "azure":
+        run.checks.append(
+            Check(
+                type="schema",
+                name="Check that JSON has valid schema",
+                model=model_name,
+                result=ResultEnum.info,
+                reason="JSON Schema check skipped for azure, as azure is currently not supported",
+                engine="jsonschema",
+            )
+        )
     else:
         run.checks.append(
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not supported",
                 engine="jsonschema",
             )
@@ -279,7 +290,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.passed,
                 reason="All JSON entries are valid.",
                 engine="jsonschema",
             )
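For context, a minimal sketch of the Check entry the new azure branch appends, assuming ResultEnum exposes the info, warning, and passed members used above; the model name is only illustrative.

# Sketch only: a skipped-check entry as appended for unsupported server types.
from datacontract.model.run import Check, ResultEnum

skipped = Check(
    type="schema",
    name="Check that JSON has valid schema",
    model="orders",  # hypothetical model name
    result=ResultEnum.info,
    reason="JSON Schema check skipped for azure, as azure is currently not supported",
    engine="jsonschema",
)
print(skipped.result)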
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -1,7 +1,5 @@
 import logging
 
-from soda.scan import Scan
-
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
@@ -12,10 +10,15 @@ from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_co
 from datacontract.engines.soda.connections.trino import to_trino_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.model.data_contract_specification import DataContractSpecification, Server
-from datacontract.model.run import Check, Log, Run
+from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
 def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
+    from soda.common.config_helper import ConfigHelper
+
+    ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
+    from soda.scan import Scan
+
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -25,6 +28,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
 
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
+            run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
             con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
@@ -33,7 +37,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
                 Check(
                     type="general",
                     name="Check that format is supported",
-                    result=
+                    result=ResultEnum.warning,
                     reason=f"Format {server.format} not yet supported by datacontract CLI",
                     engine="datacontract",
                 )
@@ -54,11 +58,12 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         scan.set_data_source_name(server.type)
     elif server.type == "databricks":
         if spark is not None:
-
+            run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
             spark.sql(f"USE {server.catalog}.{server.schema_}")
         else:
+            run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
             scan.add_configuration_yaml_str(soda_configuration_str)
             scan.set_data_source_name(server.type)
@@ -93,7 +98,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             Check(
                 type="general",
                 name="Check that server type is supported",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not yet supported by datacontract CLI",
                 engine="datacontract-cli",
             )
@@ -182,5 +187,5 @@ def update_reason(check, c):
                 check.reason = diagnostics_text_split[1].strip()
                 # print(check.reason)
                 break  # Exit the loop once the desired block is found
-    if c["diagnostics"]
-        check.reason = f"
+    if "fail" in c["diagnostics"]:
+        check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}"
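The deferred Soda imports above also turn off anonymous usage statistics before a Scan is created; a standalone sketch of the same calls (the soda-core API exactly as used in the diff), with an illustrative data source name.

# Sketch: disable Soda's anonymous usage statistics, then create a Scan,
# mirroring the deferred imports introduced in check_soda_execute().
from soda.common.config_helper import ConfigHelper

ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)

from soda.scan import Scan

scan = Scan()
scan.set_data_source_name("duckdb")  # hypothetical data source name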
datacontract/engines/soda/connections/databricks.py CHANGED
@@ -4,15 +4,24 @@ import yaml
 
 
 def to_databricks_soda_configuration(server):
+    token = os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+    if token is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_TOKEN environment variable is not set")
+    http_path = os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH")
+    host = server.host
+    if host is None:
+        host = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME")
+    if host is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set")
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "spark",
             "method": "databricks",
-            "host":
+            "host": host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path":
-            "token":
+            "http_path": http_path,
+            "token": token,
         }
     }
 
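A hedged sketch of how the new validation is exercised: the three environment variable names come from the diff, while the stand-in server object and all values are purely illustrative.

# Illustrative only: environment variables expected by to_databricks_soda_configuration().
import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration

os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "dapi-example"  # hypothetical value
os.environ["DATACONTRACT_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/abc123"  # hypothetical value
os.environ["DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"] = "adb-123456.azuredatabricks.net"  # hypothetical value

# server.host is None here, so the hostname falls back to the environment variable.
server = SimpleNamespace(type="databricks", host=None, catalog="main", schema_="demo")
print(to_databricks_soda_configuration(server))  # YAML string for the Soda data source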
datacontract/export/dbml_converter.py CHANGED
@@ -90,7 +90,7 @@ Note: {1}
 
 
 def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]:
-    if field.primary:
+    if field.primaryKey or field.primary:
         if field.required is not None:
             if not field.required:
                 raise DataContractException(
@@ -115,7 +115,7 @@ def generate_field(field_name: str, field: spec.Field, model_name: str, server:
         field.unique = True
 
     field_attrs = []
-    if field.primary:
+    if field.primaryKey or field.primary:
         field_attrs.append("pk")
 
     if field.unique:
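A small sketch of the compatibility change: a field using the newer primaryKey property is now treated like the legacy primary flag when rendering DBML. The field definitions below are illustrative and assume the pydantic Field model accepts these keyword arguments.

# Sketch: both spellings of the primary-key flag now yield the "pk" attribute in DBML.
from datacontract.model.data_contract_specification import Field

legacy = Field(type="string", required=True, primary=True)
current = Field(type="string", required=True, primaryKey=True)
# generate_field() treats both fields as primary keys (per the diff above).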
datacontract/export/dbt_converter.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import yaml
 
@@ -39,27 +39,20 @@ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
 
 
 def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
-    if data_contract_spec.models is None or len(data_contract_spec.models.items()) != 1:
-        print(
-            "Export to dbt-staging-sql currently only works with exactly one model in the data contract."
-            "Please specify the model name."
-        )
-        return ""
-
     id = data_contract_spec.id
     columns = []
     for field_name, field in model_value.fields.items():
         # TODO escape SQL reserved key words, probably dependent on server type
         columns.append(field_name)
     return f"""
-    select
+    select
     {", ".join(columns)}
 from {{{{ source('{id}', '{model_name}') }}}}
 """
 
 
 def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
-    source = {"name": data_contract_spec.id
+    source = {"name": data_contract_spec.id}
     dbt = {
         "version": 2,
         "sources": [source],
@@ -69,24 +62,33 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
     if data_contract_spec.info.description is not None:
         source["description"] = data_contract_spec.info.description
     found_server = data_contract_spec.servers.get(server)
+    adapter_type = None
     if found_server is not None:
-
-
+        adapter_type = found_server.type
+        if adapter_type == "bigquery":
+            source["database"] = found_server.project
+            source["schema"] = found_server.dataset
+        else:
+            source["database"] = found_server.database
+            source["schema"] = found_server.schema_
 
+    source["tables"] = []
     for model_key, model_value in data_contract_spec.models.items():
-        dbt_model = _to_dbt_source_table(model_key, model_value)
+        dbt_model = _to_dbt_source_table(data_contract_spec, model_key, model_value, adapter_type)
         source["tables"].append(dbt_model)
     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
-def _to_dbt_source_table(
+def _to_dbt_source_table(
+    data_contract_spec: DataContractSpecification, model_key, model_value: Model, adapter_type: Optional[str]
+) -> dict:
     dbt_model = {
         "name": model_key,
     }
 
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, False,
+    columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model
@@ -107,7 +109,7 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, _supports_constraints(model_type),
+    columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), None)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model
@@ -130,48 +132,65 @@ def _supports_constraints(model_type):
     return model_type == "table" or model_type == "incremental"
 
 
-def _to_columns(
+def _to_columns(
+    data_contract_spec: DataContractSpecification,
+    fields: Dict[str, Field],
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+) -> list:
     columns = []
     for field_name, field in fields.items():
-        column = _to_column(field, supports_constraints,
-        column["name"] = field_name
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type)
         columns.append(column)
     return columns
 
 
-def
-
-
+def get_table_name_and_column_name(references: str) -> tuple[Optional[str], str]:
+    parts = references.split(".")
+    if len(parts) < 2:
+        return None, parts[0]
+    return parts[-2], parts[-1]
+
+
+def _to_column(
+    data_contract_spec: DataContractSpecification,
+    field_name: str,
+    field: Field,
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+) -> dict:
+    column = {"name": field_name}
+    adapter_type = adapter_type or "snowflake"
+    dbt_type = convert_to_sql_type(field, adapter_type)
+
+    column["data_tests"] = []
     if dbt_type is not None:
-
-
-
-
-
-        )
+        column["data_type"] = dbt_type
+    else:
+        column["data_tests"].append(
+            {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
+        )
     if field.description is not None:
         column["description"] = field.description
     if field.required:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
-            column
+            column["data_tests"].append("not_null")
     if field.unique:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
-            column
+            column["data_tests"].append("unique")
    if field.enum is not None and len(field.enum) > 0:
-        column
+        column["data_tests"].append({"accepted_values": {"values": field.enum}})
     if field.minLength is not None or field.maxLength is not None:
         length_test = {}
         if field.minLength is not None:
             length_test["min_value"] = field.minLength
         if field.maxLength is not None:
             length_test["max_value"] = field.maxLength
-        column.
-            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test})
     if field.pii is not None:
         column.setdefault("meta", {})["pii"] = field.pii
     if field.classification is not None:
@@ -180,9 +199,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         column.setdefault("tags", []).extend(field.tags)
     if field.pattern is not None:
         # Beware, the data contract pattern is a regex, not a like pattern
-        column.
-            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}})
     if (
         field.minimum is not None
         or field.maximum is not None
@@ -194,7 +211,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         range_test["min_value"] = field.minimum
         if field.maximum is not None:
             range_test["max_value"] = field.maximum
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     elif (
         field.exclusiveMinimum is not None
         or field.exclusiveMaximum is not None
@@ -207,18 +224,18 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         if field.exclusiveMaximum is not None:
             range_test["max_value"] = field.exclusiveMaximum
         range_test["strictly"] = True
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     else:
         if field.minimum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
             )
         if field.maximum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
             )
         if field.exclusiveMinimum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "min_value": field.exclusiveMinimum,
@@ -227,7 +244,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                 }
             )
         if field.exclusiveMaximum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "max_value": field.exclusiveMaximum,
@@ -235,6 +252,21 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                 }
             }
         )
+    if field.references is not None:
+        ref_source_name = data_contract_spec.id
+        table_name, column_name = get_table_name_and_column_name(field.references)
+        if table_name is not None and column_name is not None:
+            column["data_tests"].append(
+                {
+                    "relationships": {
+                        "to": f"""source("{ref_source_name}", "{table_name}")""",
+                        "field": f"{column_name}",
+                    }
+                }
+            )
+
+    if not column["data_tests"]:
+        column.pop("data_tests")
 
     # TODO: all constraints
     return column
datacontract/export/exporter.py CHANGED
@@ -2,7 +2,10 @@ import typing
 from abc import ABC, abstractmethod
 from enum import Enum
 
-from datacontract.model.data_contract_specification import
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+)
 
 
 class Exporter(ABC):
@@ -40,6 +43,8 @@ class ExportFormat(str, Enum):
     sqlalchemy = "sqlalchemy"
     data_caterer = "data-caterer"
     dcs = "dcs"
+    markdown = "markdown"
+    iceberg = "iceberg"
 
     @classmethod
     def get_supported_formats(cls):
@@ -48,7 +53,7 @@ class ExportFormat(str, Enum):
 
 def _check_models_for_export(
     data_contract: DataContractSpecification, model: str, export_format: str
-) -> typing.Tuple[str,
+) -> typing.Tuple[str, Model]:
     if data_contract.models is None:
         raise RuntimeError(f"Export to {export_format} requires models in the data contract.")
 
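A closing sketch showing the two new export formats registered above, addressed through the public ExportFormat enum; the printed list is assumed to mirror the enum members.

# Sketch: the markdown and iceberg exporters are now addressable through ExportFormat.
from datacontract.export.exporter import ExportFormat

assert ExportFormat.markdown.value == "markdown"
assert ExportFormat.iceberg.value == "iceberg"
print(ExportFormat.get_supported_formats())  # expected to include "markdown" and "iceberg"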