datacontract-cli 0.10.16__py3-none-any.whl → 0.10.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/breaking/breaking_rules.py +4 -0
- datacontract/cli.py +49 -32
- datacontract/data_contract.py +14 -11
- datacontract/engines/fastjsonschema/check_jsonschema.py +15 -4
- datacontract/engines/soda/check_soda_execute.py +9 -4
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb.py +22 -9
- datacontract/export/data_caterer_converter.py +20 -7
- datacontract/export/dbml_converter.py +2 -2
- datacontract/export/dbt_converter.py +41 -16
- datacontract/export/exporter.py +6 -2
- datacontract/export/exporter_factory.py +48 -14
- datacontract/export/iceberg_converter.py +3 -3
- datacontract/export/markdown_converter.py +208 -0
- datacontract/export/odcs_v3_exporter.py +6 -0
- datacontract/export/sodacl_converter.py +22 -5
- datacontract/export/sql_converter.py +1 -1
- datacontract/export/sql_type_converter.py +28 -2
- datacontract/export/sqlalchemy_converter.py +3 -1
- datacontract/imports/csv_importer.py +89 -0
- datacontract/imports/dbml_importer.py +1 -1
- datacontract/imports/dbt_importer.py +94 -12
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_v2_importer.py +1 -1
- datacontract/imports/odcs_v3_importer.py +1 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +15 -9
- datacontract/lint/linters/field_reference_linter.py +10 -1
- datacontract/lint/resolve.py +48 -14
- datacontract/lint/schema.py +10 -3
- datacontract/model/data_contract_specification.py +13 -4
- datacontract/model/run.py +1 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/templates/datacontract.html +20 -1
- datacontract/templates/partials/definition.html +15 -5
- datacontract/templates/partials/model_field.html +10 -1
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/METADATA +477 -343
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/RECORD +46 -42
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/WHEEL +1 -1
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/opentelemetry.py +0 -103
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/top_level.txt +0 -0
datacontract/breaking/breaking_rules.py
CHANGED

@@ -42,6 +42,10 @@ class BreakingRules:
     field_primary_removed = Severity.WARNING
     field_primary_updated = Severity.WARNING
 
+    field_primary_key_added = Severity.WARNING
+    field_primary_key_removed = Severity.WARNING
+    field_primary_key_updated = Severity.WARNING
+
     field_references_added = Severity.WARNING
     field_references_removed = Severity.WARNING
     field_references_updated = Severity.WARNING
datacontract/cli.py
CHANGED

@@ -1,3 +1,4 @@
+import os
 from importlib import metadata
 from pathlib import Path
 from typing import Iterable, List, Optional

@@ -15,15 +16,11 @@ from datacontract import web
 from datacontract.catalog.catalog import create_data_contract_html, create_index_html
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.imports.importer import ImportFormat
-from datacontract.init.
-    FileExistsException,
-    download_datacontract_file,
-)
+from datacontract.init.init_template import get_init_template
 from datacontract.integration.datamesh_manager import (
     publish_data_contract_to_datamesh_manager,
 )
-
-DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"
+from datacontract.lint.resolve import resolve_data_contract_dict
 
 console = Console()
 

@@ -70,24 +67,21 @@ def common(
 @app.command()
 def init(
     location: Annotated[
-        str,
-        typer.Argument(help="The location (url or path) of the data contract yaml to create."),
+        str, typer.Argument(help="The location of the data contract file to create.")
     ] = "datacontract.yaml",
-    template: Annotated[
-        str, typer.Option(help="URL of a template or data contract")
-    ] = "https://datacontract.com/datacontract.init.yaml",
+    template: Annotated[str, typer.Option(help="URL of a template or data contract")] = None,
     overwrite: Annotated[bool, typer.Option(help="Replace the existing datacontract.yaml")] = False,
 ):
     """
-
+    Create an empty data contract.
     """
-
-        download_datacontract_file(location, template, overwrite)
-    except FileExistsException:
+    if not overwrite and os.path.exists(location):
         console.print("File already exists, use --overwrite to overwrite")
         raise typer.Exit(code=1)
-
-
+    template_str = get_init_template(template)
+    with open(location, "w") as f:
+        f.write(template_str)
+    console.print("📄 data contract written to " + location)
 
 
 @app.command()

@@ -99,7 +93,7 @@ def lint(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] =
+    ] = None,
 ):
     """
     Validate that the datacontract.yaml is correctly formatted.

@@ -117,7 +111,7 @@ def test(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] =
+    ] = None,
     server: Annotated[
         str,
         typer.Option(

@@ -132,13 +126,11 @@ def test(
         typer.Option(help="Run the schema and quality tests on the example data within the data contract."),
     ] = None,
     publish: Annotated[str, typer.Option(help="The url to publish the results after the test")] = None,
-    publish_to_opentelemetry: Annotated[
-        bool,
-        typer.Option(
-            help="Publish the results to opentelemetry. Use environment variables to configure the OTLP endpoint, headers, etc."
-        ),
-    ] = False,
     logs: Annotated[bool, typer.Option(help="Print logs")] = False,
+    ssl_verification: Annotated[
+        bool,
+        typer.Option(help="SSL verification when publishing the data contract."),
+    ] = True,
 ):
     """
     Run schema and quality tests on configured servers.

@@ -150,9 +142,9 @@ def test(
         data_contract_file=location,
         schema_location=schema,
         publish_url=publish,
-        publish_to_opentelemetry=publish_to_opentelemetry,
         server=server,
         examples=examples,
+        ssl_verification=ssl_verification,
     ).test()
     if logs:
         _print_logs(run)

@@ -200,7 +192,7 @@ def export(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] =
+    ] = None,
     # TODO: this should be a subcommand
     engine: Annotated[
         Optional[str],

@@ -280,6 +272,14 @@ def import_(
         Optional[str],
         typer.Option(help="Table name to assign to the model created from the Iceberg schema."),
     ] = None,
+    template: Annotated[
+        Optional[str],
+        typer.Option(help="The location (url or path) of the Data Contract Specification Template"),
+    ] = None,
+    schema: Annotated[
+        str,
+        typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout.

@@ -287,6 +287,8 @@ def import_(
     result = DataContract().import_from_source(
         format=format,
         source=source,
+        template=template,
+        schema=schema,
         glue_table=glue_table,
         bigquery_table=bigquery_table,
         bigquery_project=bigquery_project,

@@ -314,15 +316,18 @@ def publish(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] =
+    ] = None,
+    ssl_verification: Annotated[
+        bool,
+        typer.Option(help="SSL verification when publishing the data contract."),
+    ] = True,
 ):
     """
     Publish the data contract to the Data Mesh Manager.
     """
     publish_data_contract_to_datamesh_manager(
-
-
-        ).get_data_contract_specification(),
+        data_contract_dict=resolve_data_contract_dict(location),
+        ssl_verification=ssl_verification,
     )
 
 

@@ -338,7 +343,7 @@ def catalog(
     schema: Annotated[
         str,
         typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
-    ] =
+    ] = None,
 ):
     """
     Create an html catalog of data contracts.

@@ -447,6 +452,18 @@ def _handle_result(run):
         console.print(
             f"🟢 data contract is valid. Run {len(run.checks)} checks. Took {(run.timestampEnd - run.timestampStart).total_seconds()} seconds."
         )
+    elif run.result == "warning":
+        console.print("🟠 data contract has warnings. Found the following warnings:")
+        i = 1
+        for check in run.checks:
+            if check.result != "passed":
+                field = to_field(run, check)
+                if field:
+                    field = field + " "
+                else:
+                    field = ""
+                console.print(f"{i}) {field}{check.name}: {check.reason}")
+                i += 1
     else:
         console.print("🔴 data contract is invalid, found the following errors:")
         i = 1
datacontract/data_contract.py
CHANGED

@@ -22,8 +22,8 @@ from datacontract.engines.soda.check_soda_execute import check_soda_execute
 from datacontract.export.exporter import ExportFormat
 from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.importer_factory import importer_factory
+from datacontract.init.init_template import get_init_template
 from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
-from datacontract.integration.opentelemetry import publish_test_results_to_opentelemetry
 from datacontract.lint import resolve
 from datacontract.lint.linters.description_linter import DescriptionLinter
 from datacontract.lint.linters.example_model_linter import ExampleModelLinter

@@ -48,10 +48,10 @@ class DataContract:
         server: str = None,
         examples: bool = False,
         publish_url: str = None,
-        publish_to_opentelemetry: bool = False,
         spark: "SparkSession" = None,
         inline_definitions: bool = True,
         inline_quality: bool = True,
+        ssl_verification: bool = True,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str

@@ -60,10 +60,10 @@ class DataContract:
         self._server = server
         self._examples = examples
         self._publish_url = publish_url
-        self._publish_to_opentelemetry = publish_to_opentelemetry
         self._spark = spark
         self._inline_definitions = inline_definitions
         self._inline_quality = inline_quality
+        self._ssl_verification = ssl_verification
         self.all_linters = {
             ExampleModelLinter(),
             QualityUsesSchemaLinter(),

@@ -75,8 +75,9 @@ class DataContract:
         }
 
     @classmethod
-    def init(cls, template: str =
-
+    def init(cls, template: typing.Optional[str], schema: typing.Optional[str] = None) -> DataContractSpecification:
+        template_str = get_init_template(template)
+        return resolve.resolve_data_contract(data_contract_str=template_str, schema_location=schema)
 
     def lint(self, enabled_linters: typing.Union[str, set[str]] = "all") -> Run:
         """Lint the data contract by deserializing the contract and checking the schema, as well as calling the configured linters.

@@ -230,10 +231,7 @@ class DataContract:
         run.finish()
 
         if self._publish_url is not None:
-            publish_test_results_to_datamesh_manager(run, self._publish_url)
-
-        if self._publish_to_opentelemetry:
-            publish_test_results_to_opentelemetry(run)
+            publish_test_results_to_datamesh_manager(run, self._publish_url, self._ssl_verification)
 
         return run
 

@@ -347,9 +345,14 @@ class DataContract:
         )
 
     def import_from_source(
-        self,
+        self,
+        format: str,
+        source: typing.Optional[str] = None,
+        template: typing.Optional[str] = None,
+        schema: typing.Optional[str] = None,
+        **kwargs,
     ) -> DataContractSpecification:
-        data_contract_specification_initial = DataContract.init()
+        data_contract_specification_initial = DataContract.init(template=template, schema=schema)
 
         return importer_factory.create(format).import_source(
             data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs
datacontract/engines/fastjsonschema/check_jsonschema.py
CHANGED

@@ -11,7 +11,7 @@ from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
 from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
-from datacontract.model.run import Check, Run
+from datacontract.model.run import Check, ResultEnum, Run
 
 # Thread-safe cache for primaryKey fields.
 _primary_key_cache = {}

@@ -256,18 +256,29 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.info,
                 reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
                 engine="jsonschema",
             )
         )
+    elif server.type == "azure":
+        run.checks.append(
+            Check(
+                type="schema",
+                name="Check that JSON has valid schema",
+                model=model_name,
+                result=ResultEnum.info,
+                reason="JSON Schema check skipped for azure, as azure is currently not supported",
+                engine="jsonschema",
+            )
+        )
     else:
         run.checks.append(
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not supported",
                 engine="jsonschema",
             )

@@ -279,7 +290,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
                 type="schema",
                 name="Check that JSON has valid schema",
                 model=model_name,
-                result=
+                result=ResultEnum.passed,
                 reason="All JSON entries are valid.",
                 engine="jsonschema",
             )
datacontract/engines/soda/check_soda_execute.py
CHANGED

@@ -1,7 +1,5 @@
 import logging
 
-from soda.scan import Scan
-
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection

@@ -16,6 +14,11 @@ from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
 def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
+    from soda.common.config_helper import ConfigHelper
+
+    ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
+    from soda.scan import Scan
+
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return

@@ -25,6 +28,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
 
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
+            run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
             con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)

@@ -54,11 +58,12 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         scan.set_data_source_name(server.type)
     elif server.type == "databricks":
         if spark is not None:
-
+            run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
             spark.sql(f"USE {server.catalog}.{server.schema_}")
         else:
+            run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
             scan.add_configuration_yaml_str(soda_configuration_str)
             scan.set_data_source_name(server.type)

@@ -183,4 +188,4 @@ def update_reason(check, c):
                 # print(check.reason)
                 break  # Exit the loop once the desired block is found
     if "fail" in c["diagnostics"]:
-        check.reason = f"
+        check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}"
datacontract/engines/soda/connections/databricks.py
CHANGED

@@ -4,15 +4,24 @@ import yaml
 
 
 def to_databricks_soda_configuration(server):
+    token = os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+    if token is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_TOKEN environment variable is not set")
+    http_path = os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH")
+    host = server.host
+    if host is None:
+        host = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME")
+    if host is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set")
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "spark",
             "method": "databricks",
-            "host":
+            "host": host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path":
-            "token":
+            "http_path": http_path,
+            "token": token,
         }
     }
 
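Illustrative, not part of the diff: with the change above, the Databricks connection resolves its credentials from environment variables and fails fast when the token or hostname is missing. A minimal sketch of providing them before running tests; the values are placeholders, the variable names are the ones read above.

import os

# Placeholder credentials; set these before running tests against a
# databricks server block.
os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "dapi-placeholder"
os.environ["DATACONTRACT_DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/placeholder"
# Only consulted when the server block does not define a host.
os.environ["DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"] = "adb-0000000000000000.0.azuredatabricks.net"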
datacontract/engines/soda/connections/duckdb.py
CHANGED

@@ -146,6 +146,7 @@ def setup_azure_connection(con, server):
     tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
     client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
     client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+    storage_account = server.storageAccount
 
     if tenant_id is None:
         raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")

@@ -157,12 +158,24 @@ def setup_azure_connection(con, server):
     con.install_extension("azure")
     con.load_extension("azure")
 
-
-
-
-
-
-
-
-
-
+    if storage_account is not None:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}',
+            ACCOUNT_NAME '{storage_account}'
+        );
+        """)
+    else:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+        """)
datacontract/export/data_caterer_converter.py
CHANGED

@@ -42,11 +42,11 @@ def _to_data_caterer_generate_step(model_key, model_value: Model, server: Server
         "name": model_key,
         "type": _to_step_type(server),
         "options": _to_data_source_options(model_key, server),
-        "
+        "fields": [],
     }
     fields = _to_fields(model_value.fields)
     if fields:
-        step["
+        step["fields"] = fields
     return step
 
 

@@ -97,16 +97,29 @@ def _to_field(field_name: str, field: Field) -> dict:
     if new_type == "object" or new_type == "record" or new_type == "struct":
         # need to get nested field definitions
         nested_fields = _to_fields(field.fields)
-        dc_field["
+        dc_field["fields"] = nested_fields
+    elif new_type == "array":
+        if field.items is not None and field.items.type is not None:
+            dc_generator_opts["arrayType"] = _to_data_type(field.items.type)
+        else:
+            dc_generator_opts["arrayType"] = "string"
 
     if field.enum is not None and len(field.enum) > 0:
         dc_generator_opts["oneOf"] = field.enum
     if field.unique is not None and field.unique:
         dc_generator_opts["isUnique"] = field.unique
+    if field.primaryKey is not None and field.primaryKey:
+        dc_generator_opts["isPrimaryKey"] = field.primaryKey
     if field.minLength is not None:
-
+        if field.type is not None and field.type == "array":
+            dc_generator_opts["arrayMinLen"] = field.minLength
+        else:
+            dc_generator_opts["minLen"] = field.minLength
     if field.maxLength is not None:
-
+        if field.type is not None and field.type == "array":
+            dc_generator_opts["arrayMaxLen"] = field.maxLength
+        else:
+            dc_generator_opts["maxLen"] = field.maxLength
     if field.pattern is not None:
         dc_generator_opts["regex"] = field.pattern
     if field.minimum is not None:

@@ -115,7 +128,7 @@ def _to_field(field_name: str, field: Field) -> dict:
         dc_generator_opts["max"] = field.maximum
 
     if len(dc_generator_opts.keys()) > 0:
-        dc_field["
+        dc_field["options"] = dc_generator_opts
     return dc_field
 
 

@@ -124,7 +137,7 @@ def _to_data_type(data_type):
         return "double"
     elif data_type == "decimal" or data_type == "bigint":
         return "decimal"
-    elif data_type == "int":
+    elif data_type == "int" or data_type == "integer":
         return "integer"
     elif data_type == "long":
         return "long"
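Illustrative, not part of the diff: per the new array handling above, an array-of-string field with minLength and maxLength set would collect generator options roughly like the sketch below (the concrete values are made up).

# Rough shape of the generator options built for an array<string> field
# with minLength=1 and maxLength=5, following the branches above.
dc_generator_opts = {
    "arrayType": "string",
    "arrayMinLen": 1,
    "arrayMaxLen": 5,
}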
datacontract/export/dbml_converter.py
CHANGED

@@ -90,7 +90,7 @@ Note: {1}
 
 
 def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]:
-    if field.primary:
+    if field.primaryKey or field.primary:
         if field.required is not None:
             if not field.required:
                 raise DataContractException(

@@ -115,7 +115,7 @@ def generate_field(field_name: str, field: spec.Field, model_name: str, server:
         field.unique = True
 
     field_attrs = []
-    if field.primary:
+    if field.primaryKey or field.primary:
         field_attrs.append("pk")
 
     if field.unique:
datacontract/export/dbt_converter.py
CHANGED

@@ -39,13 +39,6 @@ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
 
 
 def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
-    if data_contract_spec.models is None or len(data_contract_spec.models.items()) != 1:
-        print(
-            "Export to dbt-staging-sql currently only works with exactly one model in the data contract."
-            "Please specify the model name."
-        )
-        return ""
-
     id = data_contract_spec.id
     columns = []
     for field_name, field in model_value.fields.items():

@@ -81,19 +74,21 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
 
     source["tables"] = []
     for model_key, model_value in data_contract_spec.models.items():
-        dbt_model = _to_dbt_source_table(model_key, model_value, adapter_type)
+        dbt_model = _to_dbt_source_table(data_contract_spec, model_key, model_value, adapter_type)
         source["tables"].append(dbt_model)
     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
-def _to_dbt_source_table(
+def _to_dbt_source_table(
+    data_contract_spec: DataContractSpecification, model_key, model_value: Model, adapter_type: Optional[str]
+) -> dict:
     dbt_model = {
         "name": model_key,
     }
 
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, False, adapter_type)
+    columns = _to_columns(data_contract_spec, model_value.fields, False, adapter_type)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model

@@ -114,7 +109,7 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
     dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, _supports_constraints(model_type), None)
+    columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), None)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model

@@ -137,15 +132,33 @@ def _supports_constraints(model_type):
     return model_type == "table" or model_type == "incremental"
 
 
-def _to_columns(
+def _to_columns(
+    data_contract_spec: DataContractSpecification,
+    fields: Dict[str, Field],
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+) -> list:
     columns = []
     for field_name, field in fields.items():
-        column = _to_column(field_name, field, supports_constraints, adapter_type)
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type)
         columns.append(column)
     return columns
 
 
-def
+def get_table_name_and_column_name(references: str) -> tuple[Optional[str], str]:
+    parts = references.split(".")
+    if len(parts) < 2:
+        return None, parts[0]
+    return parts[-2], parts[-1]
+
+
+def _to_column(
+    data_contract_spec: DataContractSpecification,
+    field_name: str,
+    field: Field,
+    supports_constraints: bool,
+    adapter_type: Optional[str],
+) -> dict:
     column = {"name": field_name}
     adapter_type = adapter_type or "snowflake"
     dbt_type = convert_to_sql_type(field, adapter_type)

@@ -239,9 +252,21 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
                 }
             }
         )
+    if field.references is not None:
+        ref_source_name = data_contract_spec.id
+        table_name, column_name = get_table_name_and_column_name(field.references)
+        if table_name is not None and column_name is not None:
+            column["data_tests"].append(
+                {
+                    "relationships": {
+                        "to": f"""source("{ref_source_name}", "{table_name}")""",
+                        "field": f"{column_name}",
+                    }
+                }
+            )
 
-
-
+    if not column["data_tests"]:
+        column.pop("data_tests")
 
     # TODO: all constraints
     return column
datacontract/export/exporter.py
CHANGED

@@ -2,7 +2,10 @@ import typing
 from abc import ABC, abstractmethod
 from enum import Enum
 
-from datacontract.model.data_contract_specification import
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+)
 
 
 class Exporter(ABC):

@@ -40,6 +43,7 @@ class ExportFormat(str, Enum):
     sqlalchemy = "sqlalchemy"
     data_caterer = "data-caterer"
     dcs = "dcs"
+    markdown = "markdown"
     iceberg = "iceberg"
 
     @classmethod

@@ -49,7 +53,7 @@ class ExportFormat(str, Enum):
 
 def _check_models_for_export(
     data_contract: DataContractSpecification, model: str, export_format: str
-) -> typing.Tuple[str,
+) -> typing.Tuple[str, Model]:
     if data_contract.models is None:
         raise RuntimeError(f"Export to {export_format} requires models in the data contract.")
 
|