datacontract-cli 0.10.16__py3-none-any.whl → 0.10.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/breaking/breaking_rules.py +4 -0
- datacontract/cli.py +49 -32
- datacontract/data_contract.py +14 -11
- datacontract/engines/fastjsonschema/check_jsonschema.py +15 -4
- datacontract/engines/soda/check_soda_execute.py +9 -4
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb.py +22 -9
- datacontract/export/data_caterer_converter.py +20 -7
- datacontract/export/dbml_converter.py +2 -2
- datacontract/export/dbt_converter.py +41 -16
- datacontract/export/exporter.py +6 -2
- datacontract/export/exporter_factory.py +48 -14
- datacontract/export/iceberg_converter.py +3 -3
- datacontract/export/markdown_converter.py +208 -0
- datacontract/export/odcs_v3_exporter.py +6 -0
- datacontract/export/sodacl_converter.py +22 -5
- datacontract/export/sql_converter.py +1 -1
- datacontract/export/sql_type_converter.py +28 -2
- datacontract/export/sqlalchemy_converter.py +3 -1
- datacontract/imports/csv_importer.py +89 -0
- datacontract/imports/dbml_importer.py +1 -1
- datacontract/imports/dbt_importer.py +94 -12
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_v2_importer.py +1 -1
- datacontract/imports/odcs_v3_importer.py +1 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +15 -9
- datacontract/lint/linters/field_reference_linter.py +10 -1
- datacontract/lint/resolve.py +48 -14
- datacontract/lint/schema.py +10 -3
- datacontract/model/data_contract_specification.py +13 -4
- datacontract/model/run.py +1 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/templates/datacontract.html +20 -1
- datacontract/templates/partials/definition.html +15 -5
- datacontract/templates/partials/model_field.html +10 -1
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/METADATA +477 -343
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/RECORD +46 -42
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/WHEEL +1 -1
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/opentelemetry.py +0 -103
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.16.dist-info → datacontract_cli-0.10.19.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,7 @@ from typing import TypedDict
|
|
|
3
3
|
|
|
4
4
|
from dbt.artifacts.resources.v1.components import ColumnInfo
|
|
5
5
|
from dbt.contracts.graph.manifest import Manifest
|
|
6
|
-
from dbt.contracts.graph.nodes import GenericTestNode
|
|
6
|
+
from dbt.contracts.graph.nodes import GenericTestNode, ManifestNode, ModelNode
|
|
7
7
|
from dbt_common.contracts.constraints import ConstraintType
|
|
8
8
|
|
|
9
9
|
from datacontract.imports.bigquery_importer import map_type_from_bigquery
|
|
@@ -51,6 +51,46 @@ def read_dbt_manifest(manifest_path: str) -> Manifest:
|
|
|
51
51
|
return manifest
|
|
52
52
|
|
|
53
53
|
|
|
54
|
+
def _get_primary_keys(manifest: Manifest, node: ManifestNode) -> list[str]:
|
|
55
|
+
node_unique_id = node.unique_id
|
|
56
|
+
if isinstance(node, ModelNode):
|
|
57
|
+
test_nodes = []
|
|
58
|
+
for node_id in manifest.child_map.get(node_unique_id, []):
|
|
59
|
+
test_node = manifest.nodes.get(node_id)
|
|
60
|
+
if not test_node or test_node.resource_type != "test":
|
|
61
|
+
continue
|
|
62
|
+
if not isinstance(test_node, GenericTestNode):
|
|
63
|
+
continue
|
|
64
|
+
if test_node.config.where is not None:
|
|
65
|
+
continue
|
|
66
|
+
test_nodes.append(test_node)
|
|
67
|
+
return node.infer_primary_key(test_nodes)
|
|
68
|
+
return []
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _get_references(manifest: Manifest, node: ManifestNode) -> dict[str, str]:
|
|
72
|
+
node_unique_id = node.unique_id
|
|
73
|
+
references = {}
|
|
74
|
+
for node_id in manifest.child_map.get(node_unique_id, []):
|
|
75
|
+
test_node = manifest.nodes.get(node_id)
|
|
76
|
+
if not test_node or test_node.resource_type != "test":
|
|
77
|
+
continue
|
|
78
|
+
if not isinstance(test_node, GenericTestNode):
|
|
79
|
+
continue
|
|
80
|
+
if test_node.test_metadata.name != "relationships":
|
|
81
|
+
continue
|
|
82
|
+
if test_node.config.where is not None:
|
|
83
|
+
continue
|
|
84
|
+
if test_node.attached_node != node_unique_id:
|
|
85
|
+
continue
|
|
86
|
+
relationship_target_node_id = [n for n in test_node.depends_on.nodes if n != node_unique_id][0]
|
|
87
|
+
relationship_target_node = manifest.nodes.get(relationship_target_node_id)
|
|
88
|
+
references[f"{node.name}.{test_node.column_name}"] = (
|
|
89
|
+
f"""{relationship_target_node.name}.{test_node.test_metadata.kwargs["field"]}"""
|
|
90
|
+
)
|
|
91
|
+
return references
|
|
92
|
+
|
|
93
|
+
|
|
54
94
|
def import_dbt_manifest(
|
|
55
95
|
data_contract_specification: DataContractSpecification,
|
|
56
96
|
manifest: Manifest,
|
|
@@ -65,28 +105,40 @@ def import_dbt_manifest(
|
|
|
65
105
|
data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
|
|
66
106
|
adapter_type = manifest.metadata.adapter_type
|
|
67
107
|
data_contract_specification.models = data_contract_specification.models or {}
|
|
68
|
-
for
|
|
108
|
+
for node in manifest.nodes.values():
|
|
69
109
|
# Only intressted in processing models.
|
|
70
|
-
if
|
|
110
|
+
if node.resource_type not in resource_types:
|
|
71
111
|
continue
|
|
72
112
|
|
|
73
113
|
# To allow args stored in dbt_models to filter relevant models.
|
|
74
114
|
# If dbt_models is empty, use all models.
|
|
75
|
-
if dbt_nodes and
|
|
115
|
+
if dbt_nodes and node.name not in dbt_nodes:
|
|
76
116
|
continue
|
|
77
117
|
|
|
118
|
+
model_unique_id = node.unique_id
|
|
119
|
+
primary_keys = _get_primary_keys(manifest, node)
|
|
120
|
+
references = _get_references(manifest, node)
|
|
121
|
+
|
|
122
|
+
primary_key = None
|
|
123
|
+
if len(primary_keys) == 1:
|
|
124
|
+
primary_key = primary_keys[0]
|
|
125
|
+
|
|
78
126
|
dc_model = Model(
|
|
79
|
-
description=
|
|
80
|
-
tags=
|
|
127
|
+
description=node.description,
|
|
128
|
+
tags=node.tags,
|
|
81
129
|
fields=create_fields(
|
|
82
130
|
manifest,
|
|
83
|
-
model_unique_id=
|
|
84
|
-
columns=
|
|
131
|
+
model_unique_id=model_unique_id,
|
|
132
|
+
columns=node.columns,
|
|
133
|
+
primary_key_name=primary_key,
|
|
134
|
+
references=references,
|
|
85
135
|
adapter_type=adapter_type,
|
|
86
136
|
),
|
|
87
137
|
)
|
|
138
|
+
if len(primary_keys) > 1:
|
|
139
|
+
dc_model.primaryKey = primary_keys
|
|
88
140
|
|
|
89
|
-
data_contract_specification.models[
|
|
141
|
+
data_contract_specification.models[node.name] = dc_model
|
|
90
142
|
|
|
91
143
|
return data_contract_specification
|
|
92
144
|
|
|
@@ -98,9 +150,17 @@ def convert_data_type_by_adapter_type(data_type: str, adapter_type: str) -> str:
|
|
|
98
150
|
|
|
99
151
|
|
|
100
152
|
def create_fields(
|
|
101
|
-
manifest: Manifest,
|
|
153
|
+
manifest: Manifest,
|
|
154
|
+
model_unique_id: str,
|
|
155
|
+
columns: dict[str, ColumnInfo],
|
|
156
|
+
primary_key_name: str,
|
|
157
|
+
references: dict[str, str],
|
|
158
|
+
adapter_type: str,
|
|
102
159
|
) -> dict[str, Field]:
|
|
103
|
-
fields = {
|
|
160
|
+
fields = {
|
|
161
|
+
column.name: create_field(manifest, model_unique_id, column, primary_key_name, references, adapter_type)
|
|
162
|
+
for column in columns.values()
|
|
163
|
+
}
|
|
104
164
|
return fields
|
|
105
165
|
|
|
106
166
|
|
|
@@ -137,7 +197,14 @@ def get_column_tests(manifest: Manifest, model_name: str, column_name: str) -> l
|
|
|
137
197
|
return column_tests
|
|
138
198
|
|
|
139
199
|
|
|
140
|
-
def create_field(
|
|
200
|
+
def create_field(
|
|
201
|
+
manifest: Manifest,
|
|
202
|
+
model_unique_id: str,
|
|
203
|
+
column: ColumnInfo,
|
|
204
|
+
primary_key_name: str,
|
|
205
|
+
references: dict[str, str],
|
|
206
|
+
adapter_type: str,
|
|
207
|
+
) -> Field:
|
|
141
208
|
column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else ""
|
|
142
209
|
field = Field(
|
|
143
210
|
description=column.description,
|
|
@@ -155,4 +222,19 @@ def create_field(manifest: Manifest, model_unique_id: str, column: ColumnInfo, a
|
|
|
155
222
|
if required:
|
|
156
223
|
field.required = required
|
|
157
224
|
|
|
225
|
+
unique = False
|
|
226
|
+
if any(constraint.type == ConstraintType.unique for constraint in column.constraints):
|
|
227
|
+
unique = True
|
|
228
|
+
if [test for test in all_tests if test["test_type"] == "unique"]:
|
|
229
|
+
unique = True
|
|
230
|
+
if unique:
|
|
231
|
+
field.unique = unique
|
|
232
|
+
|
|
233
|
+
if column.name == primary_key_name:
|
|
234
|
+
field.primaryKey = True
|
|
235
|
+
|
|
236
|
+
references_key = f"{manifest.nodes[model_unique_id].name}.{column.name}"
|
|
237
|
+
if references_key in references:
|
|
238
|
+
field.references = references[references_key]
|
|
239
|
+
|
|
158
240
|
return field
|
datacontract/imports/importer.py
CHANGED
|
@@ -104,3 +104,8 @@ importer_factory.register_lazy_importer(
|
|
|
104
104
|
module_path="datacontract.imports.parquet_importer",
|
|
105
105
|
class_name="ParquetImporter",
|
|
106
106
|
)
|
|
107
|
+
importer_factory.register_lazy_importer(
|
|
108
|
+
name=ImportFormat.csv,
|
|
109
|
+
module_path="datacontract.imports.csv_importer",
|
|
110
|
+
class_name="CsvImporter",
|
|
111
|
+
)
|
|
@@ -141,7 +141,7 @@ def import_fields(odcs_columns: Dict[str, Any], custom_type_mappings: Dict[str,
|
|
|
141
141
|
type=mapped_type,
|
|
142
142
|
title=column.get("businessName") if column.get("businessName") is not None else "",
|
|
143
143
|
required=not column.get("isNullable") if column.get("isNullable") is not None else False,
|
|
144
|
-
|
|
144
|
+
primaryKey=column.get("isPrimary") if column.get("isPrimary") is not None else False,
|
|
145
145
|
unique=column.get("isUnique") if column.get("isUnique") is not None else False,
|
|
146
146
|
classification=column.get("classification") if column.get("classification") is not None else "",
|
|
147
147
|
tags=column.get("tags") if column.get("tags") is not None else [],
|
|
@@ -265,7 +265,7 @@ def import_fields(
|
|
|
265
265
|
type=mapped_type,
|
|
266
266
|
title=odcs_property.get("businessName"),
|
|
267
267
|
required=not odcs_property.get("nullable") if odcs_property.get("nullable") is not None else False,
|
|
268
|
-
|
|
268
|
+
primaryKey=odcs_property.get("primaryKey")
|
|
269
269
|
if not has_composite_primary_key(odcs_properties) and odcs_property.get("primaryKey") is not None
|
|
270
270
|
else False,
|
|
271
271
|
unique=odcs_property.get("unique"),
|
|
@@ -38,7 +38,7 @@ def import_sql(data_contract_specification: DataContractSpecification, format: s
|
|
|
38
38
|
if primary_key in fields:
|
|
39
39
|
fields[primary_key].unique = True
|
|
40
40
|
fields[primary_key].required = True
|
|
41
|
-
fields[primary_key].
|
|
41
|
+
fields[primary_key].primaryKey = True
|
|
42
42
|
|
|
43
43
|
data_contract_specification.models[table_name] = Model(
|
|
44
44
|
type="table",
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import importlib.resources as resources
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.1.0.init.yaml"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_init_template(location: str = None) -> str:
|
|
10
|
+
if location is None:
|
|
11
|
+
logging.info("Use default bundled template " + DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
|
|
12
|
+
schemas = resources.files("datacontract")
|
|
13
|
+
template = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
|
|
14
|
+
with template.open("r") as file:
|
|
15
|
+
return file.read()
|
|
16
|
+
elif location.startswith("http://") or location.startswith("https://"):
|
|
17
|
+
return requests.get(location).text
|
|
18
|
+
else:
|
|
19
|
+
with open(location, "r") as file:
|
|
20
|
+
return file.read()
|
|
@@ -2,11 +2,10 @@ import os
|
|
|
2
2
|
|
|
3
3
|
import requests
|
|
4
4
|
|
|
5
|
-
from datacontract.model.data_contract_specification import DataContractSpecification
|
|
6
5
|
from datacontract.model.run import Run
|
|
7
6
|
|
|
8
7
|
|
|
9
|
-
def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
|
|
8
|
+
def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool):
|
|
10
9
|
try:
|
|
11
10
|
if publish_url is None:
|
|
12
11
|
# this url supports Data Mesh Manager and Data Contract Manager
|
|
@@ -28,7 +27,12 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
|
|
|
28
27
|
headers = {"Content-Type": "application/json", "x-api-key": api_key}
|
|
29
28
|
request_body = run.model_dump_json()
|
|
30
29
|
# print("Request Body:", request_body)
|
|
31
|
-
response = requests.post(
|
|
30
|
+
response = requests.post(
|
|
31
|
+
url,
|
|
32
|
+
data=request_body,
|
|
33
|
+
headers=headers,
|
|
34
|
+
verify=ssl_verification,
|
|
35
|
+
)
|
|
32
36
|
# print("Status Code:", response.status_code)
|
|
33
37
|
# print("Response Body:", response.text)
|
|
34
38
|
if response.status_code != 200:
|
|
@@ -39,9 +43,12 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
|
|
|
39
43
|
run.log_error(f"Failed publishing test results. Error: {str(e)}")
|
|
40
44
|
|
|
41
45
|
|
|
42
|
-
def publish_data_contract_to_datamesh_manager(
|
|
46
|
+
def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_verification: bool):
|
|
43
47
|
try:
|
|
44
48
|
api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
|
|
49
|
+
host = "https://api.datamesh-manager.com"
|
|
50
|
+
if os.getenv("DATAMESH_MANAGER_HOST") is not None:
|
|
51
|
+
host = os.getenv("DATAMESH_MANAGER_HOST")
|
|
45
52
|
if api_key is None:
|
|
46
53
|
api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
|
|
47
54
|
if api_key is None:
|
|
@@ -49,14 +56,13 @@ def publish_data_contract_to_datamesh_manager(data_contract_specification: DataC
|
|
|
49
56
|
"Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
|
|
50
57
|
)
|
|
51
58
|
headers = {"Content-Type": "application/json", "x-api-key": api_key}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
|
|
55
|
-
request_body = spec.model_dump_json().encode("utf-8")
|
|
59
|
+
id = data_contract_dict["id"]
|
|
60
|
+
url = f"{host}/api/datacontracts/{id}"
|
|
56
61
|
response = requests.put(
|
|
57
62
|
url=url,
|
|
58
|
-
|
|
63
|
+
json=data_contract_dict,
|
|
59
64
|
headers=headers,
|
|
65
|
+
verify=ssl_verification,
|
|
60
66
|
)
|
|
61
67
|
if response.status_code != 200:
|
|
62
68
|
print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
|
|
@@ -22,7 +22,16 @@ class FieldReferenceLinter(Linter):
|
|
|
22
22
|
for model_name, model in contract.models.items():
|
|
23
23
|
for field_name, field in model.fields.items():
|
|
24
24
|
if field.references:
|
|
25
|
-
|
|
25
|
+
reference_hierarchy = field.references.split(".")
|
|
26
|
+
if len(reference_hierarchy) != 2:
|
|
27
|
+
result = result.with_error(
|
|
28
|
+
f"Field '{field_name}' in model '{model_name}'"
|
|
29
|
+
f" references must follow the model.field syntax and refer to a field in a model in this data contract."
|
|
30
|
+
)
|
|
31
|
+
continue
|
|
32
|
+
ref_model = reference_hierarchy[0]
|
|
33
|
+
ref_field = reference_hierarchy[1]
|
|
34
|
+
|
|
26
35
|
if ref_model not in contract.models:
|
|
27
36
|
result = result.with_error(
|
|
28
37
|
f"Field '{field_name}' in model '{model_name}'"
|
datacontract/lint/resolve.py
CHANGED
|
@@ -44,6 +44,27 @@ def resolve_data_contract(
|
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
def resolve_data_contract_dict(
|
|
48
|
+
data_contract_location: str = None,
|
|
49
|
+
data_contract_str: str = None,
|
|
50
|
+
data_contract: DataContractSpecification = None,
|
|
51
|
+
) -> dict:
|
|
52
|
+
if data_contract_location is not None:
|
|
53
|
+
return _to_yaml(read_resource(data_contract_location))
|
|
54
|
+
elif data_contract_str is not None:
|
|
55
|
+
return _to_yaml(data_contract_str)
|
|
56
|
+
elif data_contract is not None:
|
|
57
|
+
return data_contract.model_dump()
|
|
58
|
+
else:
|
|
59
|
+
raise DataContractException(
|
|
60
|
+
type="lint",
|
|
61
|
+
result="failed",
|
|
62
|
+
name="Check that data contract YAML is valid",
|
|
63
|
+
reason="Data contract needs to be provided",
|
|
64
|
+
engine="datacontract",
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
|
|
47
68
|
def resolve_data_contract_from_location(
|
|
48
69
|
location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
|
|
49
70
|
) -> DataContractSpecification:
|
|
@@ -54,20 +75,30 @@ def resolve_data_contract_from_location(
|
|
|
54
75
|
def inline_definitions_into_data_contract(spec: DataContractSpecification):
|
|
55
76
|
for model in spec.models.values():
|
|
56
77
|
for field in model.fields.values():
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
78
|
+
inline_definition_into_field(field, spec)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def inline_definition_into_field(field, spec):
|
|
82
|
+
# iterate recursively over arrays
|
|
83
|
+
if field.items is not None:
|
|
84
|
+
inline_definition_into_field(field.items, spec)
|
|
60
85
|
|
|
61
|
-
|
|
62
|
-
|
|
86
|
+
# iterate recursively over nested fields
|
|
87
|
+
if field.fields is not None:
|
|
88
|
+
for nested_field_name, nested_field in field.fields.items():
|
|
89
|
+
inline_definition_into_field(nested_field, spec)
|
|
63
90
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
91
|
+
if not field.ref:
|
|
92
|
+
return
|
|
93
|
+
|
|
94
|
+
definition = _resolve_definition_ref(field.ref, spec)
|
|
95
|
+
for field_name in field.model_fields.keys():
|
|
96
|
+
if field_name in definition.model_fields_set and field_name not in field.model_fields_set:
|
|
97
|
+
setattr(field, field_name, getattr(definition, field_name))
|
|
98
|
+
# extras
|
|
99
|
+
for extra_field_name, extra_field_value in definition.model_extra.items():
|
|
100
|
+
if extra_field_name not in field.model_extra.keys():
|
|
101
|
+
setattr(field, extra_field_name, extra_field_value)
|
|
71
102
|
|
|
72
103
|
|
|
73
104
|
def _resolve_definition_ref(ref, spec) -> Definition:
|
|
@@ -202,9 +233,12 @@ def _resolve_data_contract_from_str(
|
|
|
202
233
|
yaml_dict = _to_yaml(data_contract_str)
|
|
203
234
|
|
|
204
235
|
if is_open_data_contract_standard(yaml_dict):
|
|
236
|
+
logging.info("Importing ODCS v3")
|
|
205
237
|
# if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
|
|
206
238
|
data_contract_specification = DataContractSpecification(dataContractSpecification="1.1.0")
|
|
207
239
|
return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str)
|
|
240
|
+
else:
|
|
241
|
+
logging.info("Importing DCS")
|
|
208
242
|
|
|
209
243
|
_validate_data_contract_specification_schema(yaml_dict, schema_location)
|
|
210
244
|
data_contract_specification = yaml_dict
|
|
@@ -218,7 +252,7 @@ def _resolve_data_contract_from_str(
|
|
|
218
252
|
return spec
|
|
219
253
|
|
|
220
254
|
|
|
221
|
-
def _to_yaml(data_contract_str):
|
|
255
|
+
def _to_yaml(data_contract_str) -> dict:
|
|
222
256
|
try:
|
|
223
257
|
yaml_dict = yaml.safe_load(data_contract_str)
|
|
224
258
|
return yaml_dict
|
|
@@ -236,7 +270,7 @@ def _to_yaml(data_contract_str):
|
|
|
236
270
|
def _validate_data_contract_specification_schema(data_contract_yaml, schema_location: str = None):
|
|
237
271
|
schema = fetch_schema(schema_location)
|
|
238
272
|
try:
|
|
239
|
-
fastjsonschema.validate(schema, data_contract_yaml)
|
|
273
|
+
fastjsonschema.validate(schema, data_contract_yaml, use_default=False)
|
|
240
274
|
logging.debug("YAML data is valid.")
|
|
241
275
|
except JsonSchemaValueException as e:
|
|
242
276
|
logging.warning(f"Data Contract YAML is invalid. Validation error: {e.message}")
|
datacontract/lint/schema.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import importlib.resources as resources
|
|
1
2
|
import json
|
|
3
|
+
import logging
|
|
2
4
|
import os
|
|
3
5
|
from typing import Any, Dict
|
|
4
6
|
|
|
@@ -6,6 +8,8 @@ import requests
|
|
|
6
8
|
|
|
7
9
|
from datacontract.model.exceptions import DataContractException
|
|
8
10
|
|
|
11
|
+
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.1.0.schema.json"
|
|
12
|
+
|
|
9
13
|
|
|
10
14
|
def fetch_schema(location: str = None) -> Dict[str, Any]:
|
|
11
15
|
"""
|
|
@@ -27,9 +31,12 @@ def fetch_schema(location: str = None) -> Dict[str, Any]:
|
|
|
27
31
|
|
|
28
32
|
"""
|
|
29
33
|
if location is None:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
34
|
+
logging.info("Use default bundled schema " + DEFAULT_DATA_CONTRACT_SCHEMA)
|
|
35
|
+
schemas = resources.files("datacontract")
|
|
36
|
+
schema_file = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_SCHEMA)
|
|
37
|
+
with schema_file.open("r") as file:
|
|
38
|
+
schema = json.load(file)
|
|
39
|
+
elif location.startswith("http://") or location.startswith("https://"):
|
|
33
40
|
response = requests.get(location)
|
|
34
41
|
schema = response.json()
|
|
35
42
|
else:
|
|
@@ -72,6 +72,7 @@ class Server(pyd.BaseModel):
|
|
|
72
72
|
dataProductId: str = None
|
|
73
73
|
outputPortId: str = None
|
|
74
74
|
driver: str = None
|
|
75
|
+
storageAccount: str = None
|
|
75
76
|
roles: List[ServerRole] = None
|
|
76
77
|
|
|
77
78
|
model_config = pyd.ConfigDict(
|
|
@@ -112,6 +113,7 @@ class Definition(pyd.BaseModel):
|
|
|
112
113
|
tags: List[str] = []
|
|
113
114
|
links: Dict[str, str] = {}
|
|
114
115
|
example: str = None
|
|
116
|
+
examples: List[Any] | None = None
|
|
115
117
|
|
|
116
118
|
model_config = pyd.ConfigDict(
|
|
117
119
|
extra="allow",
|
|
@@ -141,13 +143,15 @@ class Quality(pyd.BaseModel):
|
|
|
141
143
|
|
|
142
144
|
class Field(pyd.BaseModel):
|
|
143
145
|
ref: str = pyd.Field(default=None, alias="$ref")
|
|
144
|
-
ref_obj: Definition = pyd.Field(default=None, exclude=True)
|
|
145
146
|
title: str | None = None
|
|
146
147
|
type: str = None
|
|
147
148
|
format: str = None
|
|
148
149
|
required: bool = None
|
|
149
|
-
primary: bool =
|
|
150
|
-
|
|
150
|
+
primary: bool = pyd.Field(
|
|
151
|
+
default=None,
|
|
152
|
+
deprecated="Removed in Data Contract Specification v1.1.0. Use primaryKey instead.",
|
|
153
|
+
)
|
|
154
|
+
primaryKey: bool | None = None
|
|
151
155
|
unique: bool | None = None
|
|
152
156
|
references: str = None
|
|
153
157
|
description: str | None = None
|
|
@@ -169,7 +173,10 @@ class Field(pyd.BaseModel):
|
|
|
169
173
|
values: "Field" = None
|
|
170
174
|
precision: int = None
|
|
171
175
|
scale: int = None
|
|
172
|
-
example: str =
|
|
176
|
+
example: str = pyd.Field(
|
|
177
|
+
default=None,
|
|
178
|
+
deprecated="Removed in Data Contract Specification v1.1.0. Use " "examples instead.",
|
|
179
|
+
)
|
|
173
180
|
examples: List[Any] | None = None
|
|
174
181
|
quality: List[Quality] | None = []
|
|
175
182
|
config: Dict[str, Any] | None = None
|
|
@@ -186,6 +193,8 @@ class Model(pyd.BaseModel):
|
|
|
186
193
|
title: Optional[str] = None
|
|
187
194
|
fields: Dict[str, Field] = {}
|
|
188
195
|
quality: List[Quality] | None = []
|
|
196
|
+
primaryKey: List[str] | None = []
|
|
197
|
+
examples: List[Any] | None = None
|
|
189
198
|
config: Dict[str, Any] = None
|
|
190
199
|
tags: List[str] | None = None
|
|
191
200
|
|
datacontract/model/run.py
CHANGED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
dataContractSpecification: 1.1.0
|
|
2
|
+
id: my-data-contract-id
|
|
3
|
+
info:
|
|
4
|
+
title: My Data Contract
|
|
5
|
+
version: 0.0.1
|
|
6
|
+
# description:
|
|
7
|
+
# owner:
|
|
8
|
+
# contact:
|
|
9
|
+
# name:
|
|
10
|
+
# url:
|
|
11
|
+
# email:
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
### servers
|
|
15
|
+
|
|
16
|
+
#servers:
|
|
17
|
+
# production:
|
|
18
|
+
# type: s3
|
|
19
|
+
# location: s3://
|
|
20
|
+
# format: parquet
|
|
21
|
+
# delimiter: new_line
|
|
22
|
+
|
|
23
|
+
### terms
|
|
24
|
+
|
|
25
|
+
#terms:
|
|
26
|
+
# usage:
|
|
27
|
+
# limitations:
|
|
28
|
+
# billing:
|
|
29
|
+
# noticePeriod:
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
### models
|
|
33
|
+
|
|
34
|
+
# models:
|
|
35
|
+
# my_model:
|
|
36
|
+
# description:
|
|
37
|
+
# type:
|
|
38
|
+
# fields:
|
|
39
|
+
# my_field:
|
|
40
|
+
# type:
|
|
41
|
+
# description:
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
### definitions
|
|
45
|
+
|
|
46
|
+
# definitions:
|
|
47
|
+
# my_field:
|
|
48
|
+
# domain:
|
|
49
|
+
# name:
|
|
50
|
+
# title:
|
|
51
|
+
# type:
|
|
52
|
+
# description:
|
|
53
|
+
# example:
|
|
54
|
+
# pii:
|
|
55
|
+
# classification:
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
### servicelevels
|
|
59
|
+
|
|
60
|
+
#servicelevels:
|
|
61
|
+
# availability:
|
|
62
|
+
# description: The server is available during support hours
|
|
63
|
+
# percentage: 99.9%
|
|
64
|
+
# retention:
|
|
65
|
+
# description: Data is retained for one year because!
|
|
66
|
+
# period: P1Y
|
|
67
|
+
# unlimited: false
|
|
68
|
+
# latency:
|
|
69
|
+
# description: Data is available within 25 hours after the order was placed
|
|
70
|
+
# threshold: 25h
|
|
71
|
+
# sourceTimestampField: orders.order_timestamp
|
|
72
|
+
# processedTimestampField: orders.processed_timestamp
|
|
73
|
+
# freshness:
|
|
74
|
+
# description: The age of the youngest row in a table.
|
|
75
|
+
# threshold: 25h
|
|
76
|
+
# timestampField: orders.order_timestamp
|
|
77
|
+
# frequency:
|
|
78
|
+
# description: Data is delivered once a day
|
|
79
|
+
# type: batch # or streaming
|
|
80
|
+
# interval: daily # for batch, either or cron
|
|
81
|
+
# cron: 0 0 * * * # for batch, either or interval
|
|
82
|
+
# support:
|
|
83
|
+
# description: The data is available during typical business hours at headquarters
|
|
84
|
+
# time: 9am to 5pm in EST on business days
|
|
85
|
+
# responseTime: 1h
|
|
86
|
+
# backup:
|
|
87
|
+
# description: Data is backed up once a week, every Sunday at 0:00 UTC.
|
|
88
|
+
# interval: weekly
|
|
89
|
+
# cron: 0 0 * * 0
|
|
90
|
+
# recoveryTime: 24 hours
|
|
91
|
+
# recoveryPoint: 1 week
|