datacontract-cli 0.10.9__py3-none-any.whl → 0.10.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/cli.py +7 -0
- datacontract/data_contract.py +4 -2
- datacontract/engines/soda/check_soda_execute.py +5 -2
- datacontract/engines/soda/connections/duckdb.py +4 -0
- datacontract/export/avro_converter.py +1 -1
- datacontract/export/sodacl_converter.py +1 -1
- datacontract/imports/avro_importer.py +142 -8
- datacontract/imports/dbt_importer.py +117 -0
- datacontract/imports/glue_importer.py +2 -2
- datacontract/imports/importer.py +6 -1
- datacontract/imports/importer_factory.py +24 -6
- datacontract/imports/jsonschema_importer.py +6 -3
- datacontract/imports/spark_importer.py +134 -0
- datacontract/integration/publish_datamesh_manager.py +10 -5
- datacontract/lint/resolve.py +72 -27
- datacontract/lint/schema.py +24 -4
- datacontract/model/data_contract_specification.py +3 -0
- datacontract/templates/datacontract.html +1 -1
- datacontract/templates/index.html +1 -1
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/METADATA +114 -101
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/RECORD +25 -23
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.10.dist-info}/top_level.txt +0 -0
datacontract/imports/spark_importer.py
ADDED
@@ -0,0 +1,134 @@
+from pyspark.sql import DataFrame, SparkSession, types
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+class SparkImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        """
+        Imports data from a Spark source into the data contract specification.
+
+        Args:
+            data_contract_specification: The data contract specification object.
+            source: The source string indicating the Spark tables to read.
+            import_args: Additional arguments for the import process.
+
+        Returns:
+            dict: The updated data contract specification.
+        """
+        return import_spark(data_contract_specification, source)
+
+
+def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Reads Spark tables and updates the data contract specification with their schemas.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: A comma-separated string of Spark temporary views to read.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    data_contract_specification.servers["local"] = Server(type="dataframe")
+    for temp_view in source.split(","):
+        temp_view = temp_view.strip()
+        df = spark.read.table(temp_view)
+        data_contract_specification.models[temp_view] = import_from_spark_df(df)
+    return data_contract_specification
+
+
+def import_from_spark_df(df: DataFrame) -> Model:
+    """
+    Converts a Spark DataFrame into a Model.
+
+    Args:
+        df: The Spark DataFrame to convert.
+
+    Returns:
+        Model: The generated data contract model.
+    """
+    model = Model()
+    schema = df.schema
+
+    for field in schema:
+        model.fields[field.name] = _field_from_spark(field)
+
+    return model
+
+
+def _field_from_spark(spark_field: types.StructField) -> Field:
+    """
+    Converts a Spark StructField into a Field object for the data contract.
+
+    Args:
+        spark_field: The Spark StructField to convert.
+
+    Returns:
+        Field: The corresponding Field object.
+    """
+    field_type = _data_type_from_spark(spark_field.dataType)
+    field = Field()
+    field.type = field_type
+    field.required = not spark_field.nullable
+
+    if field_type == "array":
+        field.items = _field_from_spark(spark_field.dataType.elementType)
+
+    if field_type == "struct":
+        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+    return field
+
+
+def _data_type_from_spark(spark_type: types.DataType) -> str:
+    """
+    Maps Spark data types to the Data Contract type system.
+
+    Args:
+        spark_type: The Spark data type to map.
+
+    Returns:
+        str: The corresponding Data Contract type.
+    """
+    if isinstance(spark_type, types.StringType):
+        return "string"
+    elif isinstance(spark_type, types.IntegerType):
+        return "integer"
+    elif isinstance(spark_type, types.LongType):
+        return "long"
+    elif isinstance(spark_type, types.FloatType):
+        return "float"
+    elif isinstance(spark_type, types.DoubleType):
+        return "double"
+    elif isinstance(spark_type, types.StructType):
+        return "struct"
+    elif isinstance(spark_type, types.ArrayType):
+        return "array"
+    elif isinstance(spark_type, types.TimestampType):
+        return "timestamp"
+    elif isinstance(spark_type, types.TimestampNTZType):
+        return "timestamp_ntz"
+    elif isinstance(spark_type, types.DateType):
+        return "date"
+    elif isinstance(spark_type, types.BooleanType):
+        return "boolean"
+    elif isinstance(spark_type, types.BinaryType):
+        return "bytes"
+    elif isinstance(spark_type, types.DecimalType):
+        return "decimal"
+    elif isinstance(spark_type, types.NullType):
+        return "null"
+    else:
+        raise ValueError(f"Unsupported Spark type: {spark_type}")
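The new importer reads registered Spark temporary views by name, so a quick way to exercise it is to register a view first. A minimal sketch, not part of the release; the `users` view and its columns are illustrative:

from pyspark.sql import SparkSession

from datacontract.imports.spark_importer import import_spark
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "alice")], ["id", "name"])
df.createOrReplaceTempView("users")  # import_spark looks views up by name via spark.read.table

spec = import_spark(DataContractSpecification(), "users")
print(spec.models["users"].fields["id"].type)        # "long" (Spark infers LongType for Python ints)
print(spec.models["users"].fields["name"].required)  # False, since the column is nullable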
datacontract/integration/publish_datamesh_manager.py
CHANGED
@@ -8,18 +8,23 @@ from datacontract.model.run import Run
 def publish_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
-            url …
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
-
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
 
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
 
-        if …
-            raise Exception(…
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
 
-        headers = {"Content-Type": "application/json", "x-api-key": …
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
         response = requests.post(url, data=request_body, headers=headers)
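The net effect of this hunk is a two-step API key lookup. A standalone sketch of the same resolution order, for illustration only; the error wording below is not the one raised by the CLI:

import os

# Prefer DATAMESH_MANAGER_API_KEY, fall back to DATACONTRACT_MANAGER_API_KEY, fail if neither is set.
api_key = os.getenv("DATAMESH_MANAGER_API_KEY") or os.getenv("DATACONTRACT_MANAGER_API_KEY")
if api_key is None:
    raise RuntimeError("Set DATAMESH_MANAGER_API_KEY or DATACONTRACT_MANAGER_API_KEY before publishing run results.")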
datacontract/lint/resolve.py
CHANGED
@@ -25,7 +25,7 @@ def resolve_data_contract(
             data_contract_location, schema_location, inline_definitions, inline_quality
         )
     elif data_contract_str is not None:
-        return …
+        return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality)
     elif data_contract is not None:
         return data_contract
     else:
@@ -45,7 +45,7 @@ def resolve_data_contract_from_location(
         data_contract_str = fetch_resource(location)
     else:
         data_contract_str = read_file(location)
-    return …
+    return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality)
 
 
 def inline_definitions_into_data_contract(spec: DataContractSpecification):
@@ -55,7 +55,7 @@ def inline_definitions_into_data_contract(spec: DataContractSpecification):
             if not field.ref and not field.ref_obj:
                 continue
 
-            definition = …
+            definition = _resolve_definition_ref(field.ref, spec)
             field.ref_obj = definition
 
             for field_name in field.model_fields.keys():
@@ -67,19 +67,41 @@ def inline_definitions_into_data_contract(spec: DataContractSpecification):
                 setattr(field, extra_field_name, extra_field_value)
 
 
-def …
-    …
-    path = ref
-    …
+def _resolve_definition_ref(ref, spec) -> Definition:
+    logging.info(f"Resolving definition ref {ref}")
+
+    if "#" in ref:
+        path, definition_path = ref.split("#")
+    else:
+        path, definition_path = ref, None
+
+    if path.startswith("http://") or path.startswith("https://"):
+        logging.info(f"Resolving definition url {path}")
+
+        definition_str = fetch_resource(path)
+        definition_dict = _to_yaml(definition_str)
+        definition = Definition(**definition_dict)
+        if definition_path is not None:
+            return _find_by_path_in_definition(definition_path, definition)
+        else:
+            return definition
+    elif path.startswith("file://"):
+        logging.info(f"Resolving definition file path {path}")
+
+        path = path.replace("file://", "")
+        definition_str = _fetch_file(path)
+        definition_dict = _to_yaml(definition_str)
+        definition = Definition(**definition_dict)
+        if definition_path is not None:
+            return _find_by_path_in_definition(definition_path, definition)
+        else:
+            return definition
+    elif ref.startswith("#"):
+        logging.info(f"Resolving definition local path {path}")
+
+        definition_path = ref[1:]
+
+        return _find_by_path_in_spec(definition_path, spec)
     else:
         raise DataContractException(
             type="lint",
@@ -90,7 +112,30 @@ def resolve_definition_ref(ref, definitions) -> Definition:
         )
 
 
-def …
+def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification):
+    path_elements = definition_path.split("/")
+    definition = spec.definitions[path_elements[2]]
+    definition = _find_subfield_in_definition(definition, path_elements[3:])
+    return definition
+
+
+def _find_by_path_in_definition(definition_path: str, definition: Definition):
+    if definition_path == "" or definition_path == "/":
+        return definition
+
+    path_elements = definition_path.split("/")
+    return _find_subfield_in_definition(definition, path_elements[1:])
+
+
+def _find_subfield_in_definition(definition: Definition, path_elements):
+    while len(path_elements) > 0 and path_elements[0] == "fields":
+        definition = definition.fields[path_elements[1]]
+        path_elements = path_elements[2:]
+
+    return definition
+
+
+def _fetch_file(path) -> str:
     if not os.path.exists(path):
         raise DataContractException(
             type="export",
@@ -103,7 +148,7 @@ def fetch_file(path) -> str:
         return file.read()
 
 
-def …
+def _resolve_quality_ref(quality: Quality):
     """
     Return the content of a ref file path
     @param quality data contract quality specification
@@ -112,13 +157,13 @@ def resolve_quality_ref(quality: Quality):
     specification = quality.specification
     if quality.type == "great-expectations":
         for model, model_quality in specification.items():
-            specification[model] = …
+            specification[model] = _get_quality_ref_file(model_quality)
     else:
         if "$ref" in specification:
-            quality.specification = …
+            quality.specification = _get_quality_ref_file(specification)
 
 
-def …
+def _get_quality_ref_file(quality_spec: str | object) -> str | object:
     """
     Get the file associated with a quality reference
     @param quality_spec quality specification
@@ -139,23 +184,23 @@ def get_quality_ref_file(quality_spec: str | object) -> str | object:
         return quality_spec
 
 
-def …
+def _resolve_data_contract_from_str(
     data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
 ) -> DataContractSpecification:
-    data_contract_yaml_dict = …
-    …
+    data_contract_yaml_dict = _to_yaml(data_contract_str)
+    _validate(data_contract_yaml_dict, schema_location)
 
     spec = DataContractSpecification(**data_contract_yaml_dict)
 
     if inline_definitions:
         inline_definitions_into_data_contract(spec)
     if spec.quality and inline_quality:
-        …
+        _resolve_quality_ref(spec.quality)
 
     return spec
 
 
-def …
+def _to_yaml(data_contract_str):
     try:
         yaml_dict = yaml.safe_load(data_contract_str)
         return yaml_dict
@@ -170,7 +215,7 @@ def to_yaml(data_contract_str):
         )
 
 
-def …
+def _validate(data_contract_yaml, schema_location: str = None):
     schema = fetch_schema(schema_location)
     try:
         fastjsonschema.validate(schema, data_contract_yaml)
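The rewritten `_resolve_definition_ref` branches on the shape of the ref string: HTTP(S) URLs, `file://` paths, and local `#/definitions/...` pointers, each optionally followed by a `/fields/...` sub-path. A sketch of the accepted shapes; the paths and definition names below are illustrative, not taken from the release:

local_ref  = "#/definitions/order_id"                        # resolved within the same contract via _find_by_path_in_spec
nested_ref = "#/definitions/address/fields/street"           # walks nested fields via _find_subfield_in_definition
file_ref   = "file://definitions/order_id.yaml#/fields/id"   # local YAML file, optional sub-path after '#'
remote_ref = "https://example.com/definitions/order_id.yaml" # fetched with fetch_resource over HTTP(S)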
datacontract/lint/schema.py
CHANGED
@@ -1,18 +1,37 @@
 import json
 import os
+from typing import Dict, Any
 
 import requests
 
 from datacontract.model.exceptions import DataContractException
 
 
-def fetch_schema(location: str = None):
+def fetch_schema(location: str = None) -> Dict[str, Any]:
+    """
+    Fetch and return a JSON schema from a given location.
+
+    This function retrieves a JSON schema either from a URL or a local file path.
+    If no location is provided, it defaults to the DataContract schema URL.
+
+    Args:
+        location: The URL or file path of the schema.
+
+    Returns:
+        The JSON schema as a dictionary.
+
+    Raises:
+        DataContractException: If the specified local file does not exist.
+        requests.RequestException: If there's an error fetching the schema from a URL.
+        json.JSONDecodeError: If there's an error decoding the JSON schema.
+
+    """
     if location is None:
         location = "https://datacontract.com/datacontract.schema.json"
 
     if location.startswith("http://") or location.startswith("https://"):
         response = requests.get(location)
-        …
+        schema = response.json()
     else:
         if not os.path.exists(location):
             raise DataContractException(
@@ -23,5 +42,6 @@ def fetch_schema(location: str = None):
                 result="error",
             )
         with open(location, "r") as file:
-            …
-            …
+            schema = json.load(file)
+
+    return schema
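With the schema now returned from both branches, callers get the parsed dictionary directly. A small sketch, assuming network access to the default schema URL:

from datacontract.lint.schema import fetch_schema

schema = fetch_schema()  # defaults to https://datacontract.com/datacontract.schema.json
print(sorted(schema.get("properties", {}).keys()))  # top-level data contract keys, if the schema exposes them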
datacontract/model/data_contract_specification.py
CHANGED
@@ -73,6 +73,7 @@ class Definition(pyd.BaseModel):
     exclusiveMaximum: int = None
     pii: bool = None
     classification: str = None
+    fields: Dict[str, "Definition"] = {}
     tags: List[str] = []
     links: Dict[str, str] = {}
     example: str = None
@@ -107,6 +108,8 @@ class Field(pyd.BaseModel):
     links: Dict[str, str] = {}
     fields: Dict[str, "Field"] = {}
     items: "Field" = None
+    keys: "Field" = None
+    values: "Field" = None
     precision: int = None
     scale: int = None
     example: str = None
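The new `keys` and `values` attributes let a field describe map-like types, mirroring how `items` already describes arrays, and the new nested `fields` on `Definition` is what `_find_subfield_in_definition` walks. A hypothetical construction; the `"map"` type string is an assumption, not something this hunk shows:

from datacontract.model.data_contract_specification import Field

prices = Field(
    type="map",                    # assumed type name; only the keys/values attributes are introduced here
    keys=Field(type="string"),     # schema of the map keys
    values=Field(type="decimal"),  # schema of the map values
)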
datacontract/templates/datacontract.html
CHANGED
@@ -250,7 +250,7 @@
         </div>
         <div class="mt-8 md:order-1 md:mt-0">
           <p class="text-center leading-5 text-gray-400">
-            Supported with ❤️ by <a href="https://…
+            Supported with ❤️ by <a href="https://datacontract-manager.com" class="text-gray-400 hover:text-gray-500">Data Contract Manager</a>
           </p>
         </div>
       </div>
datacontract/templates/index.html
CHANGED
@@ -190,7 +190,7 @@
         </div>
         <div class="mt-8 md:order-1 md:mt-0">
           <p class="text-center leading-5 text-gray-400">
-            Supported with ❤️ by <a href="https://…
+            Supported with ❤️ by <a href="https://datacontract-manager.com" class="text-gray-400 hover:text-gray-500">Data Contract Manager</a>
           </p>
         </div>
       </div>