datacontract-cli 0.10.10__py3-none-any.whl → 0.10.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/cli.py +19 -3
- datacontract/data_contract.py +17 -17
- datacontract/engines/fastjsonschema/check_jsonschema.py +15 -1
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
- datacontract/engines/soda/check_soda_execute.py +2 -8
- datacontract/engines/soda/connections/duckdb.py +23 -20
- datacontract/engines/soda/connections/kafka.py +81 -23
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/export/avro_converter.py +12 -2
- datacontract/export/dbml_converter.py +42 -19
- datacontract/export/exporter.py +2 -1
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/jsonschema_converter.py +1 -4
- datacontract/export/spark_converter.py +4 -0
- datacontract/export/sql_type_converter.py +64 -29
- datacontract/export/sqlalchemy_converter.py +169 -0
- datacontract/imports/avro_importer.py +1 -0
- datacontract/imports/bigquery_importer.py +2 -2
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +67 -91
- datacontract/imports/glue_importer.py +64 -54
- datacontract/imports/importer.py +3 -2
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/jsonschema_importer.py +106 -120
- datacontract/imports/odcs_importer.py +1 -1
- datacontract/imports/spark_importer.py +29 -10
- datacontract/imports/sql_importer.py +5 -1
- datacontract/imports/unity_importer.py +1 -1
- datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
- datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
- datacontract/model/data_contract_specification.py +6 -2
- datacontract/templates/partials/model_field.html +10 -2
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/METADATA +283 -113
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/RECORD +38 -37
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/WHEEL +1 -1
- datacontract/publish/publish.py +0 -32
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/top_level.txt +0 -0
datacontract/imports/glue_importer.py
CHANGED

@@ -1,6 +1,6 @@
 import boto3
-from typing import List
-
+from typing import List, Dict, Generator
+import re
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
@@ -13,7 +13,7 @@ from datacontract.model.data_contract_specification import (
 class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_glue(data_contract_specification, source, import_args.get("glue_table"))


@@ -39,7 +39,7 @@ def get_glue_database(database_name: str):

     return (
         response["Database"]["CatalogId"],
-        response["Database"].get("LocationUri"
+        response["Database"].get("LocationUri"),
     )


@@ -75,7 +75,7 @@ def get_glue_tables(database_name: str) -> List[str]:
     return table_names


-def get_glue_table_schema(database_name: str, table_name: str):
+def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
     """Get the schema of a Glue table.

     Args:
@@ -93,11 +93,11 @@ def get_glue_table_schema(database_name: str, table_name: str):
         response = glue.get_table(DatabaseName=database_name, Name=table_name)
     except glue.exceptions.EntityNotFoundException:
         print(f"Table {table_name} not found in database {database_name}.")
-        return
+        return []
     except Exception as e:
         # todo catch all
         print(f"Error: {e}")
-        return
+        return []

     table_schema = response["Table"]["StorageDescriptor"]["Columns"]

@@ -109,10 +109,9 @@ def get_glue_table_schema(database_name: str, table_name: str):
                 "Name": pk["Name"],
                 "Type": pk["Type"],
                 "Hive": True,
-                "Comment": "
+                "Comment": pk.get("Comment"),
             }
         )
-
     return table_schema


@@ -120,7 +119,7 @@ def import_glue(
     data_contract_specification: DataContractSpecification,
     source: str,
     table_names: List[str],
-):
+) -> DataContractSpecification:
     """Import the schema of a Glue database.

     Args:
@@ -140,8 +139,13 @@ def import_glue(
     if table_names is None:
         table_names = get_glue_tables(source)

+    server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+    if location_uri:
+        server_kwargs["location"] = location_uri
+
     data_contract_specification.servers = {
-        "production": Server(
+        "production": Server(**server_kwargs),
     }

     for table_name in table_names:
@@ -161,12 +165,6 @@ def import_glue(
             field.description = column.get("Comment")
             fields[column["Name"]] = field

-            if "decimal" in column["Type"]:
-                # Extract precision and scale from the string
-                perc_scale = column["Type"][8:-1].split(",")
-                field.precision = int(perc_scale[0])
-                field.scale = int(perc_scale[1])
-
         data_contract_specification.models[table_name] = Model(
             type="table",
             fields=fields,
@@ -186,21 +184,43 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
- (9 removed lines whose content is not captured in this diff view)
+    # Example: array<string>
+    if dtype.startswith("array"):
+        field.type = "array"
+        field.items = create_typed_field(dtype[6:-1])
+    # Example: struct<field1:float,field2:string>
+    elif dtype.startswith("struct"):
+        field.type = "struct"
+        for f in split_struct(dtype[7:-1]):
+            field_name, field_key = f.split(":", 1)
+            field.fields[field_name] = create_typed_field(field_key)
+    # Example: map<string,int>
+    elif dtype.startswith("map"):
+        field.type = "map"
+        map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+        if map_match:
+            key_type = map_match.group(1)
+            value_type = map_match.group(2)
+            field.keys = create_typed_field(key_type)
+            field.values = create_typed_field(value_type)
+    # Example: decimal(38, 6) or decimal
+    elif dtype.startswith("decimal"):
+        field.type = "decimal"
+        decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+        if decimal_match:  # if precision specified
+            field.precision = int(decimal_match.group(1))
+            field.scale = int(decimal_match.group(2))
+    # Example: varchar(255) or varchar
+    elif dtype.startswith("varchar"):
+        field.type = "varchar"
+        if len(dtype) > 7:
+            field.maxLength = int(dtype[8:-1])
     else:
         field.type = map_type_from_sql(dtype)
     return field


-def split_fields(s: str):
+def split_fields(s: str) -> Generator[str, None, None]:
     """Split a string of fields considering nested structures.

     Args:
@@ -247,30 +267,20 @@ def map_type_from_sql(sql_type: str) -> str:
         return None

     sql_type = sql_type.lower()
- (17 removed lines whose content is not captured in this diff view)
-        return "double"
-    if sql_type.startswith("boolean"):
-        return "boolean"
-    if sql_type.startswith("timestamp"):
-        return "timestamp"
-    if sql_type.startswith("date"):
-        return "date"
-    if sql_type.startswith("decimal"):
-        return "decimal"
-    return "variant"
+
+    type_mapping = {
+        "string": "string",
+        "int": "int",
+        "bigint": "bigint",
+        "float": "float",
+        "double": "double",
+        "boolean": "boolean",
+        "timestamp": "timestamp",
+        "date": "date",
+    }
+
+    for prefix, mapped_type in type_mapping.items():
+        if sql_type.startswith(prefix):
+            return mapped_type
+
+    return "unknown"
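To illustrate the new type parsing, here is a minimal usage sketch. It assumes datacontract-cli 0.10.12 is installed and that the Field model exposes the keys, values, precision and scale attributes used in the diff above; the nested type string is made up.

from datacontract.imports.glue_importer import create_typed_field

# Hypothetical nested Glue/Hive type string, purely for illustration.
field = create_typed_field("map<string,decimal(10,2)>")

print(field.type)              # "map"
print(field.keys.type)         # "string"
print(field.values.type)       # "decimal"
print(field.values.precision)  # 10
print(field.values.scale)      # 2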
datacontract/imports/importer.py
CHANGED

@@ -14,7 +14,7 @@ class Importer(ABC):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) ->
+    ) -> DataContractSpecification:
         pass


@@ -22,6 +22,7 @@ class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
     dbt = "dbt"
+    dbml = "dbml"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
@@ -30,5 +31,5 @@ class ImportFormat(str, Enum):
     spark = "spark"

     @classmethod
-    def
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))
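A quick sketch of the classmethod above (assuming the package is installed); the exact list depends on the registered formats, but it should now include the new dbml entry:

from datacontract.imports.importer import ImportFormat

formats = ImportFormat.get_supported_formats()
print("dbml" in formats)  # True in 0.10.12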
datacontract/imports/importer_factory.py
CHANGED

@@ -88,3 +88,8 @@ importer_factory.register_lazy_importer
 importer_factory.register_lazy_importer(
     name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbml,
+    module_path="datacontract.imports.dbml_importer",
+    class_name="DBMLImporter",
+)
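The lazy registration shown above only records where the DBML importer lives, so its dependencies are not imported until the format is actually used. A standalone sketch of that pattern with hypothetical names (not the package's actual factory code):

import importlib


class LazyImporterFactory:
    """Illustrative stand-in for the factory's lazy registration."""

    def __init__(self):
        self._lazy_importers = {}

    def register_lazy_importer(self, name: str, module_path: str, class_name: str):
        # Record the location of the importer class; do not import it yet.
        self._lazy_importers[name] = (module_path, class_name)

    def create(self, name: str):
        module_path, class_name = self._lazy_importers[name]
        module = importlib.import_module(module_path)  # imported on first use
        return getattr(module, class_name)()


factory = LazyImporterFactory()
factory.register_lazy_importer("dbml", "datacontract.imports.dbml_importer", "DBMLImporter")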
datacontract/imports/jsonschema_importer.py
CHANGED

@@ -10,140 +10,49 @@ from datacontract.model.exceptions import DataContractException
 class JsonSchemaImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_jsonschema(data_contract_specification, source)


-def convert_json_schema_properties(properties, is_definition=False):
-    fields = {}
-    for field_name, field_schema in properties.items():
-        field_kwargs = {}
-        field_type = field_schema.get("type")
-
-        # Determine if the field is required and set the type to the non-null option if applicable
-        if isinstance(field_type, list) and "null" in field_type:
-            field_kwargs["required"] = False
-            non_null_types = [t for t in field_type if t != "null"]
-            if non_null_types:
-                field_type = non_null_types[0]
-            else:
-                field_type = None
-        else:
-            field_kwargs["required"] = True
-
-        # Set the non-null type
-        if field_type:
-            field_kwargs["type"] = field_type
-
-        for key, value in field_schema.items():
-            match key:
-                case "title":
-                    field_kwargs["title"] = value
-                case "type":
-                    pass  # type is already handled above
-                case "format":
-                    field_kwargs["format"] = value
-                case "description":
-                    field_kwargs["description"] = value
-                case "pattern":
-                    field_kwargs["pattern"] = value
-                case "minLength":
-                    field_kwargs["minLength"] = value
-                case "maxLength":
-                    field_kwargs["maxLength"] = value
-                case "minimum":
-                    field_kwargs["minimum"] = value
-                case "exclusiveMinimum":
-                    field_kwargs["exclusiveMinimum"] = value
-                case "maximum":
-                    field_kwargs["maximum"] = value
-                case "exclusiveMaximum":
-                    field_kwargs["exclusiveMaximum"] = value
-                case "enum":
-                    field_kwargs["enum"] = value
-                case "tags":
-                    field_kwargs["tags"] = value
-                case "properties":
-                    field_kwargs["fields"] = convert_json_schema_properties(value, is_definition=is_definition)
-                case "items":
-                    field_kwargs["items"] = convert_json_schema_properties(value, is_definition=is_definition)
-
-        if is_definition:
-            field = Definition(**field_kwargs)
-        else:
-            field = Field(**field_kwargs)
-        fields[field_name] = field
-
-    return fields
-
-
 def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
     if data_contract_specification.models is None:
         data_contract_specification.models = {}

+    json_schema = load_and_validate_json_schema(source)
+
+    title = json_schema.get("title", "default_model")
+    description = json_schema.get("description")
+    type_ = json_schema.get("type")
+    properties = json_schema.get("properties", {})
+    required_properties = json_schema.get("required", [])
+
+    fields_kwargs = jsonschema_to_args(properties, required_properties)
+    fields = {name: Field(**kwargs) for name, kwargs in fields_kwargs.items()}
+
+    model = Model(description=description, type=type_, title=title, fields=fields)
+    data_contract_specification.models[title] = model
+
+    definitions = json_schema.get("definitions", {})
+    for name, schema in definitions.items():
+        kwargs = schema_to_args(schema)
+        data_contract_specification.definitions[name] = Definition(name=name, **kwargs)
+
+    return data_contract_specification
+
+
+def load_and_validate_json_schema(source):
     try:
         with open(source, "r") as file:
             json_schema = json.loads(file.read())
-
-
-
-            model = Model(
-                description=json_schema.get("description"),
-                type=json_schema.get("type"),
-                title=json_schema.get("title"),
-                fields=convert_json_schema_properties(json_schema.get("properties", {})),
-            )
-            data_contract_specification.models[json_schema.get("title", "default_model")] = model
-
-            if "definitions" in json_schema:
-                for def_name, def_schema in json_schema["definitions"].items():
-                    definition_kwargs = {}
-
-                    for key, value in def_schema.items():
-                        match key:
-                            case "domain":
-                                definition_kwargs["domain"] = value
-                            case "title":
-                                definition_kwargs["title"] = value
-                            case "description":
-                                definition_kwargs["description"] = value
-                            case "type":
-                                definition_kwargs["type"] = value
-                            case "enum":
-                                definition_kwargs["enum"] = value
-                            case "format":
-                                definition_kwargs["format"] = value
-                            case "minLength":
-                                definition_kwargs["minLength"] = value
-                            case "maxLength":
-                                definition_kwargs["maxLength"] = value
-                            case "pattern":
-                                definition_kwargs["pattern"] = value
-                            case "minimum":
-                                definition_kwargs["minimum"] = value
-                            case "exclusiveMinimum":
-                                definition_kwargs["exclusiveMinimum"] = value
-                            case "maximum":
-                                definition_kwargs["maximum"] = value
-                            case "exclusiveMaximum":
-                                definition_kwargs["exclusiveMaximum"] = value
-                            case "pii":
-                                definition_kwargs["pii"] = value
-                            case "classification":
-                                definition_kwargs["classification"] = value
-                            case "tags":
-                                definition_kwargs["tags"] = value
-                            case "properties":
-                                definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
-
-                    definition = Definition(name=def_name, **definition_kwargs)
-                    data_contract_specification.definitions[def_name] = definition
+
+        validator = fastjsonschema.compile({})
+        validator(json_schema)

     except fastjsonschema.JsonSchemaException as e:
         raise DataContractException(
             type="schema",
             name="Parse json schema",
-            reason=f"Failed to
+            reason=f"Failed to validate json schema from {source}: {e}",
             engine="datacontract",
         )

@@ -155,5 +64,82 @@ def import_jsonschema(data_contract_specification: DataContractSpecification, so
             engine="datacontract",
             original_exception=e,
         )
+    return json_schema

-
+
+def jsonschema_to_args(properties, required_properties):
+    args = {}
+    for property, property_schema in properties.items():
+        is_required = property in required_properties
+        args[property] = schema_to_args(property_schema, is_required)
+
+    return args
+
+
+def schema_to_args(property_schema, is_required: bool = None) -> dict:
+    direct_mappings = {
+        "title",
+        "description",
+        "format",
+        "pattern",
+        "enum",
+        "tags",
+        "pii",
+        "minLength",
+        "maxLength",
+        "minimum",
+        "exclusiveMinimum",
+        "maximum",
+        "exclusiveMaximum",
+    }
+
+    field_kwargs = {key: value for key, value in property_schema.items() if key in direct_mappings}
+
+    if is_required is not None:
+        field_kwargs["required"] = is_required
+
+    property_type = determine_type(property_schema)
+    if property_type is not None:
+        field_kwargs["type"] = property_type
+
+    if property_type == "array":
+        nested_item_type, nested_items = determine_nested_item_type(property_schema)
+
+        if nested_items is not None:
+            field_kwargs["items"] = schema_to_args(nested_item_type)
+
+    nested_properties = property_schema.get("properties")
+    if nested_properties is not None:
+        # recursive call for complex nested properties
+        field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
+
+    return field_kwargs
+
+
+def determine_nested_item_type(property_schema):
+    nested_items = property_schema.get("items")
+    nested_items_is_list = isinstance(nested_items, list)
+    if nested_items_is_list and len(nested_items) != 1:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Union types for arrays are currently not supported ({nested_items})",
+            engine="datacontract",
+        )
+    if nested_items_is_list and len(nested_items) == 1:
+        nested_item_type = nested_items[0]
+    elif not nested_items_is_list and nested_items is not None:
+        nested_item_type = nested_items
+    return nested_item_type, nested_items
+
+
+def determine_type(property_schema):
+    property_type = property_schema.get("type")
+    type_is_list = isinstance(property_type, list)
+    if type_is_list:
+        non_null_types = [t for t in property_type if t != "null"]
+        if non_null_types:
+            property_type = non_null_types[0]
+        else:
+            property_type = None
+    return property_type
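For illustration, here is how the new helpers translate a single JSON Schema property into Field keyword arguments; a sketch assuming datacontract-cli 0.10.12 is installed, with a made-up property schema:

from datacontract.imports.jsonschema_importer import determine_type, schema_to_args

property_schema = {
    "type": ["string", "null"],   # nullable union type
    "format": "email",
    "maxLength": 254,
    "description": "Customer email address",
}

print(determine_type(property_schema))        # "string" (the first non-null entry)
print(schema_to_args(property_schema, False))
# roughly: {"format": "email", "maxLength": 254,
#           "description": "Customer email address", "required": False, "type": "string"}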
datacontract/imports/odcs_importer.py
CHANGED

@@ -46,7 +46,7 @@ DATACONTRACT_TYPES = [
 class OdcsImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_odcs(data_contract_specification, source)

datacontract/imports/spark_importer.py
CHANGED

@@ -14,7 +14,7 @@ class SparkImporter(Importer):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) ->
+    ) -> DataContractSpecification:
         """
         Imports data from a Spark source into the data contract specification.

@@ -63,12 +63,12 @@ def import_from_spark_df(df: DataFrame) -> Model:
     schema = df.schema

     for field in schema:
-        model.fields[field.name] =
+        model.fields[field.name] = _field_from_struct_type(field)

     return model


-def
+def _field_from_struct_type(spark_field: types.StructField) -> Field:
     """
     Converts a Spark StructField into a Field object for the data contract.

@@ -76,18 +76,35 @@ def _field_from_spark(spark_field: types.StructField) -> Field:
         spark_field: The Spark StructField to convert.

     Returns:
-        Field: The
+        Field: The generated Field object.
     """
-    field_type = _data_type_from_spark(spark_field.dataType)
     field = Field()
-    field.type = field_type
     field.required = not spark_field.nullable
+    return _type_from_data_type(field, spark_field.dataType)

-    if field_type == "array":
-        field.items = _field_from_spark(spark_field.dataType.elementType)

-
-
+def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+    """
+    Maps Spark data types to the Data Contract type system and updates the field.
+
+    Args:
+        field: The Field object to update.
+        spark_type: The Spark data type to map.
+
+    Returns:
+        Field: The updated Field object.
+    """
+    field.type = _data_type_from_spark(spark_type)
+
+    if field.type == "array":
+        field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+    elif field.type == "map":
+        field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+        field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+    elif field.type == "struct":
+        field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}

     return field

@@ -116,6 +133,8 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "struct"
     elif isinstance(spark_type, types.ArrayType):
         return "array"
+    elif isinstance(spark_type, types.MapType):
+        return "map"
     elif isinstance(spark_type, types.TimestampType):
         return "timestamp"
     elif isinstance(spark_type, types.TimestampNTZType):
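A short sketch of the new Spark mapping for a map column; it requires pyspark and assumes the helper keeps the private name shown in the diff above:

from pyspark.sql import types

from datacontract.imports.spark_importer import _field_from_struct_type

struct_field = types.StructField(
    "tags", types.MapType(types.StringType(), types.IntegerType()), nullable=True
)

field = _field_from_struct_type(struct_field)
print(field.type)      # "map", via the new MapType branch
print(field.required)  # False, because the column is nullable
print(field.keys.type, field.values.type)  # key/value types mapped recursively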
datacontract/imports/sql_importer.py
CHANGED

@@ -7,7 +7,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 class SqlImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_sql(data_contract_specification, self.import_format, source)


@@ -64,6 +64,10 @@ def map_type_from_sql(sql_type: str):
         return "integer"
     elif sql_type_normed.startswith("float"):
         return "float"
+    elif sql_type_normed.startswith("decimal"):
+        return "decimal"
+    elif sql_type_normed.startswith("numeric"):
+        return "numeric"
     elif sql_type_normed.startswith("bool"):
         return "boolean"
     elif sql_type_normed.startswith("timestamp"):
datacontract/imports/unity_importer.py
CHANGED

@@ -11,7 +11,7 @@ from datacontract.model.exceptions import DataContractException
 class UnityImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         if source is not None:
             data_contract_specification = import_unity_from_json(data_contract_specification, source)
         else:
datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py}
RENAMED

@@ -2,28 +2,29 @@ import os

 import requests

+from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.model.run import Run


-def
+def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
             # this url supports Data Mesh Manager and Data Contract Manager
             url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
+
         api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
         if api_key is None:
             api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
-
-        if run.dataContractId is None:
-            raise Exception("Cannot publish run results, as data contract ID is unknown")
-
         if api_key is None:
             raise Exception(
                 "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
             )

+        if run.dataContractId is None:
+            raise Exception("Cannot publish run results, as data contract ID is unknown")
+
         headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
@@ -36,3 +37,30 @@ def publish_datamesh_manager(run: Run, publish_url: str):
         run.log_info(f"Published test results to {url}")
     except Exception as e:
         run.log_error(f"Failed publishing test results. Error: {str(e)}")
+
+
+def publish_data_contract_to_datamesh_manager(data_contract_specification: DataContractSpecification):
+    try:
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
+            )
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
+        spec = data_contract_specification
+        id = spec.id
+        url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+        request_body = spec.model_dump_json().encode("utf-8")
+        response = requests.put(
+            url=url,
+            data=request_body,
+            headers=headers,
+        )
+        if response.status_code != 200:
+            print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+            exit(1)
+        print(f"Published data contract to {url}")
+    except Exception as e:
+        print(f"Failed publishing data contract. Error: {str(e)}")
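And a usage sketch for the new publish helper; it assumes an API key is exported in the environment and that DataContract(...).get_data_contract_specification() is available as in other releases:

import os

from datacontract.data_contract import DataContract
from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

# The helper reads DATAMESH_MANAGER_API_KEY or DATACONTRACT_MANAGER_API_KEY.
os.environ.setdefault("DATAMESH_MANAGER_API_KEY", "<your-api-key>")

spec = DataContract(data_contract_file="datacontract.yaml").get_data_contract_specification()
publish_data_contract_to_datamesh_manager(spec)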
datacontract/integration/{publish_opentelemetry.py → opentelemetry.py}
RENAMED

@@ -34,7 +34,7 @@ from datacontract.model.run import Run
 # - Metrics only, no logs yet (but loosely planned)


-def
+def publish_test_results_to_opentelemetry(run: Run):
     try:
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")