datacontract-cli 0.10.9__py3-none-any.whl → 0.10.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/cli.py +7 -0
- datacontract/data_contract.py +16 -9
- datacontract/engines/fastjsonschema/check_jsonschema.py +4 -1
- datacontract/engines/soda/check_soda_execute.py +5 -2
- datacontract/engines/soda/connections/duckdb.py +20 -12
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/export/avro_converter.py +1 -1
- datacontract/export/dbml_converter.py +41 -19
- datacontract/export/exporter.py +1 -1
- datacontract/export/jsonschema_converter.py +1 -4
- datacontract/export/sodacl_converter.py +1 -1
- datacontract/imports/avro_importer.py +142 -8
- datacontract/imports/dbt_importer.py +117 -0
- datacontract/imports/glue_importer.py +9 -3
- datacontract/imports/importer.py +7 -2
- datacontract/imports/importer_factory.py +24 -6
- datacontract/imports/jsonschema_importer.py +106 -117
- datacontract/imports/spark_importer.py +134 -0
- datacontract/imports/sql_importer.py +4 -0
- datacontract/integration/publish_datamesh_manager.py +10 -5
- datacontract/lint/resolve.py +72 -27
- datacontract/lint/schema.py +24 -4
- datacontract/model/data_contract_specification.py +3 -0
- datacontract/templates/datacontract.html +1 -1
- datacontract/templates/index.html +1 -1
- datacontract/templates/partials/model_field.html +10 -2
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/METADATA +300 -192
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/RECORD +32 -30
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.9.dist-info → datacontract_cli-0.10.11.dist-info}/top_level.txt +0 -0
datacontract/imports/glue_importer.py
CHANGED
@@ -14,7 +14,7 @@ class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> dict:
-        return import_glue(data_contract_specification, source, import_args.get("
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
 def get_glue_database(database_name: str):
@@ -154,7 +154,7 @@ def import_glue(
     for column in table_schema:
         field = create_typed_field(column["Type"])
 
-        # hive
+        # hive partitions are required, but are not primary keys
         if column.get("Hive"):
             field.required = True
 
@@ -186,7 +186,7 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
-    if dtype.startswith(("array", "struct")):
+    if dtype.startswith(("array", "struct", "map")):
         orig_dtype: str = dtype
         if dtype.startswith("array"):
             field.type = "array"
@@ -195,6 +195,12 @@ def create_typed_field(dtype: str) -> Field:
             field.type = "struct"
             for f in split_struct(orig_dtype[7:-1]):
                 field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
+        elif dtype.startswith("map"):
+            field.type = "map"
+            key_type = orig_dtype[4:-1].split(",", 1)[0]
+            value_type = orig_dtype[4:-1].split(",", 1)[1]
+            field.keys = create_typed_field(key_type)
+            field.values = create_typed_field(value_type)
     else:
         field.type = map_type_from_sql(dtype)
     return field
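For illustration, a minimal sketch of what the new map branch yields for a Hive/Glue column type, assuming create_typed_field is importable from datacontract.imports.glue_importer as shown above; the expected values are inferred from the hunk rather than taken from test output.

from datacontract.imports.glue_importer import create_typed_field

# A Glue column declared as map<string,int> should now become a "map" field
# with typed keys and values instead of falling through to the scalar mapping.
field = create_typed_field("map<string,int>")
print(field.type)         # expected: map
print(field.keys.type)    # expected: string
print(field.values.type)  # expected: integer (via map_type_from_sql)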
datacontract/imports/importer.py
CHANGED
@@ -10,7 +10,10 @@ class Importer(ABC):
 
     @abstractmethod
     def import_source(
-        self,
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
     ) -> dict:
         pass
 
@@ -18,12 +21,14 @@ class Importer(ABC):
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    dbt = "dbt"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"
+    spark = "spark"
 
     @classmethod
-    def
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))
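To see the effect of the enum additions, the classmethod completed in this hunk can list every supported import format; a minimal sketch, assuming ImportFormat is importable from datacontract.imports.importer:

from datacontract.imports.importer import ImportFormat

# "dbt" and "spark" should now appear alongside the existing formats.
print(ImportFormat.get_supported_formats())
# e.g. ['sql', 'avro', 'dbt', 'glue', 'jsonschema', 'bigquery', 'odcs', 'unity', 'spark']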
datacontract/imports/importer_factory.py
CHANGED
@@ -18,7 +18,7 @@ class ImporterFactory:
         importers = self.dict_importer.copy()
         importers.update(self.dict_lazy_importer.copy())
         if name not in importers.keys():
-            raise ValueError(f"The '{name}' format is not
+            raise ValueError(f"The '{name}' format is not supported.")
         importer_class = importers[name]
         if type(importers[name]) is tuple:
             importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
@@ -46,7 +46,9 @@ def load_module_class(module_path, class_name):
 
 importer_factory = ImporterFactory()
 importer_factory.register_lazy_importer(
-    name=ImportFormat.avro,
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.bigquery,
@@ -54,7 +56,9 @@ importer_factory.register_lazy_importer(
     class_name="BigQueryImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.glue,
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.jsonschema,
@@ -62,11 +66,25 @@ importer_factory.register_lazy_importer(
     class_name="JsonSchemaImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.odcs,
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.sql,
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.unity,
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
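The lazy registrations above store importers as (module_path, class_name) string pairs, so optional dependencies such as pyspark are only imported once their format is actually requested. A rough sketch of the resolution step; the body below is an assumption based on the load_module_class name in the hunk header, not the package's actual code:

import importlib

def load_module_class(module_path: str, class_name: str):
    # resolve a lazily registered importer only when it is needed
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

SparkImporter = load_module_class("datacontract.imports.spark_importer", "SparkImporter")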
datacontract/imports/jsonschema_importer.py
CHANGED
@@ -10,137 +10,49 @@ from datacontract.model.exceptions import DataContractException
 class JsonSchemaImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_jsonschema(data_contract_specification, source)
 
 
-def convert_json_schema_properties(properties, is_definition=False):
-    fields = {}
-    for field_name, field_schema in properties.items():
-        field_kwargs = {}
-        field_type = field_schema.get("type")
-
-        # Determine if the field is required and set the type to the non-null option if applicable
-        if isinstance(field_type, list) and "null" in field_type:
-            field_kwargs["required"] = False
-            non_null_types = [t for t in field_type if t != "null"]
-            if non_null_types:
-                field_type = non_null_types[0]
-            else:
-                field_type = None
-        else:
-            field_kwargs["required"] = True
-
-        # Set the non-null type
-        if field_type:
-            field_kwargs["type"] = field_type
-
-        for key, value in field_schema.items():
-            match key:
-                case "title":
-                    field_kwargs["title"] = value
-                case "type":
-                    pass  # type is already handled above
-                case "format":
-                    field_kwargs["format"] = value
-                case "description":
-                    field_kwargs["description"] = value
-                case "pattern":
-                    field_kwargs["pattern"] = value
-                case "minLength":
-                    field_kwargs["minLength"] = value
-                case "maxLength":
-                    field_kwargs["maxLength"] = value
-                case "minimum":
-                    field_kwargs["minimum"] = value
-                case "exclusiveMinimum":
-                    field_kwargs["exclusiveMinimum"] = value
-                case "maximum":
-                    field_kwargs["maximum"] = value
-                case "exclusiveMaximum":
-                    field_kwargs["exclusiveMaximum"] = value
-                case "enum":
-                    field_kwargs["enum"] = value
-                case "tags":
-                    field_kwargs["tags"] = value
-                case "properties":
-                    field_kwargs["fields"] = convert_json_schema_properties(value)
-                case "items":
-                    field_kwargs["items"] = convert_json_schema_properties(value)
-
-        field = Field(**field_kwargs)
-        fields[field_name] = field
-
-    return fields
-
-
 def import_jsonschema(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
+    json_schema = load_and_validate_json_schema(source)
+
+    title = json_schema.get("title", "default_model")
+    description = json_schema.get("description")
+    type_ = json_schema.get("type")
+    properties = json_schema.get("properties", {})
+    required_properties = json_schema.get("required", [])
+
+    fields_kwargs = jsonschema_to_args(properties, required_properties)
+    fields = {name: Field(**kwargs) for name, kwargs in fields_kwargs.items()}
+
+    model = Model(description=description, type=type_, title=title, fields=fields)
+    data_contract_specification.models[title] = model
+
+    definitions = json_schema.get("definitions", {})
+    for name, schema in definitions.items():
+        kwargs = schema_to_args(schema)
+        data_contract_specification.definitions[name] = Definition(name=name, **kwargs)
+
+    return data_contract_specification
+
+
+def load_and_validate_json_schema(source):
     try:
         with open(source, "r") as file:
             json_schema = json.loads(file.read())
-
-
-
-        model = Model(
-            description=json_schema.get("description"),
-            type=json_schema.get("type"),
-            title=json_schema.get("title"),
-            fields=convert_json_schema_properties(json_schema.get("properties", {})),
-        )
-        data_contract_specification.models[json_schema.get("title", "default_model")] = model
-
-        if "definitions" in json_schema:
-            for def_name, def_schema in json_schema["definitions"].items():
-                definition_kwargs = {}
-
-                for key, value in def_schema.items():
-                    match key:
-                        case "domain":
-                            definition_kwargs["domain"] = value
-                        case "title":
-                            definition_kwargs["title"] = value
-                        case "description":
-                            definition_kwargs["description"] = value
-                        case "type":
-                            definition_kwargs["type"] = value
-                        case "enum":
-                            definition_kwargs["enum"] = value
-                        case "format":
-                            definition_kwargs["format"] = value
-                        case "minLength":
-                            definition_kwargs["minLength"] = value
-                        case "maxLength":
-                            definition_kwargs["maxLength"] = value
-                        case "pattern":
-                            definition_kwargs["pattern"] = value
-                        case "minimum":
-                            definition_kwargs["minimum"] = value
-                        case "exclusiveMinimum":
-                            definition_kwargs["exclusiveMinimum"] = value
-                        case "maximum":
-                            definition_kwargs["maximum"] = value
-                        case "exclusiveMaximum":
-                            definition_kwargs["exclusiveMaximum"] = value
-                        case "pii":
-                            definition_kwargs["pii"] = value
-                        case "classification":
-                            definition_kwargs["classification"] = value
-                        case "tags":
-                            definition_kwargs["tags"] = value
-                        case "properties":
-                            definition_kwargs["fields"] = convert_json_schema_properties(value, is_definition=True)
-
-                definition = Definition(name=def_name, **definition_kwargs)
-                data_contract_specification.definitions[def_name] = definition
+
+        validator = fastjsonschema.compile({})
+        validator(json_schema)
 
     except fastjsonschema.JsonSchemaException as e:
         raise DataContractException(
             type="schema",
             name="Parse json schema",
-            reason=f"Failed to
+            reason=f"Failed to validate json schema from {source}: {e}",
             engine="datacontract",
         )
 
@@ -152,5 +64,82 @@ def import_jsonschema(data_contract_specification: DataContractSpecification, so
             engine="datacontract",
             original_exception=e,
         )
+    return json_schema
 
-
+
+def jsonschema_to_args(properties, required_properties):
+    args = {}
+    for property, property_schema in properties.items():
+        is_required = property in required_properties
+        args[property] = schema_to_args(property_schema, is_required)
+
+    return args
+
+
+def schema_to_args(property_schema, is_required: bool = None) -> dict:
+    direct_mappings = {
+        "title",
+        "description",
+        "format",
+        "pattern",
+        "enum",
+        "tags",
+        "pii",
+        "minLength",
+        "maxLength",
+        "minimum",
+        "exclusiveMinimum",
+        "maximum",
+        "exclusiveMaximum",
+    }
+
+    field_kwargs = {key: value for key, value in property_schema.items() if key in direct_mappings}
+
+    if is_required is not None:
+        field_kwargs["required"] = is_required
+
+    property_type = determine_type(property_schema)
+    if property_type is not None:
+        field_kwargs["type"] = property_type
+
+    if property_type == "array":
+        nested_item_type, nested_items = determine_nested_item_type(property_schema)
+
+        if nested_items is not None:
+            field_kwargs["items"] = schema_to_args(nested_item_type)
+
+    nested_properties = property_schema.get("properties")
+    if nested_properties is not None:
+        # recursive call for complex nested properties
+        field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
+
+    return field_kwargs
+
+
+def determine_nested_item_type(property_schema):
+    nested_items = property_schema.get("items")
+    nested_items_is_list = isinstance(nested_items, list)
+    if nested_items_is_list and len(nested_items) != 1:
+        raise DataContractException(
+            type="schema",
+            name="Parse json schema",
+            reason=f"Union types for arrays are currently not supported ({nested_items})",
+            engine="datacontract",
+        )
+    if nested_items_is_list and len(nested_items) == 1:
+        nested_item_type = nested_items[0]
+    elif not nested_items_is_list and nested_items is not None:
+        nested_item_type = nested_items
+    return nested_item_type, nested_items
+
+
+def determine_type(property_schema):
+    property_type = property_schema.get("type")
+    type_is_list = isinstance(property_type, list)
+    if type_is_list:
+        non_null_types = [t for t in property_type if t != "null"]
+        if non_null_types:
+            property_type = non_null_types[0]
+        else:
+            property_type = None
+    return property_type
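For illustration, a small sketch of how the new helpers handle required and nullable properties, assuming jsonschema_to_args is importable from datacontract.imports.jsonschema_importer; the expected output is inferred from the code above:

from datacontract.imports.jsonschema_importer import jsonschema_to_args

properties = {
    "id": {"type": "string", "minLength": 1},
    "amount": {"type": ["number", "null"], "description": "order total"},
}
args = jsonschema_to_args(properties, required_properties=["id"])
# expected roughly:
#   {"id": {"minLength": 1, "required": True, "type": "string"},
#    "amount": {"description": "order total", "required": False, "type": "number"}}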
datacontract/imports/spark_importer.py
ADDED
@@ -0,0 +1,134 @@
+from pyspark.sql import DataFrame, SparkSession, types
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+class SparkImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        """
+        Imports data from a Spark source into the data contract specification.
+
+        Args:
+            data_contract_specification: The data contract specification object.
+            source: The source string indicating the Spark tables to read.
+            import_args: Additional arguments for the import process.
+
+        Returns:
+            dict: The updated data contract specification.
+        """
+        return import_spark(data_contract_specification, source)
+
+
+def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Reads Spark tables and updates the data contract specification with their schemas.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: A comma-separated string of Spark temporary views to read.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    data_contract_specification.servers["local"] = Server(type="dataframe")
+    for temp_view in source.split(","):
+        temp_view = temp_view.strip()
+        df = spark.read.table(temp_view)
+        data_contract_specification.models[temp_view] = import_from_spark_df(df)
+    return data_contract_specification
+
+
+def import_from_spark_df(df: DataFrame) -> Model:
+    """
+    Converts a Spark DataFrame into a Model.
+
+    Args:
+        df: The Spark DataFrame to convert.
+
+    Returns:
+        Model: The generated data contract model.
+    """
+    model = Model()
+    schema = df.schema
+
+    for field in schema:
+        model.fields[field.name] = _field_from_spark(field)
+
+    return model
+
+
+def _field_from_spark(spark_field: types.StructField) -> Field:
+    """
+    Converts a Spark StructField into a Field object for the data contract.
+
+    Args:
+        spark_field: The Spark StructField to convert.
+
+    Returns:
+        Field: The corresponding Field object.
+    """
+    field_type = _data_type_from_spark(spark_field.dataType)
+    field = Field()
+    field.type = field_type
+    field.required = not spark_field.nullable
+
+    if field_type == "array":
+        field.items = _field_from_spark(spark_field.dataType.elementType)
+
+    if field_type == "struct":
+        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+    return field
+
+
+def _data_type_from_spark(spark_type: types.DataType) -> str:
+    """
+    Maps Spark data types to the Data Contract type system.
+
+    Args:
+        spark_type: The Spark data type to map.
+
+    Returns:
+        str: The corresponding Data Contract type.
+    """
+    if isinstance(spark_type, types.StringType):
+        return "string"
+    elif isinstance(spark_type, types.IntegerType):
+        return "integer"
+    elif isinstance(spark_type, types.LongType):
+        return "long"
+    elif isinstance(spark_type, types.FloatType):
+        return "float"
+    elif isinstance(spark_type, types.DoubleType):
+        return "double"
+    elif isinstance(spark_type, types.StructType):
+        return "struct"
+    elif isinstance(spark_type, types.ArrayType):
+        return "array"
+    elif isinstance(spark_type, types.TimestampType):
+        return "timestamp"
+    elif isinstance(spark_type, types.TimestampNTZType):
+        return "timestamp_ntz"
+    elif isinstance(spark_type, types.DateType):
+        return "date"
+    elif isinstance(spark_type, types.BooleanType):
+        return "boolean"
+    elif isinstance(spark_type, types.BinaryType):
+        return "bytes"
+    elif isinstance(spark_type, types.DecimalType):
+        return "decimal"
+    elif isinstance(spark_type, types.NullType):
+        return "null"
+    else:
+        raise ValueError(f"Unsupported Spark type: {spark_type}")
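A hedged usage sketch for the new Spark importer through its Python function (the CLI wiring is not part of this hunk); it assumes an active SparkSession and that DataContractSpecification can be constructed with defaults:

from pyspark.sql import SparkSession
from datacontract.imports.spark_importer import import_spark
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "alice")], ["id", "name"]).createOrReplaceTempView("users")

spec = import_spark(DataContractSpecification(), "users")
print(spec.models["users"].fields.keys())  # expected: dict_keys(['id', 'name'])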
datacontract/imports/sql_importer.py
CHANGED
@@ -64,6 +64,10 @@ def map_type_from_sql(sql_type: str):
         return "integer"
     elif sql_type_normed.startswith("float"):
         return "float"
+    elif sql_type_normed.startswith("decimal"):
+        return "decimal"
+    elif sql_type_normed.startswith("numeric"):
+        return "numeric"
     elif sql_type_normed.startswith("bool"):
         return "boolean"
     elif sql_type_normed.startswith("timestamp"):
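With the two new branches, DECIMAL and NUMERIC column types map to dedicated contract types instead of falling through. A standalone sketch of the prefix-matching idea (a stand-in, not the package's code):

def map_sql_type_sketch(sql_type: str) -> str:
    # simplified stand-in for map_type_from_sql shown in the hunk
    normed = sql_type.strip().lower()
    for prefix, contract_type in [("decimal", "decimal"), ("numeric", "numeric"),
                                  ("float", "float"), ("bool", "boolean")]:
        if normed.startswith(prefix):
            return contract_type
    return "string"  # placeholder default

print(map_sql_type_sketch("DECIMAL(10,2)"))  # decimal
print(map_sql_type_sketch("NUMERIC"))        # numeric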
datacontract/integration/publish_datamesh_manager.py
CHANGED
@@ -8,18 +8,23 @@ from datacontract.model.run import Run
 def publish_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
-            url
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
-
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
 
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
 
-        if
-            raise Exception(
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
 
-        headers = {"Content-Type": "application/json", "x-api-key":
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
         response = requests.post(url, data=request_body, headers=headers)