datacontract-cli 0.10.8__py3-none-any.whl → 0.10.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/catalog/catalog.py +4 -2
- datacontract/cli.py +36 -18
- datacontract/data_contract.py +13 -53
- datacontract/engines/soda/check_soda_execute.py +10 -2
- datacontract/engines/soda/connections/duckdb.py +32 -12
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +1 -1
- datacontract/export/exporter.py +3 -2
- datacontract/export/exporter_factory.py +132 -39
- datacontract/export/jsonschema_converter.py +7 -7
- datacontract/export/sodacl_converter.py +17 -12
- datacontract/export/spark_converter.py +211 -0
- datacontract/export/sql_type_converter.py +28 -0
- datacontract/imports/avro_importer.py +149 -7
- datacontract/imports/bigquery_importer.py +17 -0
- datacontract/imports/dbt_importer.py +117 -0
- datacontract/imports/glue_importer.py +116 -33
- datacontract/imports/importer.py +34 -0
- datacontract/imports/importer_factory.py +90 -0
- datacontract/imports/jsonschema_importer.py +14 -3
- datacontract/imports/odcs_importer.py +8 -0
- datacontract/imports/spark_importer.py +134 -0
- datacontract/imports/sql_importer.py +8 -0
- datacontract/imports/unity_importer.py +23 -9
- datacontract/integration/publish_datamesh_manager.py +10 -5
- datacontract/lint/resolve.py +87 -21
- datacontract/lint/schema.py +24 -4
- datacontract/model/data_contract_specification.py +37 -4
- datacontract/templates/datacontract.html +18 -3
- datacontract/templates/index.html +1 -1
- datacontract/templates/partials/datacontract_information.html +20 -0
- datacontract/templates/partials/datacontract_terms.html +7 -0
- datacontract/templates/partials/definition.html +9 -1
- datacontract/templates/partials/model_field.html +23 -6
- datacontract/templates/partials/server.html +49 -16
- datacontract/templates/style/output.css +42 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/METADATA +310 -122
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/RECORD +42 -36
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/top_level.txt +0 -0
--- a/datacontract/imports/glue_importer.py
+++ b/datacontract/imports/glue_importer.py
@@ -1,6 +1,7 @@
 import boto3
 from typing import List
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
     Model,
@@ -9,7 +10,14 @@ from datacontract.model.data_contract_specification import (
 )
 
 
-def get_glue_database(datebase_name: str):
+class GlueImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
+
+
+def get_glue_database(database_name: str):
     """Get the details Glue database.
 
     Args:
@@ -18,31 +26,32 @@ def get_glue_database(datebase_name: str):
     Returns:
         set: catalogid and locationUri
     """
-
     glue = boto3.client("glue")
     try:
-        response = glue.get_database(Name=datebase_name)
+        response = glue.get_database(Name=database_name)
     except glue.exceptions.EntityNotFoundException:
-        print(f"Database not found {datebase_name}.")
+        print(f"Database not found {database_name}.")
         return (None, None)
     except Exception as e:
         # todo catch all
         print(f"Error: {e}")
         return (None, None)
 
-    return (response["Database"]["CatalogId"], response["Database"].get("LocationUri", "None"))
+    return (
+        response["Database"]["CatalogId"],
+        response["Database"].get("LocationUri", "None"),
+    )
 
 
 def get_glue_tables(database_name: str) -> List[str]:
     """Get the list of tables in a Glue database.
 
     Args:
-        database_name (str):
+        database_name (str): Glue database to request.
 
     Returns:
-        List[str]:
+        List[str]: List of table names
     """
-
     glue = boto3.client("glue")
 
     # Set the paginator
@@ -107,9 +116,21 @@ def get_glue_table_schema(database_name: str, table_name: str):
     return table_schema
 
 
-def import_glue(
-
+def import_glue(
+    data_contract_specification: DataContractSpecification,
+    source: str,
+    table_names: List[str],
+):
+    """Import the schema of a Glue database.
 
+    Args:
+        data_contract_specification (DataContractSpecification): The data contract specification to update.
+        source (str): The name of the Glue database.
+        table_names (List[str]): List of table names to import. If None, all tables in the database are imported.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
     catalogid, location_uri = get_glue_database(source)
 
     # something went wrong
@@ -131,21 +152,18 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
 
     fields = {}
     for column in table_schema:
-        field = Field()
-        field.type = map_type_from_sql(column["Type"])
+        field = create_typed_field(column["Type"])
 
-        # hive
+        # hive partitions are required, but are not primary keys
         if column.get("Hive"):
             field.required = True
 
         field.description = column.get("Comment")
-
        fields[column["Name"]] = field
 
         if "decimal" in column["Type"]:
             # Extract precision and scale from the string
             perc_scale = column["Type"][8:-1].split(",")
-            print(perc_scale)
             field.precision = int(perc_scale[0])
             field.scale = int(perc_scale[1])
 
@@ -157,37 +175,102 @@ def import_glue(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
-def map_type_from_sql(sql_type: str):
+def create_typed_field(dtype: str) -> Field:
+    """Create a typed field based on the given data type.
+
+    Args:
+        dtype (str): The data type of the field.
+
+    Returns:
+        Field: The created field with the appropriate type.
+    """
+    field = Field()
+    dtype = dtype.strip().lower().replace(" ", "")
+    if dtype.startswith(("array", "struct")):
+        orig_dtype: str = dtype
+        if dtype.startswith("array"):
+            field.type = "array"
+            field.items = create_typed_field(orig_dtype[6:-1])
+        elif dtype.startswith("struct"):
+            field.type = "struct"
+            for f in split_struct(orig_dtype[7:-1]):
+                field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
+    else:
+        field.type = map_type_from_sql(dtype)
+    return field
+
+
+def split_fields(s: str):
+    """Split a string of fields considering nested structures.
+
+    Args:
+        s (str): The string to split.
+
+    Yields:
+        str: The next field in the string.
+    """
+    counter: int = 0
+    last: int = 0
+    for i, x in enumerate(s):
+        if x in ("<", "("):
+            counter += 1
+        elif x in (">", ")"):
+            counter -= 1
+        elif x == "," and counter == 0:
+            yield s[last:i]
+            last = i + 1
+    yield s[last:]
+
+
+def split_struct(s: str) -> List[str]:
+    """Split a struct string into individual fields.
+
+    Args:
+        s (str): The struct string to split.
+
+    Returns:
+        List[str]: List of individual fields in the struct.
+    """
+    return list(split_fields(s=s))
+
+
+def map_type_from_sql(sql_type: str) -> str:
+    """Map an SQL type to a corresponding field type.
+
+    Args:
+        sql_type (str): The SQL type to map.
+
+    Returns:
+        str: The corresponding field type.
+    """
     if sql_type is None:
         return None
 
-    if sql_type.lower().startswith("varchar"):
+    sql_type = sql_type.lower()
+    if sql_type.startswith("varchar"):
         return "varchar"
-    if sql_type.lower().startswith("string"):
+    if sql_type.startswith("string"):
         return "string"
-    if sql_type.lower().startswith("text"):
+    if sql_type.startswith("text"):
         return "text"
-    elif sql_type.lower().startswith("byte"):
+    if sql_type.startswith("byte"):
         return "byte"
-    elif sql_type.lower().startswith("short"):
+    if sql_type.startswith("short"):
         return "short"
-    elif sql_type.lower().startswith("int"):
+    if sql_type.startswith("integer") or sql_type.startswith("int"):
         return "integer"
-    elif sql_type.lower().startswith("long"):
+    if sql_type.startswith("long") or sql_type.startswith("bigint"):
         return "long"
-    elif sql_type.lower().startswith("bigint"):
-        return "long"
-    elif sql_type.lower().startswith("float"):
+    if sql_type.startswith("float"):
         return "float"
-    elif sql_type.lower().startswith("double"):
+    if sql_type.startswith("double"):
         return "double"
-    elif sql_type.lower().startswith("boolean"):
+    if sql_type.startswith("boolean"):
         return "boolean"
-    elif sql_type.lower().startswith("timestamp"):
+    if sql_type.startswith("timestamp"):
         return "timestamp"
-    elif sql_type.lower().startswith("date"):
+    if sql_type.startswith("date"):
         return "date"
-    elif sql_type.lower().startswith("decimal"):
+    if sql_type.startswith("decimal"):
         return "decimal"
-    else:
-        return "variant"
+    return "variant"
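The new create_typed_field walks nested Hive type strings recursively, and split_fields keeps track of bracket depth so commas inside array<...> and struct<...> don't split a field in two. A quick illustration (the input type string is made up; assumes the 0.10.10 wheel is installed):

from datacontract.imports.glue_importer import create_typed_field

# decompose a nested Hive type string into a typed Field tree
field = create_typed_field("struct<id:bigint,tags:array<string>>")
print(field.type)                       # struct
print(field.fields["id"].type)          # long (bigint maps to long)
print(field.fields["tags"].type)        # array
print(field.fields["tags"].items.type)  # string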
--- /dev/null
+++ b/datacontract/imports/importer.py
@@ -0,0 +1,34 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+
+from datacontract.model.data_contract_specification import DataContractSpecification
+
+
+class Importer(ABC):
+    def __init__(self, import_format) -> None:
+        self.import_format = import_format
+
+    @abstractmethod
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        pass
+
+
+class ImportFormat(str, Enum):
+    sql = "sql"
+    avro = "avro"
+    dbt = "dbt"
+    glue = "glue"
+    jsonschema = "jsonschema"
+    bigquery = "bigquery"
+    odcs = "odcs"
+    unity = "unity"
+    spark = "spark"
+
+    @classmethod
+    def get_suported_formats(cls):
+        return list(map(lambda c: c.value, cls))
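The Importer ABC above is the extension point this refactor introduces: every format-specific importer now subclasses it and implements import_source. A minimal sketch of a custom subclass (CsvImporter, its format name, and its body are hypothetical, not part of this release):

from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import DataContractSpecification, Model


class CsvImporter(Importer):  # hypothetical example, not shipped in this wheel
    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> dict:
        # a real importer would parse the CSV header at `source` into Field objects
        data_contract_specification.models["example"] = Model(description=f"imported from {source}")
        return data_contract_specification

Registering the class (not an instance) works with the factory below, because create() instantiates it with the format name: importer_factory.register_importer("csv", CsvImporter).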
--- /dev/null
+++ b/datacontract/imports/importer_factory.py
@@ -0,0 +1,90 @@
+import importlib.util
+import sys
+from datacontract.imports.importer import ImportFormat, Importer
+
+
+class ImporterFactory:
+    def __init__(self):
+        self.dict_importer = {}
+        self.dict_lazy_importer = {}
+
+    def register_importer(self, name, importer: Importer):
+        self.dict_importer.update({name: importer})
+
+    def register_lazy_importer(self, name: str, module_path: str, class_name: str):
+        self.dict_lazy_importer.update({name: (module_path, class_name)})
+
+    def create(self, name) -> Importer:
+        importers = self.dict_importer.copy()
+        importers.update(self.dict_lazy_importer.copy())
+        if name not in importers.keys():
+            raise ValueError(f"The '{name}' format is not supported.")
+        importer_class = importers[name]
+        if type(importers[name]) is tuple:
+            importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
+        if not importer_class:
+            raise ValueError(f"Module {name} could not be loaded.")
+        return importer_class(name)
+
+
+def import_module(module_path):
+    if importlib.util.find_spec(module_path) is not None:
+        try:
+            module = importlib.import_module(module_path)
+        except ModuleNotFoundError:
+            return None
+        sys.modules[module_path] = module
+        return module
+
+
+def load_module_class(module_path, class_name):
+    module = import_module(module_path)
+    if not module:
+        return None
+    return getattr(module, class_name)
+
+
+importer_factory = ImporterFactory()
+importer_factory.register_lazy_importer(
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.bigquery,
+    module_path="datacontract.imports.bigquery_importer",
+    class_name="BigQueryImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.jsonschema,
+    module_path="datacontract.imports.jsonschema_importer",
+    class_name="JsonSchemaImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
+)
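The factory above defers imports: register_lazy_importer stores only a (module_path, class_name) tuple, and create() loads the module the first time that format is requested, so optional dependencies (for example pyspark for the Spark importer) are only needed when actually used. A usage sketch (the orders.avsc path is made up; the avro dependencies must be installed):

from datacontract.imports.importer import ImportFormat
from datacontract.imports.importer_factory import importer_factory
from datacontract.model.data_contract_specification import DataContractSpecification

importer = importer_factory.create(ImportFormat.avro)  # avro_importer is imported only now
spec = importer.import_source(DataContractSpecification(), "orders.avsc", import_args={})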
--- a/datacontract/imports/jsonschema_importer.py
+++ b/datacontract/imports/jsonschema_importer.py
@@ -2,10 +2,18 @@ import json
 
 import fastjsonschema
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
 from datacontract.model.exceptions import DataContractException
 
 
+class JsonSchemaImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_jsonschema(data_contract_specification, source)
+
+
 def convert_json_schema_properties(properties, is_definition=False):
     fields = {}
     for field_name, field_schema in properties.items():
@@ -56,11 +64,14 @@ def convert_json_schema_properties(properties, is_definition=False):
             case "tags":
                 field_kwargs["tags"] = value
             case "properties":
-                field_kwargs["fields"] = convert_json_schema_properties(value)
+                field_kwargs["fields"] = convert_json_schema_properties(value, is_definition=is_definition)
             case "items":
-                field_kwargs["items"] = convert_json_schema_properties(value)
+                field_kwargs["items"] = convert_json_schema_properties(value, is_definition=is_definition)
 
-        field = Field(**field_kwargs)
+        if is_definition:
+            field = Definition(**field_kwargs)
+        else:
+            field = Field(**field_kwargs)
         fields[field_name] = field
 
     return fields
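The jsonschema change is a bug fix as much as a refactor: the recursive calls previously dropped is_definition, so anything nested under a definition came back as a Field rather than a Definition. A small check of the now-threaded flag (the properties dict is made up; handling of the "type" keyword happens in the unchanged part of the match statement):

from datacontract.imports.jsonschema_importer import convert_json_schema_properties

props = {"address": {"type": "object", "properties": {"city": {"type": "string"}}}}
fields = convert_json_schema_properties(props, is_definition=True)
print(type(fields["address"]).__name__)  # Definition
# the nested "city" entry is now also built via Definition(**field_kwargs),
# because the recursive calls pass is_definition through (it was dropped before)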
--- a/datacontract/imports/odcs_importer.py
+++ b/datacontract/imports/odcs_importer.py
@@ -2,6 +2,7 @@ import datetime
 import logging
 from typing import Any, Dict, List
 import yaml
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     Availability,
     Contact,
@@ -42,6 +43,13 @@ DATACONTRACT_TYPES = [
 ]
 
 
+class OdcsImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_odcs(data_contract_specification, source)
+
+
 def import_odcs(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
     try:
         with open(source, "r") as file:
--- /dev/null
+++ b/datacontract/imports/spark_importer.py
@@ -0,0 +1,134 @@
+from pyspark.sql import DataFrame, SparkSession, types
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+    Server,
+)
+
+
+class SparkImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
+    ) -> dict:
+        """
+        Imports data from a Spark source into the data contract specification.
+
+        Args:
+            data_contract_specification: The data contract specification object.
+            source: The source string indicating the Spark tables to read.
+            import_args: Additional arguments for the import process.
+
+        Returns:
+            dict: The updated data contract specification.
+        """
+        return import_spark(data_contract_specification, source)
+
+
+def import_spark(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Reads Spark tables and updates the data contract specification with their schemas.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: A comma-separated string of Spark temporary views to read.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    data_contract_specification.servers["local"] = Server(type="dataframe")
+    for temp_view in source.split(","):
+        temp_view = temp_view.strip()
+        df = spark.read.table(temp_view)
+        data_contract_specification.models[temp_view] = import_from_spark_df(df)
+    return data_contract_specification
+
+
+def import_from_spark_df(df: DataFrame) -> Model:
+    """
+    Converts a Spark DataFrame into a Model.
+
+    Args:
+        df: The Spark DataFrame to convert.
+
+    Returns:
+        Model: The generated data contract model.
+    """
+    model = Model()
+    schema = df.schema
+
+    for field in schema:
+        model.fields[field.name] = _field_from_spark(field)
+
+    return model
+
+
+def _field_from_spark(spark_field: types.StructField) -> Field:
+    """
+    Converts a Spark StructField into a Field object for the data contract.
+
+    Args:
+        spark_field: The Spark StructField to convert.
+
+    Returns:
+        Field: The corresponding Field object.
+    """
+    field_type = _data_type_from_spark(spark_field.dataType)
+    field = Field()
+    field.type = field_type
+    field.required = not spark_field.nullable
+
+    if field_type == "array":
+        field.items = _field_from_spark(spark_field.dataType.elementType)
+
+    if field_type == "struct":
+        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+    return field
+
+
+def _data_type_from_spark(spark_type: types.DataType) -> str:
+    """
+    Maps Spark data types to the Data Contract type system.
+
+    Args:
+        spark_type: The Spark data type to map.
+
+    Returns:
+        str: The corresponding Data Contract type.
+    """
+    if isinstance(spark_type, types.StringType):
+        return "string"
+    elif isinstance(spark_type, types.IntegerType):
+        return "integer"
+    elif isinstance(spark_type, types.LongType):
+        return "long"
+    elif isinstance(spark_type, types.FloatType):
+        return "float"
+    elif isinstance(spark_type, types.DoubleType):
+        return "double"
+    elif isinstance(spark_type, types.StructType):
+        return "struct"
+    elif isinstance(spark_type, types.ArrayType):
+        return "array"
+    elif isinstance(spark_type, types.TimestampType):
+        return "timestamp"
+    elif isinstance(spark_type, types.TimestampNTZType):
+        return "timestamp_ntz"
+    elif isinstance(spark_type, types.DateType):
+        return "date"
+    elif isinstance(spark_type, types.BooleanType):
+        return "boolean"
+    elif isinstance(spark_type, types.BinaryType):
+        return "bytes"
+    elif isinstance(spark_type, types.DecimalType):
+        return "decimal"
+    elif isinstance(spark_type, types.NullType):
+        return "null"
+    else:
+        raise ValueError(f"Unsupported Spark type: {spark_type}")
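Usage sketch for the new Spark importer: the source argument is a comma-separated list of temp views in the active SparkSession, and each view becomes a model keyed by its name (the view, schema, and data below are made up):

from pyspark.sql import SparkSession

from datacontract.imports.spark_importer import import_spark
from datacontract.model.data_contract_specification import DataContractSpecification

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "a")], "id long, name string").createOrReplaceTempView("orders")

spec = import_spark(DataContractSpecification(), "orders")
print(spec.models["orders"].fields["id"].type)  # long
print(spec.servers["local"].type)               # dataframe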
--- a/datacontract/imports/sql_importer.py
+++ b/datacontract/imports/sql_importer.py
@@ -1,8 +1,16 @@
 from simple_ddl_parser import parse_from_file
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 
 
+class SqlImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        return import_sql(data_contract_specification, self.import_format, source)
+
+
 def import_sql(data_contract_specification: DataContractSpecification, format: str, source: str):
     ddl = parse_from_file(source, group_by_type=True)
     tables = ddl["tables"]
--- a/datacontract/imports/unity_importer.py
+++ b/datacontract/imports/unity_importer.py
@@ -3,9 +3,24 @@ import requests
 import os
 import typing
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
+
+class UnityImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        if source is not None:
+            data_contract_specification = import_unity_from_json(data_contract_specification, source)
+        else:
+            data_contract_specification = import_unity_from_api(
+                data_contract_specification, import_args.get("unity_table_full_name")
+            )
+        return data_contract_specification
+
+
 def import_unity_from_json(
     data_contract_specification: DataContractSpecification, source: str
 ) -> DataContractSpecification:
@@ -22,23 +37,21 @@ def import_unity_from_json(
     )
     return convert_unity_schema(data_contract_specification, unity_schema)
 
+
 def import_unity_from_api(
-    data_contract_specification: DataContractSpecification,
-    unity_table_full_name: typing.Optional[str] = None
+    data_contract_specification: DataContractSpecification, unity_table_full_name: typing.Optional[str] = None
 ) -> DataContractSpecification:
-    databricks_instance = os.getenv('DATABRICKS_IMPORT_INSTANCE')
-    access_token = os.getenv('DATABRICKS_IMPORT_ACCESS_TOKEN')
+    databricks_instance = os.getenv("DATABRICKS_IMPORT_INSTANCE")
+    access_token = os.getenv("DATABRICKS_IMPORT_ACCESS_TOKEN")
 
     if not databricks_instance or not access_token:
         print("Missing environment variables for Databricks instance or access token.")
         print("Both, $DATABRICKS_IMPORT_INSTANCE and $DATABRICKS_IMPORT_ACCESS_TOKEN must be set.")
         exit(1)  # Exit if variables are not set
 
-    api_url = f'{databricks_instance}/api/2.1/unity-catalog/tables/{unity_table_full_name}'
+    api_url = f"{databricks_instance}/api/2.1/unity-catalog/tables/{unity_table_full_name}"
 
-    headers = {
-        'Authorization': f'Bearer {access_token}'
-    }
+    headers = {"Authorization": f"Bearer {access_token}"}
     response = requests.get(api_url, headers=headers)
 
     if response.status_code != 200:
@@ -46,13 +59,14 @@ def import_unity_from_api(
             type="schema",
             name="Retrieve unity catalog schema",
             reason=f"Failed to retrieve unity catalog schema from databricks instance: {response.status_code} {response.text}",
-            engine="datacontract"
+            engine="datacontract",
         )
 
     convert_unity_schema(data_contract_specification, response.json())
 
     return data_contract_specification
 
+
 def convert_unity_schema(
     data_contract_specification: DataContractSpecification, unity_schema: dict
 ) -> DataContractSpecification:
--- a/datacontract/integration/publish_datamesh_manager.py
+++ b/datacontract/integration/publish_datamesh_manager.py
@@ -8,18 +8,23 @@ from datacontract.model.run import Run
 def publish_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
-            url
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
-
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
 
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
 
-        if
-            raise Exception(
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
 
-        headers = {"Content-Type": "application/json", "x-api-key":
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
         response = requests.post(url, data=request_body, headers=headers)