datacontract-cli 0.10.12__py3-none-any.whl → 0.10.14__py3-none-any.whl
This diff shows the content of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of datacontract-cli was flagged by the registry diff service.
- datacontract/cli.py +5 -0
- datacontract/data_contract.py +9 -1
- datacontract/engines/soda/connections/kafka.py +28 -6
- datacontract/export/avro_converter.py +8 -1
- datacontract/export/avro_idl_converter.py +1 -0
- datacontract/export/bigquery_converter.py +30 -23
- datacontract/export/data_caterer_converter.py +148 -0
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/exporter.py +5 -1
- datacontract/export/exporter_factory.py +19 -1
- datacontract/export/jsonschema_converter.py +13 -2
- datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +4 -4
- datacontract/export/odcs_v3_exporter.py +294 -0
- datacontract/export/sodacl_converter.py +82 -2
- datacontract/export/spark_converter.py +3 -1
- datacontract/export/sql_type_converter.py +56 -21
- datacontract/imports/iceberg_importer.py +162 -0
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_importer.py +25 -168
- datacontract/imports/odcs_v2_importer.py +177 -0
- datacontract/imports/odcs_v3_importer.py +309 -0
- datacontract/imports/spark_importer.py +5 -1
- datacontract/imports/unity_importer.py +105 -84
- datacontract/integration/datamesh_manager.py +1 -1
- datacontract/lint/resolve.py +24 -10
- datacontract/lint/resources.py +21 -0
- datacontract/lint/urls.py +29 -13
- datacontract/model/data_contract_specification.py +72 -8
- datacontract/model/odcs.py +11 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA +106 -52
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/RECORD +36 -29
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/WHEEL +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/top_level.txt +0 -0
datacontract/imports/odcs_v3_importer.py ADDED
@@ -0,0 +1,309 @@
+import datetime
+import logging
+from typing import Any, Dict, List
+
+import yaml
+
+from datacontract.imports.importer import Importer
+from datacontract.lint.resources import read_resource
+from datacontract.model.data_contract_specification import (
+    Availability,
+    DataContractSpecification,
+    Info,
+    Model,
+    Field,
+    Retention,
+    Server,
+    ServiceLevel,
+    Terms,
+    DATACONTRACT_TYPES,
+)
+from datacontract.model.exceptions import DataContractException
+
+logger = logging.getLogger(__name__)
+
+
+class OdcsImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_odcs_v3(data_contract_specification, source)
+
+
+def import_odcs_v3(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    source_str = read_resource(source)
+    return import_odcs_v3_from_str(data_contract_specification, source_str)
+
+
+def import_odcs_v3_from_str(
+    data_contract_specification: DataContractSpecification, source_str: str
+) -> DataContractSpecification:
+    try:
+        odcs_contract = yaml.safe_load(source_str)
+    except Exception as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse ODCS contract",
+            reason=f"Failed to parse odcs contract from {source_str}",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    data_contract_specification.id = odcs_contract["id"]
+    data_contract_specification.info = import_info(odcs_contract)
+    data_contract_specification.servers = import_servers(odcs_contract)
+    data_contract_specification.terms = import_terms(odcs_contract)
+    data_contract_specification.servicelevels = import_servicelevels(odcs_contract)
+    data_contract_specification.models = import_models(odcs_contract)
+    data_contract_specification.tags = import_tags(odcs_contract)
+
+    return data_contract_specification
+
+
+def import_info(odcs_contract: Dict[str, Any]) -> Info:
+    info = Info()
+
+    info.title = odcs_contract.get("name") if odcs_contract.get("name") is not None else ""
+
+    if odcs_contract.get("version") is not None:
+        info.version = odcs_contract.get("version")
+
+    # odcs.description.purpose => datacontract.description
+    if odcs_contract.get("description") is not None and odcs_contract.get("description").get("purpose") is not None:
+        info.description = odcs_contract.get("description").get("purpose")
+
+    # odcs.domain => datacontract.owner
+    if odcs_contract.get("domain") is not None:
+        info.owner = odcs_contract.get("domain")
+
+    # add dataProduct as custom property
+    if odcs_contract.get("dataProduct") is not None:
+        info.dataProduct = odcs_contract.get("dataProduct")
+
+    # add tenant as custom property
+    if odcs_contract.get("tenant") is not None:
+        info.tenant = odcs_contract.get("tenant")
+
+    return info
+
+
+def import_servers(odcs_contract: Dict[str, Any]) -> Dict[str, Server] | None:
+    if odcs_contract.get("servers") is None:
+        return None
+    servers = {}
+    for odcs_server in odcs_contract.get("servers"):
+        server_name = odcs_server.get("server")
+        if server_name is None:
+            logger.warning("Server name is missing, skipping server")
+            continue
+
+        server = Server()
+        server.type = odcs_server.get("type")
+        server.description = odcs_server.get("description")
+        server.environment = odcs_server.get("environment")
+        server.format = odcs_server.get("format")
+        server.project = odcs_server.get("project")
+        server.dataset = odcs_server.get("dataset")
+        server.path = odcs_server.get("path")
+        server.delimiter = odcs_server.get("delimiter")
+        server.endpointUrl = odcs_server.get("endpointUrl")
+        server.location = odcs_server.get("location")
+        server.account = odcs_server.get("account")
+        server.database = odcs_server.get("database")
+        server.schema_ = odcs_server.get("schema")
+        server.host = odcs_server.get("host")
+        server.port = odcs_server.get("port")
+        server.catalog = odcs_server.get("catalog")
+        server.topic = odcs_server.get("topic")
+        server.http_path = odcs_server.get("http_path")
+        server.token = odcs_server.get("token")
+        server.dataProductId = odcs_server.get("dataProductId")
+        server.outputPortId = odcs_server.get("outputPortId")
+        server.driver = odcs_server.get("driver")
+        server.roles = odcs_server.get("roles")
+
+        servers[server_name] = server
+    return servers
+
+
+def import_terms(odcs_contract: Dict[str, Any]) -> Terms | None:
+    if odcs_contract.get("description") is None:
+        return None
+    if (
+        odcs_contract.get("description").get("usage") is not None
+        or odcs_contract.get("description").get("limitations") is not None
+        or odcs_contract.get("price") is not None
+    ):
+        terms = Terms()
+        if odcs_contract.get("description").get("usage") is not None:
+            terms.usage = odcs_contract.get("description").get("usage")
+        if odcs_contract.get("description").get("limitations") is not None:
+            terms.limitations = odcs_contract.get("description").get("limitations")
+        if odcs_contract.get("price") is not None:
+            terms.billing = f"{odcs_contract.get('price').get('priceAmount')} {odcs_contract.get('price').get('priceCurrency')} / {odcs_contract.get('price').get('priceUnit')}"
+
+        return terms
+    else:
+        return None
+
+
+def import_servicelevels(odcs_contract: Dict[str, Any]) -> ServiceLevel | None:
+    # find the two properties we can map (based on the examples)
+    sla_properties = odcs_contract.get("slaProperties") if odcs_contract.get("slaProperties") is not None else []
+    availability = next((p for p in sla_properties if p["property"] == "generalAvailability"), None)
+    retention = next((p for p in sla_properties if p["property"] == "retention"), None)
+
+    if availability is not None or retention is not None:
+        servicelevel = ServiceLevel()
+
+        if availability is not None:
+            value = availability.get("value")
+            if isinstance(value, datetime.datetime):
+                value = value.isoformat()
+            servicelevel.availability = Availability(description=value)
+
+        if retention is not None:
+            servicelevel.retention = Retention(period=f"{retention.get('value')}{retention.get('unit')}")
+
+        return servicelevel
+    else:
+        return None
+
+
+def get_server_type(odcs_contract: Dict[str, Any]) -> str | None:
+    servers = import_servers(odcs_contract)
+    if servers is None or len(servers) == 0:
+        return None
+    # get first server from map
+    server = next(iter(servers.values()))
+    return server.type
+
+
+def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
+    custom_type_mappings = get_custom_type_mappings(odcs_contract.get("customProperties"))
+
+    odcs_schemas = odcs_contract.get("schema") if odcs_contract.get("schema") is not None else []
+    result = {}
+
+    for odcs_schema in odcs_schemas:
+        schema_name = odcs_schema.get("name")
+        schema_physical_name = odcs_schema.get("physicalName")
+        schema_description = odcs_schema.get("description") if odcs_schema.get("description") is not None else ""
+        model_name = schema_physical_name if schema_physical_name is not None else schema_name
+        model = Model(description=" ".join(schema_description.splitlines()), type="table")
+        model.fields = import_fields(
+            odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
+        )
+        model.title = schema_name
+        if odcs_schema.get("dataGranularityDescription") is not None:
+            model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
+        result[model_name] = model
+
+    return result
+
+
+def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict[str, Any]:
+    config = {}
+    if odcs_property.get("criticalDataElement") is not None:
+        config["criticalDataElement"] = odcs_property.get("criticalDataElement")
+    if odcs_property.get("encryptedName") is not None:
+        config["encryptedName"] = odcs_property.get("encryptedName")
+    if odcs_property.get("partitionKeyPosition") is not None:
+        config["partitionKeyPosition"] = odcs_property.get("partitionKeyPosition")
+    if odcs_property.get("partitioned") is not None:
+        config["partitioned"] = odcs_property.get("partitioned")
+
+    if odcs_property.get("customProperties") is not None and isinstance(odcs_property.get("customProperties"), list):
+        for item in odcs_property.get("customProperties"):
+            config[item["property"]] = item["value"]
+
+    physical_type = odcs_property.get("physicalType")
+    if physical_type is not None:
+        if server_type == "postgres" or server_type == "postgresql":
+            config["postgresType"] = physical_type
+        elif server_type == "bigquery":
+            config["bigqueryType"] = physical_type
+        elif server_type == "snowflake":
+            config["snowflakeType"] = physical_type
+        elif server_type == "redshift":
+            config["redshiftType"] = physical_type
+        elif server_type == "sqlserver":
+            config["sqlserverType"] = physical_type
+        elif server_type == "databricks":
+            config["databricksType"] = physical_type
+        else:
+            config["physicalType"] = physical_type
+
+    return config
+
+
+def has_composite_primary_key(odcs_properties) -> bool:
+    primary_keys = [prop for prop in odcs_properties if prop.get("primaryKey") is not None and prop.get("primaryKey")]
+    return len(primary_keys) > 1
+
+
+def import_fields(
+    odcs_properties: Dict[str, Any], custom_type_mappings: Dict[str, str], server_type
+) -> Dict[str, Field]:
+    result = {}
+
+    if odcs_properties is None:
+        return result
+
+    for odcs_property in odcs_properties:
+        mapped_type = map_type(odcs_property.get("logicalType"), custom_type_mappings)
+        if mapped_type is not None:
+            property_name = odcs_property["name"]
+            description = odcs_property.get("description") if odcs_property.get("description") is not None else None
+            field = Field(
+                description=" ".join(description.splitlines()) if description is not None else None,
+                type=mapped_type,
+                title=odcs_property.get("businessName"),
+                required=not odcs_property.get("nullable") if odcs_property.get("nullable") is not None else False,
+                primary=odcs_property.get("primaryKey")
+                if not has_composite_primary_key(odcs_properties) and odcs_property.get("primaryKey") is not None
+                else False,
+                unique=odcs_property.get("unique"),
+                examples=odcs_property.get("examples") if odcs_property.get("examples") is not None else None,
+                classification=odcs_property.get("classification")
+                if odcs_property.get("classification") is not None
+                else "",
+                tags=odcs_property.get("tags") if odcs_property.get("tags") is not None else None,
+                quality=odcs_property.get("quality") if odcs_property.get("quality") is not None else [],
+                config=import_field_config(odcs_property, server_type),
+            )
+            result[property_name] = field
+        else:
+            logger.info(
+                f"Can't map {odcs_property.get('name')} to the Datacontract Mapping types, as there is no "
+                f"equivalent or special mapping. Consider introducing a customProperty "
+                f"'dc_mapping_{odcs_property.get('logicalType')}' that defines your expected type as the 'value'"
+            )
+
+    return result
+
+
+def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
+    if odcs_type is None:
+        return None
+    t = odcs_type.lower()
+    if t in DATACONTRACT_TYPES:
+        return t
+    elif custom_mappings.get(t) is not None:
+        return custom_mappings.get(t)
+    else:
+        return None
+
+
+def get_custom_type_mappings(odcs_custom_properties: List[Any]) -> Dict[str, str]:
+    result = {}
+    if odcs_custom_properties is not None:
+        for prop in odcs_custom_properties:
+            if prop["property"].startswith("dc_mapping_"):
+                odcs_type_name = prop["property"][len("dc_mapping_"):]  # strip the "dc_mapping_" prefix
+                datacontract_type = prop["value"]
+                result[odcs_type_name] = datacontract_type
+
+    return result
+
+
+def import_tags(odcs_contract) -> List[str] | None:
+    if odcs_contract.get("tags") is None:
+        return None
+    return odcs_contract.get("tags")
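The new importer maps an ODCS v3 YAML document onto a DataContractSpecification, including the dc_mapping_* custom-property escape hatch for non-standard logical types. A minimal sketch of that mechanism follows; the contract below is invented for illustration, and only the two imported names come from this release:

from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
from datacontract.model.data_contract_specification import DataContractSpecification

# Invented ODCS v3 document: the customProperty "dc_mapping_money" instructs
# get_custom_type_mappings()/map_type() to translate the non-standard
# logicalType "money" into the Data Contract type "decimal".
odcs_yaml = """
id: orders-contract
name: Orders
version: 1.0.0
domain: checkout
customProperties:
  - property: dc_mapping_money
    value: decimal
schema:
  - name: orders
    physicalName: orders_v1
    properties:
      - name: order_id
        logicalType: string
        primaryKey: true
      - name: order_total
        logicalType: money
        nullable: true
"""

spec = import_odcs_v3_from_str(DataContractSpecification(dataContractSpecification="0.9.3"), odcs_yaml)
print(spec.models["orders_v1"].fields["order_total"].type)  # decimal
print(spec.models["orders_v1"].fields["order_id"].primary)  # True

Because the schema entry carries a physicalName, the model is keyed by orders_v1 rather than orders, matching the fallback logic in import_models() above.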
datacontract/imports/spark_importer.py CHANGED
@@ -80,6 +80,8 @@ def _field_from_struct_type(spark_field: types.StructField) -> Field:
     """
     field = Field()
     field.required = not spark_field.nullable
+    field.description = spark_field.metadata.get("comment")
+
     return _type_from_data_type(field, spark_field.dataType)
 
 
@@ -121,7 +123,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
     """
     if isinstance(spark_type, types.StringType):
         return "string"
-    elif isinstance(spark_type, types.IntegerType):
+    elif isinstance(spark_type, (types.IntegerType, types.ShortType)):
         return "integer"
     elif isinstance(spark_type, types.LongType):
         return "long"
@@ -149,5 +151,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "decimal"
     elif isinstance(spark_type, types.NullType):
         return "null"
+    elif isinstance(spark_type, types.VarcharType):
+        return "varchar"
     else:
         raise ValueError(f"Unsupported Spark type: {spark_type}")
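Two behavioral changes hide in these hunks: ShortType columns now import as "integer" (and VarcharType as "varchar"), and a column comment stored in StructField metadata becomes the field description. A sketch of the effect, assuming a local PySpark installation; _field_from_struct_type is the private helper changed above, so this is illustrative rather than a supported API:

from pyspark.sql import types
from datacontract.imports.spark_importer import _field_from_struct_type

# A Spark column carrying a comment in its metadata (VarcharType needs pyspark >= 3.4).
age = types.StructField("age", types.ShortType(), nullable=True, metadata={"comment": "age in years"})
code = types.StructField("code", types.VarcharType(8), nullable=True)

age_field = _field_from_struct_type(age)
assert age_field.type == "integer"              # ShortType now maps to integer
assert age_field.description == "age in years"  # metadata comment -> description
assert age_field.required is False              # nullable -> not required

assert _field_from_struct_type(code).type == "varchar"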
datacontract/imports/unity_importer.py CHANGED
@@ -1,17 +1,37 @@
 import json
-import requests
 import os
-import
+from typing import List, Optional
+
+from pyspark.sql import types
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.catalog import TableInfo, ColumnInfo
 
 from datacontract.imports.importer import Importer
+from datacontract.imports.spark_importer import _field_from_struct_type
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
 
 class UnityImporter(Importer):
+    """
+    UnityImporter class for importing data contract specifications from Unity Catalog.
+    """
+
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> DataContractSpecification:
+        """
+        Import data contract specification from a source.
+
+        :param data_contract_specification: The data contract specification to be imported.
+        :type data_contract_specification: DataContractSpecification
+        :param source: The source from which to import the data contract specification.
+        :type source: str
+        :param import_args: Additional arguments for the import process.
+        :type import_args: dict
+        :return: The imported data contract specification.
+        :rtype: DataContractSpecification
+        """
         if source is not None:
             data_contract_specification = import_unity_from_json(data_contract_specification, source)
         else:
@@ -24,9 +44,21 @@ class UnityImporter(Importer):
 def import_unity_from_json(
     data_contract_specification: DataContractSpecification, source: str
 ) -> DataContractSpecification:
+    """
+    Import data contract specification from a JSON file.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param source: The path to the JSON file.
+    :type source: str
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error parsing the JSON file.
+    """
     try:
         with open(source, "r") as file:
- …
+            json_contents = json.loads(file.read())
+            unity_schema = TableInfo.from_dict(json_contents)
     except json.JSONDecodeError as e:
         raise DataContractException(
             type="schema",
@@ -39,114 +71,103 @@ def import_unity_from_json(
 
 
 def import_unity_from_api(
-    data_contract_specification: DataContractSpecification, unity_table_full_name:
+    data_contract_specification: DataContractSpecification, unity_table_full_name: Optional[str] = None
 ) -> DataContractSpecification:
- … (content of 14 removed lines not shown in the source diff)
+    """
+    Import data contract specification from Unity Catalog API.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_table_full_name: The full name of the Unity table.
+    :type unity_table_full_name: Optional[str]
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error retrieving the schema from the API.
+    """
+    try:
+        workspace_client = WorkspaceClient()
+        unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+    except Exception as e:
         raise DataContractException(
             type="schema",
             name="Retrieve unity catalog schema",
-            reason=f"Failed to retrieve unity catalog schema from databricks
+            reason=f"Failed to retrieve unity catalog schema from databricks profile: {os.getenv('DATABRICKS_CONFIG_PROFILE')}",
             engine="datacontract",
+            original_exception=e,
         )
 
-    convert_unity_schema(data_contract_specification,
+    convert_unity_schema(data_contract_specification, unity_schema)
 
     return data_contract_specification
 
 
 def convert_unity_schema(
-    data_contract_specification: DataContractSpecification, unity_schema:
+    data_contract_specification: DataContractSpecification, unity_schema: TableInfo
 ) -> DataContractSpecification:
+    """
+    Convert Unity schema to data contract specification.
+
+    :param data_contract_specification: The data contract specification to be converted.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_schema: The Unity schema to be converted.
+    :type unity_schema: TableInfo
+    :return: The converted data contract specification.
+    :rtype: DataContractSpecification
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
-    fields = import_table_fields(unity_schema.
+    fields = import_table_fields(unity_schema.columns)
 
-    table_id = unity_schema.
+    table_id = unity_schema.name or unity_schema.table_id
 
     data_contract_specification.models[table_id] = Model(fields=fields, type="table")
 
-    if unity_schema.
-        data_contract_specification.models[table_id].title = unity_schema.
+    if unity_schema.name:
+        data_contract_specification.models[table_id].title = unity_schema.name
+
+    if unity_schema.comment:
+        data_contract_specification.models[table_id].description = unity_schema.comment
 
     return data_contract_specification
 
 
-def import_table_fields(
+def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
+    """
+    Import table fields from Unity schema columns.
+
+    Here we are first converting the `ColumnInfo.type_json` to a Spark StructField object
+    so we can leave the complexity of the Spark field types to the Spark JSON schema parser,
+    then re-use the logic in `datacontract.imports.spark_importer` to convert the StructField
+    into a Field object.
+
+    :param columns: The list of Unity schema columns.
+    :type columns: List[ColumnInfo]
+    :return: A dictionary of imported fields.
+    :rtype: dict[str, Field]
+    """
     imported_fields = {}
-    for field in table_fields:
-        field_name = field.get("name")
-        imported_fields[field_name] = Field()
-        imported_fields[field_name].required = field.get("nullable") == "false"
-        imported_fields[field_name].description = field.get("comment")
-
-        # databricks api 2.1 specifies that type_name can be any of:
-        # BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DATE | TIMESTAMP | TIMESTAMP_NTZ | STRING
-        # | BINARY | DECIMAL | INTERVAL | ARRAY | STRUCT | MAP | CHAR | NULL | USER_DEFINED_TYPE | TABLE_TYPE
-        if field.get("type_name") in ["INTERVAL", "ARRAY", "STRUCT", "MAP", "USER_DEFINED_TYPE", "TABLE_TYPE"]:
-            # complex types are not supported, yet
-            raise DataContractException(
-                type="schema",
-                result="failed",
-                name="Map unity type to data contract type",
-                reason=f"type ${field.get('type_name')} is not supported yet for unity import",
-                engine="datacontract",
-            )
 
- …
+    for column in columns:
+        struct_field: types.StructField = _type_json_to_spark_field(column.type_json)
+        imported_fields[column.name] = _field_from_struct_type(struct_field)
 
     return imported_fields
 
 
-def
- … (content of 15 removed lines not shown in the source diff)
-        return "date"
-    elif type_str == "TIMESTAMP":
-        return "timestamp"
-    elif type_str == "TIMESTAMP_NTZ":
-        return "timestamp_ntz"
-    elif type_str == "STRING":
-        return "string"
-    elif type_str == "BINARY":
-        return "bytes"
-    elif type_str == "DECIMAL":
-        return "decimal"
-    elif type_str == "CHAR":
-        return "varchar"
-    elif type_str == "NULL":
-        return "null"
-    else:
-        raise DataContractException(
-            type="schema",
-            result="failed",
-            name="Map unity type to data contract type",
-            reason=f"Unsupported type {type_str} in unity json definition.",
-            engine="datacontract",
-        )
+def _type_json_to_spark_field(type_json: str) -> types.StructField:
+    """
+    Parses a JSON string representing a Spark field and returns a StructField object.
+
+    The reason we do this is to leverage the Spark JSON schema parser to handle the
+    complexity of the Spark field types. The field `type_json` in the Unity API is
+    the output of a `StructField.jsonValue()` call.
+
+    :param type_json: The JSON string representing the Spark field.
+    :type type_json: str
+
+    :return: The StructField object.
+    :rtype: types.StructField
+    """
+    type_dict = json.loads(type_json)
+    return types.StructField.fromJson(type_dict)
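The _type_json_to_spark_field helper leans on the fact that Unity's ColumnInfo.type_json holds exactly what StructField.jsonValue() emits. A round-trip sketch, assuming PySpark is installed (the column is made up):

import json
from pyspark.sql import types

original = types.StructField("order_id", types.LongType(), nullable=False)
type_json = json.dumps(original.jsonValue())  # what Unity stores in ColumnInfo.type_json

# This is the same parse the new helper performs.
restored = types.StructField.fromJson(json.loads(type_json))
assert restored == original

This is also why the hand-written type_name mapping (and its rejection of INTERVAL/ARRAY/STRUCT/MAP columns) could be deleted: complex types now flow through Spark's own schema parser and the shared spark_importer logic.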
datacontract/integration/datamesh_manager.py CHANGED
@@ -23,7 +23,7 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
     )
 
     if run.dataContractId is None:
-        raise Exception("Cannot publish run results
+        raise Exception("Cannot publish run results for unknown data contract ID")
 
     headers = {"Content-Type": "application/json", "x-api-key": api_key}
     request_body = run.model_dump_json()
datacontract/lint/resolve.py CHANGED
@@ -5,11 +5,13 @@ import fastjsonschema
 import yaml
 from fastjsonschema import JsonSchemaValueException
 
-from datacontract.
+from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
+from datacontract.lint.resources import read_resource
 from datacontract.lint.schema import fetch_schema
 from datacontract.lint.urls import fetch_resource
 from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Quality
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.odcs import is_open_data_contract_standard
 
 
 def resolve_data_contract(
@@ -41,10 +43,7 @@ def resolve_data_contract(
 def resolve_data_contract_from_location(
     location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
 ) -> DataContractSpecification:
- …
-        data_contract_str = fetch_resource(location)
-    else:
-        data_contract_str = read_file(location)
+    data_contract_str = read_resource(location)
     return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality)
 
 
@@ -114,7 +113,16 @@ def _resolve_definition_ref(ref, spec) -> Definition:
 
 def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification):
     path_elements = definition_path.split("/")
- …
+    definition_key = path_elements[2]
+    if definition_key not in spec.definitions:
+        raise DataContractException(
+            type="lint",
+            result="failed",
+            name="Check that data contract YAML is valid",
+            reason=f"Cannot resolve definition {definition_key}",
+            engine="datacontract",
+        )
+    definition = spec.definitions[definition_key]
     definition = _find_subfield_in_definition(definition, path_elements[3:])
     return definition
 
@@ -187,10 +195,16 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
 def _resolve_data_contract_from_str(
     data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
 ) -> DataContractSpecification:
- … (content of 2 removed lines not shown in the source diff)
+    yaml_dict = _to_yaml(data_contract_str)
+
+    if is_open_data_contract_standard(yaml_dict):
+        # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
+        data_contract_specification = DataContractSpecification(dataContractSpecification="0.9.3")
+        return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str)
 
- …
+    _validate_data_contract_specification_schema(yaml_dict, schema_location)
+    data_contract_specification = yaml_dict
+    spec = DataContractSpecification(**data_contract_specification)
 
     if inline_definitions:
         inline_definitions_into_data_contract(spec)
@@ -215,7 +229,7 @@ def _to_yaml(data_contract_str):
     )
 
 
-def
+def _validate_data_contract_specification_schema(data_contract_yaml, schema_location: str = None):
     schema = fetch_schema(schema_location)
     try:
         fastjsonschema.validate(schema, data_contract_yaml)