datacontract-cli 0.10.34__py3-none-any.whl → 0.10.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/api.py +10 -3
- datacontract/cli.py +5 -3
- datacontract/data_contract.py +18 -51
- datacontract/engines/data_contract_checks.py +280 -19
- datacontract/engines/fastjsonschema/check_jsonschema.py +29 -19
- datacontract/export/dbt_converter.py +30 -4
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/excel_exporter.py +3 -3
- datacontract/export/exporter.py +1 -0
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/markdown_converter.py +35 -16
- datacontract/export/mermaid_exporter.py +24 -11
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/spark_converter.py +28 -3
- datacontract/export/sql_type_converter.py +6 -4
- datacontract/imports/odcs_v3_importer.py +100 -19
- datacontract/imports/unity_importer.py +16 -11
- datacontract/init/init_template.py +1 -1
- datacontract/lint/resolve.py +1 -1
- datacontract/lint/schema.py +1 -1
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract_odcs.html +60 -41
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/METADATA +68 -56
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/RECORD +32 -35
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/__init__.py +0 -0
- datacontract/lint/linters/description_linter.py +0 -33
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -47
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/licenses/LICENSE +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/top_level.txt +0 -0
|
@@ -138,8 +138,11 @@ def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
|
|
|
138
138
|
server.driver = getattr(odcs_server, "driver", None)
|
|
139
139
|
server.roles = import_server_roles(odcs_server.roles)
|
|
140
140
|
server.storageAccount = (
|
|
141
|
-
|
|
141
|
+
to_azure_storage_account(odcs_server.location)
|
|
142
|
+
if server.type == "azure" and "://" in server.location
|
|
143
|
+
else None
|
|
142
144
|
)
|
|
145
|
+
|
|
143
146
|
servers[server_name] = server
|
|
144
147
|
return servers
|
|
145
148
|
|
|
@@ -204,7 +207,11 @@ def import_models(odcs: Any) -> Dict[str, Model]:
|
|
|
204
207
|
schema_physical_name = odcs_schema.physicalName
|
|
205
208
|
schema_description = odcs_schema.description if odcs_schema.description is not None else ""
|
|
206
209
|
model_name = schema_physical_name if schema_physical_name is not None else schema_name
|
|
207
|
-
model = Model(
|
|
210
|
+
model = Model(
|
|
211
|
+
description=" ".join(schema_description.splitlines()) if schema_description else "",
|
|
212
|
+
type="table",
|
|
213
|
+
tags=odcs_schema.tags if odcs_schema.tags is not None else None,
|
|
214
|
+
)
|
|
208
215
|
model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs))
|
|
209
216
|
if odcs_schema.quality is not None:
|
|
210
217
|
model.quality = convert_quality_list(odcs_schema.quality)
|
|
@@ -228,6 +235,8 @@ def convert_quality_list(odcs_quality_list):
|
|
|
228
235
|
quality.description = odcs_quality.description
|
|
229
236
|
if odcs_quality.query is not None:
|
|
230
237
|
quality.query = odcs_quality.query
|
|
238
|
+
if odcs_quality.rule is not None:
|
|
239
|
+
quality.metric = odcs_quality.rule
|
|
231
240
|
if odcs_quality.mustBe is not None:
|
|
232
241
|
quality.mustBe = odcs_quality.mustBe
|
|
233
242
|
if odcs_quality.mustNotBe is not None:
|
|
@@ -235,11 +244,11 @@ def convert_quality_list(odcs_quality_list):
|
|
|
235
244
|
if odcs_quality.mustBeGreaterThan is not None:
|
|
236
245
|
quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan
|
|
237
246
|
if odcs_quality.mustBeGreaterOrEqualTo is not None:
|
|
238
|
-
quality.
|
|
247
|
+
quality.mustBeGreaterOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
|
|
239
248
|
if odcs_quality.mustBeLessThan is not None:
|
|
240
249
|
quality.mustBeLessThan = odcs_quality.mustBeLessThan
|
|
241
250
|
if odcs_quality.mustBeLessOrEqualTo is not None:
|
|
242
|
-
quality.
|
|
251
|
+
quality.mustBeLessOrEqualTo = odcs_quality.mustBeLessOrEqualTo
|
|
243
252
|
if odcs_quality.mustBeBetween is not None:
|
|
244
253
|
quality.mustBeBetween = odcs_quality.mustBeBetween
|
|
245
254
|
if odcs_quality.mustNotBeBetween is not None:
|
|
@@ -252,8 +261,6 @@ def convert_quality_list(odcs_quality_list):
|
|
|
252
261
|
quality.model_extra["businessImpact"] = odcs_quality.businessImpact
|
|
253
262
|
if odcs_quality.dimension is not None:
|
|
254
263
|
quality.model_extra["dimension"] = odcs_quality.dimension
|
|
255
|
-
if odcs_quality.rule is not None:
|
|
256
|
-
quality.model_extra["rule"] = odcs_quality.rule
|
|
257
264
|
if odcs_quality.schedule is not None:
|
|
258
265
|
quality.model_extra["schedule"] = odcs_quality.schedule
|
|
259
266
|
if odcs_quality.scheduler is not None:
|
|
@@ -327,7 +334,7 @@ def import_fields(
|
|
|
327
334
|
return result
|
|
328
335
|
|
|
329
336
|
for odcs_property in odcs_properties:
|
|
330
|
-
mapped_type = map_type(odcs_property.logicalType, custom_type_mappings)
|
|
337
|
+
mapped_type = map_type(odcs_property.logicalType, custom_type_mappings, odcs_property.physicalType)
|
|
331
338
|
if mapped_type is not None:
|
|
332
339
|
property_name = odcs_property.name
|
|
333
340
|
description = odcs_property.description if odcs_property.description is not None else None
|
|
@@ -370,23 +377,72 @@ def import_fields(
|
|
|
370
377
|
|
|
371
378
|
result[property_name] = field
|
|
372
379
|
else:
|
|
373
|
-
|
|
374
|
-
|
|
380
|
+
type_info = f"logicalType={odcs_property.logicalType}, physicalType={odcs_property.physicalType}"
|
|
381
|
+
logger.warning(
|
|
382
|
+
f"Can't map field '{odcs_property.name}' ({type_info}) to the Datacontract Mapping types. "
|
|
383
|
+
f"Both logicalType and physicalType are missing or unmappable. "
|
|
384
|
+
f"Consider introducing a customProperty 'dc_mapping_<type>' that defines your expected type as the 'value'"
|
|
375
385
|
)
|
|
376
386
|
|
|
377
387
|
return result
|
|
378
388
|
|
|
379
389
|
|
|
380
|
-
def map_type(
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
+
def map_type(odcs_logical_type: str, custom_mappings: Dict[str, str], physical_type: str = None) -> str | None:
|
|
391
|
+
# Try to map logicalType first
|
|
392
|
+
if odcs_logical_type is not None:
|
|
393
|
+
t = odcs_logical_type.lower()
|
|
394
|
+
if t in DATACONTRACT_TYPES:
|
|
395
|
+
return t
|
|
396
|
+
elif custom_mappings.get(t) is not None:
|
|
397
|
+
return custom_mappings.get(t)
|
|
398
|
+
|
|
399
|
+
# Fallback to physicalType if logicalType is not mapped
|
|
400
|
+
if physical_type is not None:
|
|
401
|
+
pt = physical_type.lower()
|
|
402
|
+
# Remove parameters from physical type (e.g., VARCHAR(50) -> varchar, DECIMAL(10,2) -> decimal)
|
|
403
|
+
pt_base = pt.split('(')[0].strip()
|
|
404
|
+
|
|
405
|
+
# Try direct mapping of physical type
|
|
406
|
+
if pt in DATACONTRACT_TYPES:
|
|
407
|
+
return pt
|
|
408
|
+
elif pt_base in DATACONTRACT_TYPES:
|
|
409
|
+
return pt_base
|
|
410
|
+
elif custom_mappings.get(pt) is not None:
|
|
411
|
+
return custom_mappings.get(pt)
|
|
412
|
+
elif custom_mappings.get(pt_base) is not None:
|
|
413
|
+
return custom_mappings.get(pt_base)
|
|
414
|
+
# Common physical type mappings
|
|
415
|
+
elif pt_base in ["varchar", "char", "nvarchar", "nchar", "text", "ntext", "string", "character varying"]:
|
|
416
|
+
return "string"
|
|
417
|
+
elif pt_base in ["int", "integer", "smallint", "tinyint", "mediumint", "int2", "int4", "int8"]:
|
|
418
|
+
return "int"
|
|
419
|
+
elif pt_base in ["bigint", "long", "int64"]:
|
|
420
|
+
return "long"
|
|
421
|
+
elif pt_base in ["float", "real", "float4", "float8"]:
|
|
422
|
+
return "float"
|
|
423
|
+
elif pt_base in ["double", "double precision"]:
|
|
424
|
+
return "double"
|
|
425
|
+
elif pt_base in ["decimal", "numeric", "number"]:
|
|
426
|
+
return "decimal"
|
|
427
|
+
elif pt_base in ["boolean", "bool", "bit"]:
|
|
428
|
+
return "boolean"
|
|
429
|
+
elif pt_base in ["timestamp", "datetime", "datetime2", "timestamptz", "timestamp with time zone"]:
|
|
430
|
+
return "timestamp"
|
|
431
|
+
elif pt_base in ["date"]:
|
|
432
|
+
return "date"
|
|
433
|
+
elif pt_base in ["time"]:
|
|
434
|
+
return "time"
|
|
435
|
+
elif pt_base in ["json", "jsonb"]:
|
|
436
|
+
return "json"
|
|
437
|
+
elif pt_base in ["array"]:
|
|
438
|
+
return "array"
|
|
439
|
+
elif pt_base in ["object", "struct", "record"]:
|
|
440
|
+
return "object"
|
|
441
|
+
elif pt_base in ["bytes", "binary", "varbinary", "blob", "bytea"]:
|
|
442
|
+
return "bytes"
|
|
443
|
+
else:
|
|
444
|
+
return None
|
|
445
|
+
return None
|
|
390
446
|
|
|
391
447
|
|
|
392
448
|
def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]:
|
|
@@ -414,3 +470,28 @@ def import_tags(odcs: OpenDataContractStandard) -> List[str] | None:
|
|
|
414
470
|
if odcs.tags is None:
|
|
415
471
|
return None
|
|
416
472
|
return odcs.tags
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def to_azure_storage_account(location: str) -> str | None:
|
|
476
|
+
"""
|
|
477
|
+
Converts a storage location string to extract the storage account name.
|
|
478
|
+
ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI.
|
|
479
|
+
|
|
480
|
+
This function parses a storage location string to identify and return the
|
|
481
|
+
storage account name. It handles two primary patterns:
|
|
482
|
+
1. Protocol://containerName@storageAccountName
|
|
483
|
+
2. Protocol://storageAccountName
|
|
484
|
+
|
|
485
|
+
:param location: The storage location string to parse, typically following
|
|
486
|
+
the format protocol://containerName@storageAccountName. or
|
|
487
|
+
protocol://storageAccountName.
|
|
488
|
+
:return: The extracted storage account name if found, otherwise None
|
|
489
|
+
"""
|
|
490
|
+
# to catch protocol://containerName@storageAccountName. pattern from location
|
|
491
|
+
match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE)
|
|
492
|
+
if match:
|
|
493
|
+
return match.group()
|
|
494
|
+
else:
|
|
495
|
+
# to catch protocol://storageAccountName. pattern from location
|
|
496
|
+
match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE)
|
|
497
|
+
return match.group() if match else None
|
|
@@ -88,23 +88,28 @@ def import_unity_from_api(
|
|
|
88
88
|
"""
|
|
89
89
|
try:
|
|
90
90
|
# print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}")
|
|
91
|
+
profile = os.getenv("DATACONTRACT_DATABRICKS_PROFILE")
|
|
91
92
|
host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
|
|
92
93
|
# print(f"Databricks host: {host}, token: {'***' if token else 'not set'}")
|
|
93
|
-
|
|
94
|
-
raise DataContractException(
|
|
95
|
-
type="configuration",
|
|
96
|
-
name="Databricks configuration",
|
|
97
|
-
reason="DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set",
|
|
98
|
-
engine="datacontract",
|
|
99
|
-
)
|
|
100
|
-
if not token:
|
|
101
|
-
raise DataContractException(
|
|
94
|
+
exception = DataContractException(
|
|
102
95
|
type="configuration",
|
|
103
96
|
name="Databricks configuration",
|
|
104
|
-
reason="
|
|
97
|
+
reason="",
|
|
105
98
|
engine="datacontract",
|
|
106
99
|
)
|
|
107
|
-
|
|
100
|
+
if not profile and not host and not token:
|
|
101
|
+
reason = "Either DATACONTRACT_DATABRICKS_PROFILE or both DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN environment variables must be set"
|
|
102
|
+
exception.reason = reason
|
|
103
|
+
raise exception
|
|
104
|
+
if token and not host:
|
|
105
|
+
reason = "DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set"
|
|
106
|
+
exception.reason = reason
|
|
107
|
+
raise exception
|
|
108
|
+
if host and not token:
|
|
109
|
+
reason = "DATACONTRACT_DATABRICKS_TOKEN environment variable is not set"
|
|
110
|
+
exception.reason = reason
|
|
111
|
+
raise exception
|
|
112
|
+
workspace_client = WorkspaceClient(profile=profile) if profile else WorkspaceClient(host=host, token=token)
|
|
108
113
|
except Exception as e:
|
|
109
114
|
raise DataContractException(
|
|
110
115
|
type="schema",
|
datacontract/lint/resolve.py
CHANGED
|
@@ -303,7 +303,7 @@ def _resolve_data_contract_from_str(
|
|
|
303
303
|
# if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
|
|
304
304
|
odcs = parse_odcs_v3_from_str(data_contract_str)
|
|
305
305
|
|
|
306
|
-
data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.
|
|
306
|
+
data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.1")
|
|
307
307
|
return import_from_odcs(data_contract_specification, odcs)
|
|
308
308
|
|
|
309
309
|
logging.info("Importing DCS")
|
datacontract/lint/schema.py
CHANGED
|
@@ -8,7 +8,7 @@ import requests
|
|
|
8
8
|
|
|
9
9
|
from datacontract.model.exceptions import DataContractException
|
|
10
10
|
|
|
11
|
-
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.
|
|
11
|
+
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.1.schema.json"
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def fetch_schema(location: str = None) -> Dict[str, Any]:
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
dataContractSpecification: 1.2.1
|
|
2
|
+
id: my-data-contract-id
|
|
3
|
+
info:
|
|
4
|
+
title: My Data Contract
|
|
5
|
+
version: 0.0.1
|
|
6
|
+
# description:
|
|
7
|
+
# owner:
|
|
8
|
+
# contact:
|
|
9
|
+
# name:
|
|
10
|
+
# url:
|
|
11
|
+
# email:
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
### servers
|
|
15
|
+
|
|
16
|
+
#servers:
|
|
17
|
+
# production:
|
|
18
|
+
# type: s3
|
|
19
|
+
# location: s3://
|
|
20
|
+
# format: parquet
|
|
21
|
+
# delimiter: new_line
|
|
22
|
+
|
|
23
|
+
### terms
|
|
24
|
+
|
|
25
|
+
#terms:
|
|
26
|
+
# usage:
|
|
27
|
+
# limitations:
|
|
28
|
+
# billing:
|
|
29
|
+
# noticePeriod:
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
### models
|
|
33
|
+
|
|
34
|
+
# models:
|
|
35
|
+
# my_model:
|
|
36
|
+
# description:
|
|
37
|
+
# type:
|
|
38
|
+
# fields:
|
|
39
|
+
# my_field:
|
|
40
|
+
# type:
|
|
41
|
+
# description:
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
### definitions
|
|
45
|
+
|
|
46
|
+
# definitions:
|
|
47
|
+
# my_field:
|
|
48
|
+
# domain:
|
|
49
|
+
# name:
|
|
50
|
+
# title:
|
|
51
|
+
# type:
|
|
52
|
+
# description:
|
|
53
|
+
# example:
|
|
54
|
+
# pii:
|
|
55
|
+
# classification:
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
### servicelevels
|
|
59
|
+
|
|
60
|
+
#servicelevels:
|
|
61
|
+
# availability:
|
|
62
|
+
# description: The server is available during support hours
|
|
63
|
+
# percentage: 99.9%
|
|
64
|
+
# retention:
|
|
65
|
+
# description: Data is retained for one year
|
|
66
|
+
# period: P1Y
|
|
67
|
+
# unlimited: false
|
|
68
|
+
# latency:
|
|
69
|
+
# description: Data is available within 25 hours after the order was placed
|
|
70
|
+
# threshold: 25h
|
|
71
|
+
# sourceTimestampField: orders.order_timestamp
|
|
72
|
+
# processedTimestampField: orders.processed_timestamp
|
|
73
|
+
# freshness:
|
|
74
|
+
# description: The age of the youngest row in a table.
|
|
75
|
+
# threshold: 25h
|
|
76
|
+
# timestampField: orders.order_timestamp
|
|
77
|
+
# frequency:
|
|
78
|
+
# description: Data is delivered once a day
|
|
79
|
+
# type: batch # or streaming
|
|
80
|
+
# interval: daily # for batch, either interval or cron
|
|
81
|
+
# cron: 0 0 * * * # for batch, either cron or interval
|
|
82
|
+
# support:
|
|
83
|
+
# description: The data is available during typical business hours at headquarters
|
|
84
|
+
# time: 9am to 5pm in EST on business days
|
|
85
|
+
# responseTime: 1h
|
|
86
|
+
# backup:
|
|
87
|
+
# description: Data is backed up once a week, every Sunday at 0:00 UTC.
|
|
88
|
+
# interval: weekly
|
|
89
|
+
# cron: 0 0 * * 0
|
|
90
|
+
# recoveryTime: 24 hours
|
|
91
|
+
# recoveryPoint: 1 week
|