datacontract-cli 0.10.34__py3-none-any.whl → 0.10.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic; review the changes below for details.

Files changed (39)
  1. datacontract/api.py +10 -3
  2. datacontract/cli.py +5 -3
  3. datacontract/data_contract.py +18 -51
  4. datacontract/engines/data_contract_checks.py +280 -19
  5. datacontract/engines/fastjsonschema/check_jsonschema.py +29 -19
  6. datacontract/export/dbt_converter.py +30 -4
  7. datacontract/export/dqx_converter.py +126 -0
  8. datacontract/export/excel_exporter.py +3 -3
  9. datacontract/export/exporter.py +1 -0
  10. datacontract/export/exporter_factory.py +6 -0
  11. datacontract/export/markdown_converter.py +35 -16
  12. datacontract/export/mermaid_exporter.py +24 -11
  13. datacontract/export/rdf_converter.py +2 -2
  14. datacontract/export/spark_converter.py +28 -3
  15. datacontract/export/sql_type_converter.py +6 -4
  16. datacontract/imports/odcs_v3_importer.py +100 -19
  17. datacontract/imports/unity_importer.py +16 -11
  18. datacontract/init/init_template.py +1 -1
  19. datacontract/lint/resolve.py +1 -1
  20. datacontract/lint/schema.py +1 -1
  21. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  22. datacontract/schemas/datacontract-1.2.0.init.yaml +1 -1
  23. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  24. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  25. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  26. datacontract/templates/datacontract_odcs.html +60 -41
  27. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/METADATA +68 -56
  28. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/RECORD +32 -35
  29. datacontract/lint/lint.py +0 -142
  30. datacontract/lint/linters/__init__.py +0 -0
  31. datacontract/lint/linters/description_linter.py +0 -33
  32. datacontract/lint/linters/field_pattern_linter.py +0 -34
  33. datacontract/lint/linters/field_reference_linter.py +0 -47
  34. datacontract/lint/linters/notice_period_linter.py +0 -55
  35. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  36. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/WHEEL +0 -0
  37. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/entry_points.txt +0 -0
  38. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/licenses/LICENSE +0 -0
  39. {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/top_level.txt +0 -0
@@ -138,8 +138,11 @@ def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
138
138
  server.driver = getattr(odcs_server, "driver", None)
139
139
  server.roles = import_server_roles(odcs_server.roles)
140
140
  server.storageAccount = (
141
- re.search(r"(?:@|://)([^.]+)\.", odcs_server.location, re.IGNORECASE) if server.type == "azure" else None
141
+ to_azure_storage_account(odcs_server.location)
142
+ if server.type == "azure" and "://" in server.location
143
+ else None
142
144
  )
145
+
143
146
  servers[server_name] = server
144
147
  return servers
145
148
 
@@ -204,7 +207,11 @@ def import_models(odcs: Any) -> Dict[str, Model]:
204
207
  schema_physical_name = odcs_schema.physicalName
205
208
  schema_description = odcs_schema.description if odcs_schema.description is not None else ""
206
209
  model_name = schema_physical_name if schema_physical_name is not None else schema_name
207
- model = Model(description=" ".join(schema_description.splitlines()) if schema_description else "", type="table")
210
+ model = Model(
211
+ description=" ".join(schema_description.splitlines()) if schema_description else "",
212
+ type="table",
213
+ tags=odcs_schema.tags if odcs_schema.tags is not None else None,
214
+ )
208
215
  model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs))
209
216
  if odcs_schema.quality is not None:
210
217
  model.quality = convert_quality_list(odcs_schema.quality)
@@ -228,6 +235,8 @@ def convert_quality_list(odcs_quality_list):
228
235
  quality.description = odcs_quality.description
229
236
  if odcs_quality.query is not None:
230
237
  quality.query = odcs_quality.query
238
+ if odcs_quality.rule is not None:
239
+ quality.metric = odcs_quality.rule
231
240
  if odcs_quality.mustBe is not None:
232
241
  quality.mustBe = odcs_quality.mustBe
233
242
  if odcs_quality.mustNotBe is not None:
@@ -235,11 +244,11 @@ def convert_quality_list(odcs_quality_list):
235
244
  if odcs_quality.mustBeGreaterThan is not None:
236
245
  quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan
237
246
  if odcs_quality.mustBeGreaterOrEqualTo is not None:
238
- quality.mustBeGreaterThanOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
247
+ quality.mustBeGreaterOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
239
248
  if odcs_quality.mustBeLessThan is not None:
240
249
  quality.mustBeLessThan = odcs_quality.mustBeLessThan
241
250
  if odcs_quality.mustBeLessOrEqualTo is not None:
242
- quality.mustBeLessThanOrEqualTo = odcs_quality.mustBeLessOrEqualTo
251
+ quality.mustBeLessOrEqualTo = odcs_quality.mustBeLessOrEqualTo
243
252
  if odcs_quality.mustBeBetween is not None:
244
253
  quality.mustBeBetween = odcs_quality.mustBeBetween
245
254
  if odcs_quality.mustNotBeBetween is not None:
@@ -252,8 +261,6 @@ def convert_quality_list(odcs_quality_list):
252
261
  quality.model_extra["businessImpact"] = odcs_quality.businessImpact
253
262
  if odcs_quality.dimension is not None:
254
263
  quality.model_extra["dimension"] = odcs_quality.dimension
255
- if odcs_quality.rule is not None:
256
- quality.model_extra["rule"] = odcs_quality.rule
257
264
  if odcs_quality.schedule is not None:
258
265
  quality.model_extra["schedule"] = odcs_quality.schedule
259
266
  if odcs_quality.scheduler is not None:
@@ -327,7 +334,7 @@ def import_fields(
327
334
  return result
328
335
 
329
336
  for odcs_property in odcs_properties:
330
- mapped_type = map_type(odcs_property.logicalType, custom_type_mappings)
337
+ mapped_type = map_type(odcs_property.logicalType, custom_type_mappings, odcs_property.physicalType)
331
338
  if mapped_type is not None:
332
339
  property_name = odcs_property.name
333
340
  description = odcs_property.description if odcs_property.description is not None else None
@@ -370,23 +377,72 @@ def import_fields(
370
377
 
371
378
  result[property_name] = field
372
379
  else:
373
- logger.info(
374
- f"Can't map {odcs_property.name} to the Datacontract Mapping types, as there is no equivalent or special mapping. Consider introducing a customProperty 'dc_mapping_{odcs_property.logicalType}' that defines your expected type as the 'value'"
380
+ type_info = f"logicalType={odcs_property.logicalType}, physicalType={odcs_property.physicalType}"
381
+ logger.warning(
382
+ f"Can't map field '{odcs_property.name}' ({type_info}) to the Datacontract Mapping types. "
383
+ f"Both logicalType and physicalType are missing or unmappable. "
384
+ f"Consider introducing a customProperty 'dc_mapping_<type>' that defines your expected type as the 'value'"
375
385
  )
376
386
 
377
387
  return result
378
388
 
379
389
 
380
- def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
381
- if odcs_type is None:
382
- return None
383
- t = odcs_type.lower()
384
- if t in DATACONTRACT_TYPES:
385
- return t
386
- elif custom_mappings.get(t) is not None:
387
- return custom_mappings.get(t)
388
- else:
389
- return None
390
+ def map_type(odcs_logical_type: str, custom_mappings: Dict[str, str], physical_type: str = None) -> str | None:
391
+ # Try to map logicalType first
392
+ if odcs_logical_type is not None:
393
+ t = odcs_logical_type.lower()
394
+ if t in DATACONTRACT_TYPES:
395
+ return t
396
+ elif custom_mappings.get(t) is not None:
397
+ return custom_mappings.get(t)
398
+
399
+ # Fallback to physicalType if logicalType is not mapped
400
+ if physical_type is not None:
401
+ pt = physical_type.lower()
402
+ # Remove parameters from physical type (e.g., VARCHAR(50) -> varchar, DECIMAL(10,2) -> decimal)
403
+ pt_base = pt.split('(')[0].strip()
404
+
405
+ # Try direct mapping of physical type
406
+ if pt in DATACONTRACT_TYPES:
407
+ return pt
408
+ elif pt_base in DATACONTRACT_TYPES:
409
+ return pt_base
410
+ elif custom_mappings.get(pt) is not None:
411
+ return custom_mappings.get(pt)
412
+ elif custom_mappings.get(pt_base) is not None:
413
+ return custom_mappings.get(pt_base)
414
+ # Common physical type mappings
415
+ elif pt_base in ["varchar", "char", "nvarchar", "nchar", "text", "ntext", "string", "character varying"]:
416
+ return "string"
417
+ elif pt_base in ["int", "integer", "smallint", "tinyint", "mediumint", "int2", "int4", "int8"]:
418
+ return "int"
419
+ elif pt_base in ["bigint", "long", "int64"]:
420
+ return "long"
421
+ elif pt_base in ["float", "real", "float4", "float8"]:
422
+ return "float"
423
+ elif pt_base in ["double", "double precision"]:
424
+ return "double"
425
+ elif pt_base in ["decimal", "numeric", "number"]:
426
+ return "decimal"
427
+ elif pt_base in ["boolean", "bool", "bit"]:
428
+ return "boolean"
429
+ elif pt_base in ["timestamp", "datetime", "datetime2", "timestamptz", "timestamp with time zone"]:
430
+ return "timestamp"
431
+ elif pt_base in ["date"]:
432
+ return "date"
433
+ elif pt_base in ["time"]:
434
+ return "time"
435
+ elif pt_base in ["json", "jsonb"]:
436
+ return "json"
437
+ elif pt_base in ["array"]:
438
+ return "array"
439
+ elif pt_base in ["object", "struct", "record"]:
440
+ return "object"
441
+ elif pt_base in ["bytes", "binary", "varbinary", "blob", "bytea"]:
442
+ return "bytes"
443
+ else:
444
+ return None
445
+ return None
390
446
 
391
447
 
392
448
  def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]:
@@ -414,3 +470,28 @@ def import_tags(odcs: OpenDataContractStandard) -> List[str] | None:
414
470
  if odcs.tags is None:
415
471
  return None
416
472
  return odcs.tags
473
+
474
+
475
+ def to_azure_storage_account(location: str) -> str | None:
476
+ """
477
+ Converts a storage location string to extract the storage account name.
478
+ ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI.
479
+
480
+ This function parses a storage location string to identify and return the
481
+ storage account name. It handles two primary patterns:
482
+ 1. Protocol://containerName@storageAccountName
483
+ 2. Protocol://storageAccountName
484
+
485
+ :param location: The storage location string to parse, typically following
486
+ the format protocol://containerName@storageAccountName. or
487
+ protocol://storageAccountName.
488
+ :return: The extracted storage account name if found, otherwise None
489
+ """
490
+ # to catch protocol://containerName@storageAccountName. pattern from location
491
+ match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE)
492
+ if match:
493
+ return match.group()
494
+ else:
495
+ # to catch protocol://storageAccountName. pattern from location
496
+ match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE)
497
+ return match.group() if match else None
@@ -88,23 +88,28 @@ def import_unity_from_api(
88
88
  """
89
89
  try:
90
90
  # print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}")
91
+ profile = os.getenv("DATACONTRACT_DATABRICKS_PROFILE")
91
92
  host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
92
93
  # print(f"Databricks host: {host}, token: {'***' if token else 'not set'}")
93
- if not host:
94
- raise DataContractException(
95
- type="configuration",
96
- name="Databricks configuration",
97
- reason="DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set",
98
- engine="datacontract",
99
- )
100
- if not token:
101
- raise DataContractException(
94
+ exception = DataContractException(
102
95
  type="configuration",
103
96
  name="Databricks configuration",
104
- reason="DATACONTRACT_DATABRICKS_TOKEN environment variable is not set",
97
+ reason="",
105
98
  engine="datacontract",
106
99
  )
107
- workspace_client = WorkspaceClient(host=host, token=token)
100
+ if not profile and not host and not token:
101
+ reason = "Either DATACONTRACT_DATABRICKS_PROFILE or both DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN environment variables must be set"
102
+ exception.reason = reason
103
+ raise exception
104
+ if token and not host:
105
+ reason = "DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set"
106
+ exception.reason = reason
107
+ raise exception
108
+ if host and not token:
109
+ reason = "DATACONTRACT_DATABRICKS_TOKEN environment variable is not set"
110
+ exception.reason = reason
111
+ raise exception
112
+ workspace_client = WorkspaceClient(profile=profile) if profile else WorkspaceClient(host=host, token=token)
108
113
  except Exception as e:
109
114
  raise DataContractException(
110
115
  type="schema",
@@ -3,7 +3,7 @@ import logging
3
3
 
4
4
  import requests
5
5
 
6
- DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.0.init.yaml"
6
+ DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.1.init.yaml"
7
7
 
8
8
 
9
9
  def get_init_template(location: str = None) -> str:
@@ -303,7 +303,7 @@ def _resolve_data_contract_from_str(
303
303
  # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
304
304
  odcs = parse_odcs_v3_from_str(data_contract_str)
305
305
 
306
- data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.0")
306
+ data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.1")
307
307
  return import_from_odcs(data_contract_specification, odcs)
308
308
 
309
309
  logging.info("Importing DCS")
@@ -8,7 +8,7 @@ import requests
8
8
 
9
9
  from datacontract.model.exceptions import DataContractException
10
10
 
11
- DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.0.schema.json"
11
+ DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.1.schema.json"
12
12
 
13
13
 
14
14
  def fetch_schema(location: str = None) -> Dict[str, Any]:
@@ -1,4 +1,4 @@
1
- dataContractSpecification: 1.2.0
1
+ dataContractSpecification: 1.2.1
2
2
  id: my-data-contract-id
3
3
  info:
4
4
  title: My Data Contract
@@ -1,4 +1,4 @@
1
- dataContractSpecification: 1.2.0
1
+ dataContractSpecification: 1.2.1
2
2
  id: my-data-contract-id
3
3
  info:
4
4
  title: My Data Contract
@@ -0,0 +1,91 @@
1
+ dataContractSpecification: 1.2.1
2
+ id: my-data-contract-id
3
+ info:
4
+ title: My Data Contract
5
+ version: 0.0.1
6
+ # description:
7
+ # owner:
8
+ # contact:
9
+ # name:
10
+ # url:
11
+ # email:
12
+
13
+
14
+ ### servers
15
+
16
+ #servers:
17
+ # production:
18
+ # type: s3
19
+ # location: s3://
20
+ # format: parquet
21
+ # delimiter: new_line
22
+
23
+ ### terms
24
+
25
+ #terms:
26
+ # usage:
27
+ # limitations:
28
+ # billing:
29
+ # noticePeriod:
30
+
31
+
32
+ ### models
33
+
34
+ # models:
35
+ # my_model:
36
+ # description:
37
+ # type:
38
+ # fields:
39
+ # my_field:
40
+ # type:
41
+ # description:
42
+
43
+
44
+ ### definitions
45
+
46
+ # definitions:
47
+ # my_field:
48
+ # domain:
49
+ # name:
50
+ # title:
51
+ # type:
52
+ # description:
53
+ # example:
54
+ # pii:
55
+ # classification:
56
+
57
+
58
+ ### servicelevels
59
+
60
+ #servicelevels:
61
+ # availability:
62
+ # description: The server is available during support hours
63
+ # percentage: 99.9%
64
+ # retention:
65
+ # description: Data is retained for one year because!
66
+ # period: P1Y
67
+ # unlimited: false
68
+ # latency:
69
+ # description: Data is available within 25 hours after the order was placed
70
+ # threshold: 25h
71
+ # sourceTimestampField: orders.order_timestamp
72
+ # processedTimestampField: orders.processed_timestamp
73
+ # freshness:
74
+ # description: The age of the youngest row in a table.
75
+ # threshold: 25h
76
+ # timestampField: orders.order_timestamp
77
+ # frequency:
78
+ # description: Data is delivered once a day
79
+ # type: batch # or streaming
80
+ # interval: daily # for batch, either or cron
81
+ # cron: 0 0 * * * # for batch, either or interval
82
+ # support:
83
+ # description: The data is available during typical business hours at headquarters
84
+ # time: 9am to 5pm in EST on business days
85
+ # responseTime: 1h
86
+ # backup:
87
+ # description: Data is backed up once a week, every Sunday at 0:00 UTC.
88
+ # interval: weekly
89
+ # cron: 0 0 * * 0
90
+ # recoveryTime: 24 hours
91
+ # recoveryPoint: 1 week