datacontract-cli 0.10.27__py3-none-any.whl → 0.10.29__py3-none-any.whl

This diff shows the content of publicly released package versions as published to their respective registries, and is provided for informational purposes only.

Files changed (37)
  1. datacontract/api.py +1 -1
  2. datacontract/cli.py +37 -5
  3. datacontract/data_contract.py +122 -29
  4. datacontract/engines/data_contract_checks.py +2 -0
  5. datacontract/engines/soda/connections/duckdb_connection.py +1 -1
  6. datacontract/export/html_exporter.py +28 -23
  7. datacontract/export/mermaid_exporter.py +78 -13
  8. datacontract/export/odcs_v3_exporter.py +7 -9
  9. datacontract/export/rdf_converter.py +2 -2
  10. datacontract/export/sql_type_converter.py +2 -2
  11. datacontract/imports/excel_importer.py +7 -2
  12. datacontract/imports/importer.py +11 -1
  13. datacontract/imports/importer_factory.py +7 -0
  14. datacontract/imports/json_importer.py +325 -0
  15. datacontract/imports/odcs_importer.py +2 -2
  16. datacontract/imports/odcs_v3_importer.py +9 -9
  17. datacontract/imports/spark_importer.py +38 -16
  18. datacontract/imports/sql_importer.py +4 -2
  19. datacontract/imports/unity_importer.py +77 -37
  20. datacontract/init/init_template.py +1 -1
  21. datacontract/integration/datamesh_manager.py +16 -2
  22. datacontract/lint/resolve.py +61 -7
  23. datacontract/lint/schema.py +1 -1
  24. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  25. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  26. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  27. datacontract/templates/datacontract.html +4 -0
  28. datacontract/templates/datacontract_odcs.html +666 -0
  29. datacontract/templates/index.html +2 -0
  30. datacontract/templates/partials/server.html +2 -0
  31. datacontract/templates/style/output.css +319 -145
  32. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/METADATA +98 -62
  33. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/RECORD +37 -33
  34. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/WHEEL +1 -1
  35. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/entry_points.txt +0 -0
  36. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/licenses/LICENSE +0 -0
  37. {datacontract_cli-0.10.27.dist-info → datacontract_cli-0.10.29.dist-info}/top_level.txt +0 -0
datacontract/imports/unity_importer.py

@@ -1,14 +1,14 @@
  import json
  import os
- from typing import List, Optional
+ from typing import List

  from databricks.sdk import WorkspaceClient
  from databricks.sdk.service.catalog import ColumnInfo, TableInfo
- from pyspark.sql import types
+ from open_data_contract_standard.model import OpenDataContractStandard

  from datacontract.imports.importer import Importer
- from datacontract.imports.spark_importer import _field_from_struct_type
- from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+ from datacontract.imports.sql_importer import map_type_from_sql, to_physical_type_key
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
  from datacontract.model.exceptions import DataContractException


@@ -18,8 +18,11 @@ class UnityImporter(Importer):
      """

      def import_source(
-         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> DataContractSpecification:
+         self,
+         data_contract_specification: DataContractSpecification | OpenDataContractStandard,
+         source: str,
+         import_args: dict,
+     ) -> DataContractSpecification | OpenDataContractStandard:
          """
          Import data contract specification from a source.

@@ -35,15 +38,14 @@ class UnityImporter(Importer):
          if source is not None:
              data_contract_specification = import_unity_from_json(data_contract_specification, source)
          else:
-             data_contract_specification = import_unity_from_api(
-                 data_contract_specification, import_args.get("unity_table_full_name")
-             )
+             unity_table_full_name_list = import_args.get("unity_table_full_name")
+             data_contract_specification = import_unity_from_api(data_contract_specification, unity_table_full_name_list)
          return data_contract_specification


  def import_unity_from_json(
-     data_contract_specification: DataContractSpecification, source: str
- ) -> DataContractSpecification:
+     data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str
+ ) -> DataContractSpecification | OpenDataContractStandard:
      """
      Import data contract specification from a JSON file.

@@ -71,39 +73,66 @@ def import_unity_from_json(


  def import_unity_from_api(
-     data_contract_specification: DataContractSpecification, unity_table_full_name: Optional[str] = None
+     data_contract_specification: DataContractSpecification, unity_table_full_name_list: List[str] = None
  ) -> DataContractSpecification:
      """
      Import data contract specification from Unity Catalog API.

      :param data_contract_specification: The data contract specification to be imported.
      :type data_contract_specification: DataContractSpecification
-     :param unity_table_full_name: The full name of the Unity table.
-     :type unity_table_full_name: Optional[str]
+     :param unity_table_full_name_list: The full name of the Unity table.
+     :type unity_table_full_name_list: list[str]
      :return: The imported data contract specification.
      :rtype: DataContractSpecification
      :raises DataContractException: If there is an error retrieving the schema from the API.
      """
      try:
-         workspace_client = WorkspaceClient()
-         unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+         # print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}")
+         host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+         # print(f"Databricks host: {host}, token: {'***' if token else 'not set'}")
+         if not host:
+             raise DataContractException(
+                 type="configuration",
+                 name="Databricks configuration",
+                 reason="DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set",
+                 engine="datacontract",
+             )
+         if not token:
+             raise DataContractException(
+                 type="configuration",
+                 name="Databricks configuration",
+                 reason="DATACONTRACT_DATABRICKS_TOKEN environment variable is not set",
+                 engine="datacontract",
+             )
+         workspace_client = WorkspaceClient(host=host, token=token)
      except Exception as e:
          raise DataContractException(
              type="schema",
              name="Retrieve unity catalog schema",
-             reason=f"Failed to retrieve unity catalog schema from databricks profile: {os.getenv('DATABRICKS_CONFIG_PROFILE')}",
+             reason="Failed to connect to unity catalog schema",
              engine="datacontract",
              original_exception=e,
          )

-     convert_unity_schema(data_contract_specification, unity_schema)
+     for unity_table_full_name in unity_table_full_name_list:
+         try:
+             unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+         except Exception as e:
+             raise DataContractException(
+                 type="schema",
+                 name="Retrieve unity catalog schema",
+                 reason=f"Unity table {unity_table_full_name} not found",
+                 engine="datacontract",
+                 original_exception=e,
+             )
+         data_contract_specification = convert_unity_schema(data_contract_specification, unity_schema)

      return data_contract_specification


  def convert_unity_schema(
-     data_contract_specification: DataContractSpecification, unity_schema: TableInfo
- ) -> DataContractSpecification:
+     data_contract_specification: DataContractSpecification | OpenDataContractStandard, unity_schema: TableInfo
+ ) -> DataContractSpecification | OpenDataContractStandard:
      """
      Convert Unity schema to data contract specification.

@@ -117,6 +146,21 @@ def convert_unity_schema(
      if data_contract_specification.models is None:
          data_contract_specification.models = {}

+     if data_contract_specification.servers is None:
+         data_contract_specification.servers = {}
+
+     # Configure databricks server with catalog and schema from Unity table info
+     schema_name = unity_schema.schema_name
+     catalog_name = unity_schema.catalog_name
+     if catalog_name and schema_name:
+         server_name = "myserver"  # Default server name
+
+         data_contract_specification.servers[server_name] = Server(
+             type="databricks",
+             catalog=catalog_name,
+             schema=schema_name,
+         )
+
      fields = import_table_fields(unity_schema.columns)

      table_id = unity_schema.name or unity_schema.table_id

@@ -149,25 +193,21 @@ def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
      imported_fields = {}

      for column in columns:
-         struct_field: types.StructField = _type_json_to_spark_field(column.type_json)
-         imported_fields[column.name] = _field_from_struct_type(struct_field)
+         imported_fields[column.name] = _to_field(column)

      return imported_fields


- def _type_json_to_spark_field(type_json: str) -> types.StructField:
-     """
-     Parses a JSON string representing a Spark field and returns a StructField object.
+ def _to_field(column: ColumnInfo) -> Field:
+     field = Field()
+     if column.type_name is not None:
+         sql_type = str(column.type_text)
+         field.type = map_type_from_sql(sql_type)
+         physical_type_key = to_physical_type_key("databricks")
+         field.config = {
+             physical_type_key: sql_type,
+         }
+     field.required = column.nullable is None or not column.nullable
+     field.description = column.comment if column.comment else None

-     The reason we do this is to leverage the Spark JSON schema parser to handle the
-     complexity of the Spark field types. The field `type_json` in the Unity API is
-     the output of a `StructField.jsonValue()` call.
-
-     :param type_json: The JSON string representing the Spark field.
-     :type type_json: str
-
-     :return: The StructField object.
-     :rtype: types.StructField
-     """
-     type_dict = json.loads(type_json)
-     return types.StructField.fromJson(type_dict)
+     return field
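
Taken together, the unity_importer.py changes swap the pyspark-based type_json parsing for the SQL type mapper, take Databricks credentials from environment variables instead of a profile, accept a list of table names, and register a default databricks server. A minimal sketch of driving the reworked API import path; the table names, host, and token below are placeholders, and to_yaml() is assumed from the DataContractSpecification model:

import os

from datacontract.imports.unity_importer import import_unity_from_api
from datacontract.model.data_contract_specification import DataContractSpecification

# The importer now reads credentials from these variables instead of a Databricks profile.
os.environ["DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"] = "adb-1234567890123456.7.azuredatabricks.net"  # placeholder
os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "dapi-example-token"  # placeholder

spec = DataContractSpecification(dataContractSpecification="1.2.0")

# Each table becomes one model; the catalog and schema from the table info also
# populate a default "myserver" databricks server entry.
spec = import_unity_from_api(
    spec,
    unity_table_full_name_list=["my_catalog.my_schema.orders", "my_catalog.my_schema.customers"],
)
print(spec.to_yaml())  # assumption: the specification model exposes to_yaml()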
datacontract/init/init_template.py

@@ -3,7 +3,7 @@ import logging

  import requests

- DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.1.0.init.yaml"
+ DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.0.init.yaml"


  def get_init_template(location: str = None) -> str:
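
With the default bumped, new contracts are seeded from the 1.2.0 template. A minimal sketch, assuming the helper falls back to the packaged default when no location is given:

from datacontract.init.init_template import DEFAULT_DATA_CONTRACT_INIT_TEMPLATE, get_init_template

print(DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)  # datacontract-1.2.0.init.yaml
template_yaml = get_init_template()  # no location: packaged default template
print(template_yaml.splitlines()[0])  # dataContractSpecification: 1.2.0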
datacontract/integration/datamesh_manager.py

@@ -4,6 +4,9 @@ import requests

  from datacontract.model.run import Run

+ # used to retrieve the HTML location of the published data contract or test results
+ RESPONSE_HEADER_LOCATION_HTML = "location-html"
+

  def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool):
      try:

@@ -38,7 +41,12 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_ver
          if response.status_code != 200:
              run.log_error(f"Error publishing test results to Data Mesh Manager: {response.text}")
              return
-         run.log_info(f"Published test results to {url}")
+         run.log_info("Published test results successfully")
+
+         location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
+         if location_html is not None and len(location_html) > 0:
+             print(f"🚀 Open {location_html}")
+
      except Exception as e:
          run.log_error(f"Failed publishing test results. Error: {str(e)}")

@@ -67,6 +75,12 @@ def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_veri
          if response.status_code != 200:
              print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
              exit(1)
-         print(f"Published data contract to {url}")
+
+         print("✅ Published data contract successfully")
+
+         location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
+         if location_html is not None and len(location_html) > 0:
+             print(f"🚀 Open {location_html}")
+
      except Exception as e:
          print(f"Failed publishing data contract. Error: {str(e)}")
datacontract/lint/resolve.py

@@ -5,8 +5,9 @@ import warnings
  import fastjsonschema
  import yaml
  from fastjsonschema import JsonSchemaValueException
+ from open_data_contract_standard.model import OpenDataContractStandard

- from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
+ from datacontract.imports.odcs_v3_importer import import_from_odcs, parse_odcs_v3_from_str
  from datacontract.lint.resources import read_resource
  from datacontract.lint.schema import fetch_schema
  from datacontract.lint.urls import fetch_resource

@@ -46,6 +47,34 @@ def resolve_data_contract(
      )


+ def resolve_data_contract_v2(
+     data_contract_location: str = None,
+     data_contract_str: str = None,
+     data_contract: DataContractSpecification | OpenDataContractStandard = None,
+     schema_location: str = None,
+     inline_definitions: bool = False,
+     inline_quality: bool = False,
+ ) -> DataContractSpecification | OpenDataContractStandard:
+     if data_contract_location is not None:
+         return resolve_data_contract_from_location_v2(
+             data_contract_location, schema_location, inline_definitions, inline_quality
+         )
+     elif data_contract_str is not None:
+         return _resolve_data_contract_from_str_v2(
+             data_contract_str, schema_location, inline_definitions, inline_quality
+         )
+     elif data_contract is not None:
+         return data_contract
+     else:
+         raise DataContractException(
+             type="lint",
+             result=ResultEnum.failed,
+             name="Check that data contract YAML is valid",
+             reason="Data contract needs to be provided",
+             engine="datacontract",
+         )
+
+
  def resolve_data_contract_dict(
      data_contract_location: str = None,
      data_contract_str: str = None,

@@ -67,6 +96,13 @@ def resolve_data_contract_dict(
      )


+ def resolve_data_contract_from_location_v2(
+     location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
+ ) -> DataContractSpecification | OpenDataContractStandard:
+     data_contract_str = read_resource(location)
+     return _resolve_data_contract_from_str_v2(data_contract_str, schema_location, inline_definitions, inline_quality)
+
+
  def resolve_data_contract_from_location(
      location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
  ) -> DataContractSpecification:

@@ -242,6 +278,21 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
      return quality_spec


+ def _resolve_data_contract_from_str_v2(
+     data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
+ ) -> DataContractSpecification | OpenDataContractStandard:
+     yaml_dict = _to_yaml(data_contract_str)
+
+     if is_open_data_contract_standard(yaml_dict):
+         logging.info("Importing ODCS v3")
+         # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
+         odcs = parse_odcs_v3_from_str(data_contract_str)
+         return odcs
+
+     logging.info("Importing DCS")
+     return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict)
+
+
  def _resolve_data_contract_from_str(
      data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
  ) -> DataContractSpecification:

@@ -250,15 +301,19 @@ def _resolve_data_contract_from_str(
      if is_open_data_contract_standard(yaml_dict):
          logging.info("Importing ODCS v3")
          # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
-         data_contract_specification = DataContractSpecification(dataContractSpecification="1.1.0")
-         return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str)
-     else:
-         logging.info("Importing DCS")
+         odcs = parse_odcs_v3_from_str(data_contract_str)

+         data_contract_specification = DataContractSpecification(dataContractSpecification="1.2.0")
+         return import_from_odcs(data_contract_specification, odcs)
+
+     logging.info("Importing DCS")
+     return _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict)
+
+
+ def _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict):
      _validate_data_contract_specification_schema(yaml_dict, schema_location)
      data_contract_specification = yaml_dict
      spec = DataContractSpecification(**data_contract_specification)
-
      if inline_definitions:
          inline_definitions_into_data_contract(spec)
      ## Suppress DeprecationWarning when accessing spec.quality,

@@ -276,7 +331,6 @@ def _resolve_data_contract_from_str(
      )
      if spec_quality and inline_quality:
          _resolve_quality_ref(spec_quality)
-
      return spec
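Because resolve_data_contract_v2 can hand back either model, callers branch on the concrete type. A hedged sketch, assuming a contract file at ./datacontract.yaml and that both models expose an id attribute:

from open_data_contract_standard.model import OpenDataContractStandard

from datacontract.lint.resolve import resolve_data_contract_v2
from datacontract.model.data_contract_specification import DataContractSpecification

resolved = resolve_data_contract_v2(data_contract_location="datacontract.yaml")

if isinstance(resolved, OpenDataContractStandard):
    # ODCS v3 documents are now parsed and returned as-is rather than
    # converted eagerly into a DataContractSpecification.
    print(f"ODCS v3 contract: {resolved.id}")
elif isinstance(resolved, DataContractSpecification):
    print(f"DCS contract: {resolved.id}")
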
datacontract/lint/schema.py

@@ -8,7 +8,7 @@ import requests

  from datacontract.model.exceptions import DataContractException

- DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.1.0.schema.json"
+ DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.0.schema.json"


  def fetch_schema(location: str = None) -> Dict[str, Any]:
datacontract/schemas/datacontract-1.1.0.init.yaml

@@ -1,4 +1,4 @@
- dataContractSpecification: 1.1.0
+ dataContractSpecification: 1.2.0
  id: my-data-contract-id
  info:
    title: My Data Contract
datacontract/schemas/datacontract-1.2.0.init.yaml (new file)

@@ -0,0 +1,91 @@
+ dataContractSpecification: 1.2.0
+ id: my-data-contract-id
+ info:
+   title: My Data Contract
+   version: 0.0.1
+   # description:
+   # owner:
+   # contact:
+   #   name:
+   #   url:
+   #   email:
+
+
+ ### servers
+
+ #servers:
+ #  production:
+ #    type: s3
+ #    location: s3://
+ #    format: parquet
+ #    delimiter: new_line
+
+ ### terms
+
+ #terms:
+ #  usage:
+ #  limitations:
+ #  billing:
+ #  noticePeriod:
+
+
+ ### models
+
+ # models:
+ #   my_model:
+ #     description:
+ #     type:
+ #     fields:
+ #       my_field:
+ #         type:
+ #         description:
+
+
+ ### definitions
+
+ # definitions:
+ #   my_field:
+ #     domain:
+ #     name:
+ #     title:
+ #     type:
+ #     description:
+ #     example:
+ #     pii:
+ #     classification:
+
+
+ ### servicelevels
+
+ #servicelevels:
+ #  availability:
+ #    description: The server is available during support hours
+ #    percentage: 99.9%
+ #  retention:
+ #    description: Data is retained for one year because!
+ #    period: P1Y
+ #    unlimited: false
+ #  latency:
+ #    description: Data is available within 25 hours after the order was placed
+ #    threshold: 25h
+ #    sourceTimestampField: orders.order_timestamp
+ #    processedTimestampField: orders.processed_timestamp
+ #  freshness:
+ #    description: The age of the youngest row in a table.
+ #    threshold: 25h
+ #    timestampField: orders.order_timestamp
+ #  frequency:
+ #    description: Data is delivered once a day
+ #    type: batch # or streaming
+ #    interval: daily # for batch, either or cron
+ #    cron: 0 0 * * * # for batch, either or interval
+ #  support:
+ #    description: The data is available during typical business hours at headquarters
+ #    time: 9am to 5pm in EST on business days
+ #    responseTime: 1h
+ #  backup:
+ #    description: Data is backed up once a week, every Sunday at 0:00 UTC.
+ #    interval: weekly
+ #    cron: 0 0 * * 0
+ #    recoveryTime: 24 hours
+ #    recoveryPoint: 1 week