datacontract-cli 0.10.11__py3-none-any.whl → 0.10.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (40)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +5 -10
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -24
  7. datacontract/engines/soda/connections/kafka.py +84 -25
  8. datacontract/export/avro_converter.py +12 -2
  9. datacontract/export/bigquery_converter.py +30 -23
  10. datacontract/export/data_caterer_converter.py +148 -0
  11. datacontract/export/dbml_converter.py +3 -2
  12. datacontract/export/exporter.py +2 -0
  13. datacontract/export/exporter_factory.py +12 -0
  14. datacontract/export/jsonschema_converter.py +13 -2
  15. datacontract/export/spark_converter.py +5 -1
  16. datacontract/export/sql_type_converter.py +65 -39
  17. datacontract/export/sqlalchemy_converter.py +169 -0
  18. datacontract/imports/avro_importer.py +1 -0
  19. datacontract/imports/bigquery_importer.py +2 -2
  20. datacontract/imports/dbml_importer.py +112 -0
  21. datacontract/imports/dbt_importer.py +67 -91
  22. datacontract/imports/glue_importer.py +62 -58
  23. datacontract/imports/importer.py +2 -1
  24. datacontract/imports/importer_factory.py +5 -0
  25. datacontract/imports/odcs_importer.py +1 -1
  26. datacontract/imports/spark_importer.py +34 -11
  27. datacontract/imports/sql_importer.py +1 -1
  28. datacontract/imports/unity_importer.py +106 -85
  29. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  30. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  31. datacontract/lint/resolve.py +10 -1
  32. datacontract/lint/urls.py +27 -13
  33. datacontract/model/data_contract_specification.py +6 -2
  34. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/METADATA +123 -32
  35. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/RECORD +39 -37
  36. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/WHEEL +1 -1
  37. datacontract/publish/publish.py +0 -32
  38. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/LICENSE +0 -0
  39. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/entry_points.txt +0 -0
  40. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/top_level.txt +0 -0
datacontract/imports/unity_importer.py CHANGED
@@ -1,17 +1,37 @@
 import json
-import requests
 import os
-import typing
+from typing import List, Optional
+
+from pyspark.sql import types
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.catalog import TableInfo, ColumnInfo
 
 from datacontract.imports.importer import Importer
+from datacontract.imports.spark_importer import _field_from_struct_type
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
 
 class UnityImporter(Importer):
+    """
+    UnityImporter class for importing data contract specifications from Unity Catalog.
+    """
+
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
+        """
+        Import data contract specification from a source.
+
+        :param data_contract_specification: The data contract specification to be imported.
+        :type data_contract_specification: DataContractSpecification
+        :param source: The source from which to import the data contract specification.
+        :type source: str
+        :param import_args: Additional arguments for the import process.
+        :type import_args: dict
+        :return: The imported data contract specification.
+        :rtype: DataContractSpecification
+        """
         if source is not None:
             data_contract_specification = import_unity_from_json(data_contract_specification, source)
         else:
@@ -24,9 +44,21 @@ class UnityImporter(Importer):
 def import_unity_from_json(
     data_contract_specification: DataContractSpecification, source: str
 ) -> DataContractSpecification:
+    """
+    Import data contract specification from a JSON file.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param source: The path to the JSON file.
+    :type source: str
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error parsing the JSON file.
+    """
     try:
         with open(source, "r") as file:
-            unity_schema = json.loads(file.read())
+            json_contents = json.loads(file.read())
+            unity_schema = TableInfo.from_dict(json_contents)
     except json.JSONDecodeError as e:
         raise DataContractException(
             type="schema",
@@ -39,114 +71,103 @@ def import_unity_from_json(
 
 
 def import_unity_from_api(
-    data_contract_specification: DataContractSpecification, unity_table_full_name: typing.Optional[str] = None
+    data_contract_specification: DataContractSpecification, unity_table_full_name: Optional[str] = None
 ) -> DataContractSpecification:
-    databricks_instance = os.getenv("DATABRICKS_IMPORT_INSTANCE")
-    access_token = os.getenv("DATABRICKS_IMPORT_ACCESS_TOKEN")
-
-    if not databricks_instance or not access_token:
-        print("Missing environment variables for Databricks instance or access token.")
-        print("Both, $DATABRICKS_IMPORT_INSTANCE and $DATABRICKS_IMPORT_ACCESS_TOKEN must be set.")
-        exit(1)  # Exit if variables are not set
-
-    api_url = f"{databricks_instance}/api/2.1/unity-catalog/tables/{unity_table_full_name}"
-
-    headers = {"Authorization": f"Bearer {access_token}"}
-    response = requests.get(api_url, headers=headers)
-
-    if response.status_code != 200:
+    """
+    Import data contract specification from Unity Catalog API.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_table_full_name: The full name of the Unity table.
+    :type unity_table_full_name: Optional[str]
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error retrieving the schema from the API.
+    """
+    try:
+        workspace_client = WorkspaceClient()
+        unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+    except Exception as e:
         raise DataContractException(
             type="schema",
             name="Retrieve unity catalog schema",
-            reason=f"Failed to retrieve unity catalog schema from databricks instance: {response.status_code} {response.text}",
+            reason=f"Failed to retrieve unity catalog schema from databricks profile: {os.getenv('DATABRICKS_CONFIG_PROFILE')}",
             engine="datacontract",
+            original_exception=e,
         )
 
-    convert_unity_schema(data_contract_specification, response.json())
+    convert_unity_schema(data_contract_specification, unity_schema)
 
     return data_contract_specification
 
 
 def convert_unity_schema(
-    data_contract_specification: DataContractSpecification, unity_schema: dict
+    data_contract_specification: DataContractSpecification, unity_schema: TableInfo
 ) -> DataContractSpecification:
+    """
+    Convert Unity schema to data contract specification.
+
+    :param data_contract_specification: The data contract specification to be converted.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_schema: The Unity schema to be converted.
+    :type unity_schema: TableInfo
+    :return: The converted data contract specification.
+    :rtype: DataContractSpecification
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
-    fields = import_table_fields(unity_schema.get("columns"))
+    fields = import_table_fields(unity_schema.columns)
 
-    table_id = unity_schema.get("table_id")
+    table_id = unity_schema.name or unity_schema.table_id
 
     data_contract_specification.models[table_id] = Model(fields=fields, type="table")
 
-    if unity_schema.get("name") is not None:
-        data_contract_specification.models[table_id].title = unity_schema.get("name")
+    if unity_schema.name:
+        data_contract_specification.models[table_id].title = unity_schema.name
+
+    if unity_schema.comment:
+        data_contract_specification.models[table_id].description = unity_schema.comment
 
     return data_contract_specification
 
 
-def import_table_fields(table_fields):
+def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
+    """
+    Import table fields from Unity schema columns.
+
+    Here we are first converting the `ColumnInfo.type_json` to a Spark StructField object
+    so we can leave the complexity of the Spark field types to the Spark JSON schema parser,
+    then re-use the logic in `datacontract.imports.spark_importer` to convert the StructField
+    into a Field object.
+
+    :param columns: The list of Unity schema columns.
+    :type columns: List[ColumnInfo]
+    :return: A dictionary of imported fields.
+    :rtype: dict[str, Field]
+    """
     imported_fields = {}
-    for field in table_fields:
-        field_name = field.get("name")
-        imported_fields[field_name] = Field()
-        imported_fields[field_name].required = field.get("nullable") == "false"
-        imported_fields[field_name].description = field.get("comment")
-
-        # databricks api 2.1 specifies that type_name can be any of:
-        # BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DATE | TIMESTAMP | TIMESTAMP_NTZ | STRING
-        # | BINARY | DECIMAL | INTERVAL | ARRAY | STRUCT | MAP | CHAR | NULL | USER_DEFINED_TYPE | TABLE_TYPE
-        if field.get("type_name") in ["INTERVAL", "ARRAY", "STRUCT", "MAP", "USER_DEFINED_TYPE", "TABLE_TYPE"]:
-            # complex types are not supported, yet
-            raise DataContractException(
-                type="schema",
-                result="failed",
-                name="Map unity type to data contract type",
-                reason=f"type ${field.get('type_name')} is not supported yet for unity import",
-                engine="datacontract",
-            )
 
-        imported_fields[field_name].type = map_type_from_unity(field.get("type_name"))
+    for column in columns:
+        struct_field: types.StructField = _type_json_to_spark_field(column.type_json)
+        imported_fields[column.name] = _field_from_struct_type(struct_field)
 
     return imported_fields
 
 
-def map_type_from_unity(type_str: str):
-    if type_str == "BOOLEAN":
-        return "boolean"
-    elif type_str == "BYTE":
-        return "bytes"
-    elif type_str == "SHORT":
-        return "int"
-    elif type_str == "INT":
-        return "int"
-    elif type_str == "LONG":
-        return "long"
-    elif type_str == "FLOAT":
-        return "float"
-    elif type_str == "DOUBLE":
-        return "double"
-    elif type_str == "DATE":
-        return "date"
-    elif type_str == "TIMESTAMP":
-        return "timestamp"
-    elif type_str == "TIMESTAMP_NTZ":
-        return "timestamp_ntz"
-    elif type_str == "STRING":
-        return "string"
-    elif type_str == "BINARY":
-        return "bytes"
-    elif type_str == "DECIMAL":
-        return "decimal"
-    elif type_str == "CHAR":
-        return "varchar"
-    elif type_str == "NULL":
-        return "null"
-    else:
-        raise DataContractException(
-            type="schema",
-            result="failed",
-            name="Map unity type to data contract type",
-            reason=f"Unsupported type {type_str} in unity json definition.",
-            engine="datacontract",
-        )
+def _type_json_to_spark_field(type_json: str) -> types.StructField:
+    """
+    Parses a JSON string representing a Spark field and returns a StructField object.
+
+    The reason we do this is to leverage the Spark JSON schema parser to handle the
+    complexity of the Spark field types. The field `type_json` in the Unity API is
+    the output of a `StructField.jsonValue()` call.
+
+    :param type_json: The JSON string representing the Spark field.
+    :type type_json: str
+
+    :return: The StructField object.
+    :rtype: types.StructField
+    """
+    type_dict = json.loads(type_json)
+    return types.StructField.fromJson(type_dict)
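The new `_type_json_to_spark_field` helper defers all type parsing to PySpark instead of mapping Unity type names by hand. A minimal sketch of that round-trip, using an illustrative `type_json` value rather than a real Unity Catalog response:

# Sketch: Unity Catalog's ColumnInfo.type_json holds the output of
# StructField.jsonValue(), which PySpark can parse back directly.
import json

from pyspark.sql import types

# A nested column as Unity Catalog might describe it (illustrative value).
type_json = json.dumps(
    {
        "name": "address",
        "type": {
            "type": "struct",
            "fields": [
                {"name": "city", "type": "string", "nullable": True, "metadata": {}},
                {"name": "zip", "type": "string", "nullable": True, "metadata": {}},
            ],
        },
        "nullable": True,
        "metadata": {},
    }
)

struct_field = types.StructField.fromJson(json.loads(type_json))
print(struct_field.dataType)  # StructType with city and zip fields, no hand-written mapping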
datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} CHANGED
@@ -2,28 +2,29 @@ import os
 
 import requests
 
+from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.model.run import Run
 
 
-def publish_datamesh_manager(run: Run, publish_url: str):
+def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
     try:
         if publish_url is None:
             # this url supports Data Mesh Manager and Data Contract Manager
            url = "https://api.datamesh-manager.com/api/test-results"
         else:
             url = publish_url
+
         api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
         if api_key is None:
             api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
-
-        if run.dataContractId is None:
-            raise Exception("Cannot publish run results, as data contract ID is unknown")
-
         if api_key is None:
             raise Exception(
                 "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
             )
 
+        if run.dataContractId is None:
+            raise Exception("Cannot publish run results, as data contract ID is unknown")
+
         headers = {"Content-Type": "application/json", "x-api-key": api_key}
         request_body = run.model_dump_json()
         # print("Request Body:", request_body)
@@ -36,3 +37,30 @@ def publish_datamesh_manager(run: Run, publish_url: str):
         run.log_info(f"Published test results to {url}")
     except Exception as e:
         run.log_error(f"Failed publishing test results. Error: {str(e)}")
+
+
+def publish_data_contract_to_datamesh_manager(data_contract_specification: DataContractSpecification):
+    try:
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
+            )
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
+        spec = data_contract_specification
+        id = spec.id
+        url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+        request_body = spec.model_dump_json().encode("utf-8")
+        response = requests.put(
+            url=url,
+            data=request_body,
+            headers=headers,
+        )
+        if response.status_code != 200:
+            print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+            exit(1)
+        print(f"Published data contract to {url}")
+    except Exception as e:
+        print(f"Failed publishing data contract. Error: {str(e)}")
datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} CHANGED
@@ -34,7 +34,7 @@ from datacontract.model.run import Run
 # - Metrics only, no logs yet (but loosely planned)
 
 
-def publish_opentelemetry(run: Run):
+def publish_test_results_to_opentelemetry(run: Run):
     try:
         if run.dataContractId is None:
             raise Exception("Cannot publish run results, as data contract ID is unknown")
datacontract/lint/resolve.py CHANGED
@@ -114,7 +114,16 @@ def _resolve_definition_ref(ref, spec) -> Definition:
 
 def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification):
     path_elements = definition_path.split("/")
-    definition = spec.definitions[path_elements[2]]
+    definition_key = path_elements[2]
+    if definition_key not in spec.definitions:
+        raise DataContractException(
+            type="lint",
+            result="failed",
+            name="Check that data contract YAML is valid",
+            reason=f"Cannot resolve definition {definition_key}",
+            engine="datacontract",
+        )
+    definition = spec.definitions[definition_key]
     definition = _find_subfield_in_definition(definition, path_elements[3:])
     return definition
 
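For context on why `path_elements[2]` is the definition key: a `$ref` such as `#/definitions/order_id` (illustrative value) splits as follows:

definition_path = "#/definitions/order_id"  # illustrative $ref value
path_elements = definition_path.split("/")
print(path_elements)     # ['#', 'definitions', 'order_id']
print(path_elements[2])  # 'order_id' — the key that the new guard validates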
datacontract/lint/urls.py CHANGED
@@ -25,16 +25,30 @@ def fetch_resource(url: str):
 
 
 def _set_api_key(headers, url):
-    if ".datamesh-manager.com/" not in url:
-        return
-    datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
-    if datamesh_manager_api_key is None or datamesh_manager_api_key == "":
-        print("Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
-        raise DataContractException(
-            type="lint",
-            name=f"Reading data contract from {url}",
-            reason="Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.",
-            engine="datacontract",
-            result="error",
-        )
-    headers["x-api-key"] = datamesh_manager_api_key
+    if ".datamesh-manager.com/" in url:
+        datamesh_manager_api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if datamesh_manager_api_key is None or datamesh_manager_api_key == "":
+            print("Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.")
+            raise DataContractException(
+                type="lint",
+                name=f"Reading data contract from {url}",
+                reason="Error: Data Mesh Manager API Key is not set. Set env variable DATAMESH_MANAGER_API_KEY.",
+                engine="datacontract",
+                result="error",
+            )
+        headers["x-api-key"] = datamesh_manager_api_key
+    elif ".datacontract-manager.com/" in url:
+        datacontract_manager_api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if datacontract_manager_api_key is None or datacontract_manager_api_key == "":
+            print("Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.")
+            raise DataContractException(
+                type="lint",
+                name=f"Reading data contract from {url}",
+                reason="Error: Data Contract Manager API Key is not set. Set env variable DATACONTRACT_MANAGER_API_KEY.",
+                engine="datacontract",
+                result="error",
+            )
+        headers["x-api-key"] = datacontract_manager_api_key
+    else:
+        # do nothing
+        pass
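A quick sanity-check sketch of the new key-selection behavior (the URLs are illustrative, and this assumes the module path shown above):

import os

from datacontract.lint.urls import _set_api_key

os.environ["DATACONTRACT_MANAGER_API_KEY"] = "dcm-key"  # placeholder key

headers = {}
_set_api_key(headers, "https://app.datacontract-manager.com/acme/datacontracts/orders.yaml")
assert headers["x-api-key"] == "dcm-key"  # Data Contract Manager URLs now get their own key

headers = {}
_set_api_key(headers, "https://example.com/datacontract.yaml")
assert "x-api-key" not in headers  # all other URLs are left untouched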
datacontract/model/data_contract_specification.py CHANGED
@@ -73,7 +73,7 @@ class Definition(pyd.BaseModel):
     exclusiveMaximum: int = None
     pii: bool = None
     classification: str = None
-    fields: Dict[str, "Definition"] = {}
+    fields: Dict[str, "Field"] = {}
     tags: List[str] = []
     links: Dict[str, str] = {}
     example: str = None
@@ -239,4 +239,8 @@ class DataContractSpecification(pyd.BaseModel):
         return DataContractSpecification(**data)
 
     def to_yaml(self):
-        return yaml.dump(self.model_dump(exclude_defaults=True, exclude_none=True), sort_keys=False, allow_unicode=True)
+        return yaml.dump(
+            self.model_dump(exclude_defaults=True, exclude_none=True, by_alias=True),
+            sort_keys=False,
+            allow_unicode=True,
+        )
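The `by_alias=True` addition matters whenever a model field is declared with a pydantic alias; without it, `to_yaml()` would emit the internal field name instead of the alias. A sketch with a hypothetical aliased field, not the actual datacontract model:

import pydantic as pyd
import yaml

class Example(pyd.BaseModel):
    # hypothetical field: aliased because "schema" collides with BaseModel internals
    model_config = pyd.ConfigDict(populate_by_name=True)
    schema_: str = pyd.Field(default=None, alias="schema")

e = Example(schema_="my-schema")
print(yaml.dump(e.model_dump()))               # schema_: my-schema
print(yaml.dump(e.model_dump(by_alias=True)))  # schema: my-schema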