datacontract-cli 0.10.12__py3-none-any.whl → 0.10.14__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in a supported public registry. It is provided for informational purposes only.

This version of datacontract-cli might be problematic.

Files changed (37)
  1. datacontract/cli.py +5 -0
  2. datacontract/data_contract.py +9 -1
  3. datacontract/engines/soda/connections/kafka.py +28 -6
  4. datacontract/export/avro_converter.py +8 -1
  5. datacontract/export/avro_idl_converter.py +1 -0
  6. datacontract/export/bigquery_converter.py +30 -23
  7. datacontract/export/data_caterer_converter.py +148 -0
  8. datacontract/export/dcs_exporter.py +6 -0
  9. datacontract/export/exporter.py +5 -1
  10. datacontract/export/exporter_factory.py +19 -1
  11. datacontract/export/jsonschema_converter.py +13 -2
  12. datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +4 -4
  13. datacontract/export/odcs_v3_exporter.py +294 -0
  14. datacontract/export/sodacl_converter.py +82 -2
  15. datacontract/export/spark_converter.py +3 -1
  16. datacontract/export/sql_type_converter.py +56 -21
  17. datacontract/imports/iceberg_importer.py +162 -0
  18. datacontract/imports/importer.py +1 -0
  19. datacontract/imports/importer_factory.py +5 -0
  20. datacontract/imports/odcs_importer.py +25 -168
  21. datacontract/imports/odcs_v2_importer.py +177 -0
  22. datacontract/imports/odcs_v3_importer.py +309 -0
  23. datacontract/imports/spark_importer.py +5 -1
  24. datacontract/imports/unity_importer.py +105 -84
  25. datacontract/integration/datamesh_manager.py +1 -1
  26. datacontract/lint/resolve.py +24 -10
  27. datacontract/lint/resources.py +21 -0
  28. datacontract/lint/urls.py +29 -13
  29. datacontract/model/data_contract_specification.py +72 -8
  30. datacontract/model/odcs.py +11 -0
  31. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/METADATA +106 -52
  32. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/RECORD +36 -29
  33. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/WHEEL +1 -1
  34. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  35. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/LICENSE +0 -0
  36. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/entry_points.txt +0 -0
  37. {datacontract_cli-0.10.12.dist-info → datacontract_cli-0.10.14.dist-info}/top_level.txt +0 -0
datacontract/imports/odcs_v3_importer.py (new file)
@@ -0,0 +1,309 @@
+ import datetime
+ import logging
+ from typing import Any, Dict, List
+ from venv import logger
+
+ import yaml
+
+ from datacontract.imports.importer import Importer
+ from datacontract.lint.resources import read_resource
+ from datacontract.model.data_contract_specification import (
+     Availability,
+     DataContractSpecification,
+     Info,
+     Model,
+     Field,
+     Retention,
+     Server,
+     ServiceLevel,
+     Terms,
+     DATACONTRACT_TYPES,
+ )
+ from datacontract.model.exceptions import DataContractException
+
+
+ class OdcsImporter(Importer):
+     def import_source(
+         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+     ) -> DataContractSpecification:
+         return import_odcs_v3(data_contract_specification, source)
+
+
+ def import_odcs_v3(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+     source_str = read_resource(source)
+     return import_odcs_v3_from_str(data_contract_specification, source_str)
+
+
+ def import_odcs_v3_from_str(
+     data_contract_specification: DataContractSpecification, source_str: str
+ ) -> DataContractSpecification:
+     try:
+         odcs_contract = yaml.safe_load(source_str)
+     except Exception as e:
+         raise DataContractException(
+             type="schema",
+             name="Parse ODCS contract",
+             reason=f"Failed to parse odcs contract from {source_str}",
+             engine="datacontract",
+             original_exception=e,
+         )
+
+     data_contract_specification.id = odcs_contract["id"]
+     data_contract_specification.info = import_info(odcs_contract)
+     data_contract_specification.servers = import_servers(odcs_contract)
+     data_contract_specification.terms = import_terms(odcs_contract)
+     data_contract_specification.servicelevels = import_servicelevels(odcs_contract)
+     data_contract_specification.models = import_models(odcs_contract)
+     data_contract_specification.tags = import_tags(odcs_contract)
+
+     return data_contract_specification
+
+
+ def import_info(odcs_contract: Dict[str, Any]) -> Info:
+     info = Info()
+
+     info.title = odcs_contract.get("name") if odcs_contract.get("name") is not None else ""
+
+     if odcs_contract.get("version") is not None:
+         info.version = odcs_contract.get("version")
+
+     # odcs.description.purpose => datacontract.description
+     if odcs_contract.get("description") is not None and odcs_contract.get("description").get("purpose") is not None:
+         info.description = odcs_contract.get("description").get("purpose")
+
+     # odcs.domain => datacontract.owner
+     if odcs_contract.get("domain") is not None:
+         info.owner = odcs_contract.get("domain")
+
+     # add dataProduct as custom property
+     if odcs_contract.get("dataProduct") is not None:
+         info.dataProduct = odcs_contract.get("dataProduct")
+
+     # add tenant as custom property
+     if odcs_contract.get("tenant") is not None:
+         info.tenant = odcs_contract.get("tenant")
+
+     return info
+
+
+ def import_servers(odcs_contract: Dict[str, Any]) -> Dict[str, Server] | None:
+     if odcs_contract.get("servers") is None:
+         return None
+     servers = {}
+     for odcs_server in odcs_contract.get("servers"):
+         server_name = odcs_server.get("server")
+         if server_name is None:
+             logger.warning("Server name is missing, skipping server")
+             continue
+
+         server = Server()
+         server.type = odcs_server.get("type")
+         server.description = odcs_server.get("description")
+         server.environment = odcs_server.get("environment")
+         server.format = odcs_server.get("format")
+         server.project = odcs_server.get("project")
+         server.dataset = odcs_server.get("dataset")
+         server.path = odcs_server.get("path")
+         server.delimiter = odcs_server.get("delimiter")
+         server.endpointUrl = odcs_server.get("endpointUrl")
+         server.location = odcs_server.get("location")
+         server.account = odcs_server.get("account")
+         server.database = odcs_server.get("database")
+         server.schema_ = odcs_server.get("schema")
+         server.host = odcs_server.get("host")
+         server.port = odcs_server.get("port")
+         server.catalog = odcs_server.get("catalog")
+         server.topic = odcs_server.get("topic")
+         server.http_path = odcs_server.get("http_path")
+         server.token = odcs_server.get("token")
+         server.dataProductId = odcs_server.get("dataProductId")
+         server.outputPortId = odcs_server.get("outputPortId")
+         server.driver = odcs_server.get("driver")
+         server.roles = odcs_server.get("roles")
+
+         servers[server_name] = server
+     return servers
+
+
+ def import_terms(odcs_contract: Dict[str, Any]) -> Terms | None:
+     if odcs_contract.get("description") is None:
+         return None
+     if (
+         odcs_contract.get("description").get("usage") is not None
+         or odcs_contract.get("description").get("limitations") is not None
+         or odcs_contract.get("price") is not None
+     ):
+         terms = Terms()
+         if odcs_contract.get("description").get("usage") is not None:
+             terms.usage = odcs_contract.get("description").get("usage")
+         if odcs_contract.get("description").get("limitations") is not None:
+             terms.limitations = odcs_contract.get("description").get("limitations")
+         if odcs_contract.get("price") is not None:
+             terms.billing = f"{odcs_contract.get('price').get('priceAmount')} {odcs_contract.get('price').get('priceCurrency')} / {odcs_contract.get('price').get('priceUnit')}"
+
+         return terms
+     else:
+         return None
+
+
+ def import_servicelevels(odcs_contract: Dict[str, Any]) -> ServiceLevel:
+     # find the two properties we can map (based on the examples)
+     sla_properties = odcs_contract.get("slaProperties") if odcs_contract.get("slaProperties") is not None else []
+     availability = next((p for p in sla_properties if p["property"] == "generalAvailability"), None)
+     retention = next((p for p in sla_properties if p["property"] == "retention"), None)
+
+     if availability is not None or retention is not None:
+         servicelevel = ServiceLevel()
+
+         if availability is not None:
+             value = availability.get("value")
+             if isinstance(value, datetime.datetime):
+                 value = value.isoformat()
+             servicelevel.availability = Availability(description=value)
+
+         if retention is not None:
+             servicelevel.retention = Retention(period=f"{retention.get('value')}{retention.get('unit')}")
+
+         return servicelevel
+     else:
+         return None
+
+
+ def get_server_type(odcs_contract: Dict[str, Any]) -> str | None:
+     servers = import_servers(odcs_contract)
+     if servers is None or len(servers) == 0:
+         return None
+     # get first server from map
+     server = next(iter(servers.values()))
+     return server.type
+
+
+ def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
+     custom_type_mappings = get_custom_type_mappings(odcs_contract.get("customProperties"))
+
+     odcs_schemas = odcs_contract.get("schema") if odcs_contract.get("schema") is not None else []
+     result = {}
+
+     for odcs_schema in odcs_schemas:
+         schema_name = odcs_schema.get("name")
+         schema_physical_name = odcs_schema.get("physicalName")
+         schema_description = odcs_schema.get("description") if odcs_schema.get("description") is not None else ""
+         model_name = schema_physical_name if schema_physical_name is not None else schema_name
+         model = Model(description=" ".join(schema_description.splitlines()), type="table")
+         model.fields = import_fields(
+             odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
+         )
+         model.title = schema_name
+         if odcs_schema.get("dataGranularityDescription") is not None:
+             model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
+         result[model_name] = model
+
+     return result
+
+
+ def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict[str, Any]:
+     config = {}
+     if odcs_property.get("criticalDataElement") is not None:
+         config["criticalDataElement"] = odcs_property.get("criticalDataElement")
+     if odcs_property.get("encryptedName") is not None:
+         config["encryptedName"] = odcs_property.get("encryptedName")
+     if odcs_property.get("partitionKeyPosition") is not None:
+         config["partitionKeyPosition"] = odcs_property.get("partitionKeyPosition")
+     if odcs_property.get("partitioned") is not None:
+         config["partitioned"] = odcs_property.get("partitioned")
+
+     if odcs_property.get("customProperties") is not None and isinstance(odcs_property.get("customProperties"), list):
+         for item in odcs_property.get("customProperties"):
+             config[item["property"]] = item["value"]
+
+     physical_type = odcs_property.get("physicalType")
+     if physical_type is not None:
+         if server_type == "postgres" or server_type == "postgresql":
+             config["postgresType"] = physical_type
+         elif server_type == "bigquery":
+             config["bigqueryType"] = physical_type
+         elif server_type == "snowflake":
+             config["snowflakeType"] = physical_type
+         elif server_type == "redshift":
+             config["redshiftType"] = physical_type
+         elif server_type == "sqlserver":
+             config["sqlserverType"] = physical_type
+         elif server_type == "databricksType":
+             config["databricksType"] = physical_type
+         else:
+             config["physicalType"] = physical_type
+
+     return config
+
+
+ def has_composite_primary_key(odcs_properties) -> bool:
+     primary_keys = [prop for prop in odcs_properties if prop.get("primaryKey") is not None and prop.get("primaryKey")]
+     return len(primary_keys) > 1
+
+
+ def import_fields(
+     odcs_properties: Dict[str, Any], custom_type_mappings: Dict[str, str], server_type
+ ) -> Dict[str, Field]:
+     logger = logging.getLogger(__name__)
+     result = {}
+
+     if odcs_properties is None:
+         return result
+
+     for odcs_property in odcs_properties:
+         mapped_type = map_type(odcs_property.get("logicalType"), custom_type_mappings)
+         if mapped_type is not None:
+             property_name = odcs_property["name"]
+             description = odcs_property.get("description") if odcs_property.get("description") is not None else None
+             field = Field(
+                 description=" ".join(description.splitlines()) if description is not None else None,
+                 type=mapped_type,
+                 title=odcs_property.get("businessName"),
+                 required=not odcs_property.get("nullable") if odcs_property.get("nullable") is not None else False,
+                 primary=odcs_property.get("primaryKey")
+                 if not has_composite_primary_key(odcs_properties) and odcs_property.get("primaryKey") is not None
+                 else False,
+                 unique=odcs_property.get("unique"),
+                 examples=odcs_property.get("examples") if odcs_property.get("examples") is not None else None,
+                 classification=odcs_property.get("classification")
+                 if odcs_property.get("classification") is not None
+                 else "",
+                 tags=odcs_property.get("tags") if odcs_property.get("tags") is not None else None,
+                 quality=odcs_property.get("quality") if odcs_property.get("quality") is not None else [],
+                 config=import_field_config(odcs_property, server_type),
+             )
+             result[property_name] = field
+         else:
+             logger.info(
+                 f"Can't map {odcs_property.get('column')} to the Datacontract Mapping types, as there is no equivalent or special mapping. Consider introducing a customProperty 'dc_mapping_{odcs_property.get('logicalName')}' that defines your expected type as the 'value'"
+             )
+
+     return result
+
+
+ def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
+     t = odcs_type.lower()
+     if t in DATACONTRACT_TYPES:
+         return t
+     elif custom_mappings.get(t) is not None:
+         return custom_mappings.get(t)
+     else:
+         return None
+
+
+ def get_custom_type_mappings(odcs_custom_properties: List[Any]) -> Dict[str, str]:
+     result = {}
+     if odcs_custom_properties is not None:
+         for prop in odcs_custom_properties:
+             if prop["property"].startswith("dc_mapping_"):
+                 odcs_type_name = prop["property"].substring(11)
+                 datacontract_type = prop["value"]
+                 result[odcs_type_name] = datacontract_type
+
+     return result
+
+
+ def import_tags(odcs_contract) -> List[str] | None:
+     if odcs_contract.get("tags") is None:
+         return None
+     return odcs_contract.get("tags")
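
For orientation, the new ODCS v3 importer maps the contract roughly as follows: name → info.title, version → info.version, description.purpose → info.description, domain → info.owner, servers → servers, description.usage/limitations and price → terms, slaProperties → servicelevels, schema → models, and tags → tags. Below is a minimal, illustrative (untested) sketch of how this module could be exercised; the sample contract values are invented and the snippet is not part of the diff.

import yaml
from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
from datacontract.model.data_contract_specification import DataContractSpecification

# Invented sample ODCS v3 document; the keys follow what the importer above reads.
odcs_yaml = """
id: orders-contract
name: Orders
version: 1.0.0
domain: checkout
description:
  purpose: Order events for analytics
schema:
  - name: orders
    physicalName: orders_v1
    properties:
      - name: order_id
        logicalType: string
        primaryKey: true
"""

spec = import_odcs_v3_from_str(DataContractSpecification(), odcs_yaml)
print(spec.info.title)  # "Orders"
print(spec.info.owner)  # "checkout"
print(spec.models["orders_v1"].fields["order_id"].type)  # "string"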
datacontract/imports/spark_importer.py
@@ -80,6 +80,8 @@ def _field_from_struct_type(spark_field: types.StructField) -> Field:
      """
      field = Field()
      field.required = not spark_field.nullable
+     field.description = spark_field.metadata.get("comment")
+
      return _type_from_data_type(field, spark_field.dataType)


@@ -121,7 +123,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
      """
      if isinstance(spark_type, types.StringType):
          return "string"
-     elif isinstance(spark_type, types.IntegerType):
+     elif isinstance(spark_type, (types.IntegerType, types.ShortType)):
          return "integer"
      elif isinstance(spark_type, types.LongType):
          return "long"
@@ -149,5 +151,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
          return "decimal"
      elif isinstance(spark_type, types.NullType):
          return "null"
+     elif isinstance(spark_type, types.VarcharType):
+         return "varchar"
      else:
          raise ValueError(f"Unsupported Spark type: {spark_type}")
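
In practical terms, the Spark importer now maps ShortType to the same "integer" logical type as IntegerType, maps VarcharType to "varchar", and carries a column comment from the StructField metadata into the field description. A small illustrative snippet (not part of the diff; it assumes a local pyspark installation and that _type_from_data_type assigns the mapped type string to field.type, as the surrounding module suggests):

from pyspark.sql import types
from datacontract.imports.spark_importer import _field_from_struct_type

struct_field = types.StructField(
    "order_count",
    types.ShortType(),
    nullable=False,
    metadata={"comment": "Number of orders per customer"},
)

field = _field_from_struct_type(struct_field)
print(field.type)         # expected "integer": ShortType is now handled like IntegerType
print(field.required)     # True, because nullable=False
print(field.description)  # "Number of orders per customer", read from the metadata comment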
datacontract/imports/unity_importer.py
@@ -1,17 +1,37 @@
  import json
- import requests
  import os
- import typing
+ from typing import List, Optional
+
+ from pyspark.sql import types
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.service.catalog import TableInfo, ColumnInfo

  from datacontract.imports.importer import Importer
+ from datacontract.imports.spark_importer import _field_from_struct_type
  from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
  from datacontract.model.exceptions import DataContractException


  class UnityImporter(Importer):
+     """
+     UnityImporter class for importing data contract specifications from Unity Catalog.
+     """
+
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
      ) -> DataContractSpecification:
+         """
+         Import data contract specification from a source.
+
+         :param data_contract_specification: The data contract specification to be imported.
+         :type data_contract_specification: DataContractSpecification
+         :param source: The source from which to import the data contract specification.
+         :type source: str
+         :param import_args: Additional arguments for the import process.
+         :type import_args: dict
+         :return: The imported data contract specification.
+         :rtype: DataContractSpecification
+         """
          if source is not None:
              data_contract_specification = import_unity_from_json(data_contract_specification, source)
          else:
@@ -24,9 +44,21 @@ class UnityImporter(Importer):
  def import_unity_from_json(
      data_contract_specification: DataContractSpecification, source: str
  ) -> DataContractSpecification:
+     """
+     Import data contract specification from a JSON file.
+
+     :param data_contract_specification: The data contract specification to be imported.
+     :type data_contract_specification: DataContractSpecification
+     :param source: The path to the JSON file.
+     :type source: str
+     :return: The imported data contract specification.
+     :rtype: DataContractSpecification
+     :raises DataContractException: If there is an error parsing the JSON file.
+     """
      try:
          with open(source, "r") as file:
-             unity_schema = json.loads(file.read())
+             json_contents = json.loads(file.read())
+             unity_schema = TableInfo.from_dict(json_contents)
      except json.JSONDecodeError as e:
          raise DataContractException(
              type="schema",
@@ -39,114 +71,103 @@ def import_unity_from_json(


  def import_unity_from_api(
-     data_contract_specification: DataContractSpecification, unity_table_full_name: typing.Optional[str] = None
+     data_contract_specification: DataContractSpecification, unity_table_full_name: Optional[str] = None
  ) -> DataContractSpecification:
-     databricks_instance = os.getenv("DATABRICKS_IMPORT_INSTANCE")
-     access_token = os.getenv("DATABRICKS_IMPORT_ACCESS_TOKEN")
-
-     if not databricks_instance or not access_token:
-         print("Missing environment variables for Databricks instance or access token.")
-         print("Both, $DATABRICKS_IMPORT_INSTANCE and $DATABRICKS_IMPORT_ACCESS_TOKEN must be set.")
-         exit(1) # Exit if variables are not set
-
-     api_url = f"{databricks_instance}/api/2.1/unity-catalog/tables/{unity_table_full_name}"
-
-     headers = {"Authorization": f"Bearer {access_token}"}
-     response = requests.get(api_url, headers=headers)
-
-     if response.status_code != 200:
+     """
+     Import data contract specification from Unity Catalog API.
+
+     :param data_contract_specification: The data contract specification to be imported.
+     :type data_contract_specification: DataContractSpecification
+     :param unity_table_full_name: The full name of the Unity table.
+     :type unity_table_full_name: Optional[str]
+     :return: The imported data contract specification.
+     :rtype: DataContractSpecification
+     :raises DataContractException: If there is an error retrieving the schema from the API.
+     """
+     try:
+         workspace_client = WorkspaceClient()
+         unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+     except Exception as e:
          raise DataContractException(
              type="schema",
              name="Retrieve unity catalog schema",
-             reason=f"Failed to retrieve unity catalog schema from databricks instance: {response.status_code} {response.text}",
+             reason=f"Failed to retrieve unity catalog schema from databricks profile: {os.getenv('DATABRICKS_CONFIG_PROFILE')}",
              engine="datacontract",
+             original_exception=e,
          )

-     convert_unity_schema(data_contract_specification, response.json())
+     convert_unity_schema(data_contract_specification, unity_schema)

      return data_contract_specification


  def convert_unity_schema(
-     data_contract_specification: DataContractSpecification, unity_schema: dict
+     data_contract_specification: DataContractSpecification, unity_schema: TableInfo
  ) -> DataContractSpecification:
+     """
+     Convert Unity schema to data contract specification.
+
+     :param data_contract_specification: The data contract specification to be converted.
+     :type data_contract_specification: DataContractSpecification
+     :param unity_schema: The Unity schema to be converted.
+     :type unity_schema: TableInfo
+     :return: The converted data contract specification.
+     :rtype: DataContractSpecification
+     """
      if data_contract_specification.models is None:
          data_contract_specification.models = {}

-     fields = import_table_fields(unity_schema.get("columns"))
+     fields = import_table_fields(unity_schema.columns)

-     table_id = unity_schema.get("table_id")
+     table_id = unity_schema.name or unity_schema.table_id

      data_contract_specification.models[table_id] = Model(fields=fields, type="table")

-     if unity_schema.get("name") is not None:
-         data_contract_specification.models[table_id].title = unity_schema.get("name")
+     if unity_schema.name:
+         data_contract_specification.models[table_id].title = unity_schema.name
+
+     if unity_schema.comment:
+         data_contract_specification.models[table_id].description = unity_schema.comment

      return data_contract_specification


- def import_table_fields(table_fields):
+ def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
+     """
+     Import table fields from Unity schema columns.
+
+     Here we are first converting the `ColumnInfo.type_json` to a Spark StructField object
+     so we can leave the complexity of the Spark field types to the Spark JSON schema parser,
+     then re-use the logic in `datacontract.imports.spark_importer` to convert the StructField
+     into a Field object.
+
+     :param columns: The list of Unity schema columns.
+     :type columns: List[ColumnInfo]
+     :return: A dictionary of imported fields.
+     :rtype: dict[str, Field]
+     """
      imported_fields = {}
-     for field in table_fields:
-         field_name = field.get("name")
-         imported_fields[field_name] = Field()
-         imported_fields[field_name].required = field.get("nullable") == "false"
-         imported_fields[field_name].description = field.get("comment")
-
-         # databricks api 2.1 specifies that type_name can be any of:
-         # BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DATE | TIMESTAMP | TIMESTAMP_NTZ | STRING
-         # | BINARY | DECIMAL | INTERVAL | ARRAY | STRUCT | MAP | CHAR | NULL | USER_DEFINED_TYPE | TABLE_TYPE
-         if field.get("type_name") in ["INTERVAL", "ARRAY", "STRUCT", "MAP", "USER_DEFINED_TYPE", "TABLE_TYPE"]:
-             # complex types are not supported, yet
-             raise DataContractException(
-                 type="schema",
-                 result="failed",
-                 name="Map unity type to data contract type",
-                 reason=f"type ${field.get('type_name')} is not supported yet for unity import",
-                 engine="datacontract",
-             )

-         imported_fields[field_name].type = map_type_from_unity(field.get("type_name"))
+     for column in columns:
+         struct_field: types.StructField = _type_json_to_spark_field(column.type_json)
+         imported_fields[column.name] = _field_from_struct_type(struct_field)

      return imported_fields


- def map_type_from_unity(type_str: str):
-     if type_str == "BOOLEAN":
-         return "boolean"
-     elif type_str == "BYTE":
-         return "bytes"
-     elif type_str == "SHORT":
-         return "int"
-     elif type_str == "INT":
-         return "int"
-     elif type_str == "LONG":
-         return "long"
-     elif type_str == "FLOAT":
-         return "float"
-     elif type_str == "DOUBLE":
-         return "double"
-     elif type_str == "DATE":
-         return "date"
-     elif type_str == "TIMESTAMP":
-         return "timestamp"
-     elif type_str == "TIMESTAMP_NTZ":
-         return "timestamp_ntz"
-     elif type_str == "STRING":
-         return "string"
-     elif type_str == "BINARY":
-         return "bytes"
-     elif type_str == "DECIMAL":
-         return "decimal"
-     elif type_str == "CHAR":
-         return "varchar"
-     elif type_str == "NULL":
-         return "null"
-     else:
-         raise DataContractException(
-             type="schema",
-             result="failed",
-             name="Map unity type to data contract type",
-             reason=f"Unsupported type {type_str} in unity json definition.",
-             engine="datacontract",
-         )
+ def _type_json_to_spark_field(type_json: str) -> types.StructField:
+     """
+     Parses a JSON string representing a Spark field and returns a StructField object.
+
+     The reason we do this is to leverage the Spark JSON schema parser to handle the
+     complexity of the Spark field types. The field `type_json` in the Unity API is
+     the output of a `StructField.jsonValue()` call.
+
+     :param type_json: The JSON string representing the Spark field.
+     :type type_json: str
+
+     :return: The StructField object.
+     :rtype: types.StructField
+     """
+     type_dict = json.loads(type_json)
+     return types.StructField.fromJson(type_dict)
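
The docstrings above describe the key idea behind the rewritten Unity importer: ColumnInfo.type_json holds the jsonValue() form of a Spark StructField, so the Spark JSON schema parser can take over the type handling. A small illustrative example of that round trip (the sample column is invented; only types.StructField.fromJson from pyspark is exercised):

import json
from pyspark.sql import types

# Invented example of what a Unity ColumnInfo.type_json payload can look like:
# the jsonValue() form of a Spark StructField, as the docstring above notes.
type_json = json.dumps(
    {"name": "customer_id", "type": "string", "nullable": False, "metadata": {"comment": "Primary key"}}
)

struct_field = types.StructField.fromJson(json.loads(type_json))
print(struct_field.name)      # customer_id
print(struct_field.dataType)  # a StringType instance
print(struct_field.metadata)  # {'comment': 'Primary key'}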
datacontract/integration/datamesh_manager.py
@@ -23,7 +23,7 @@ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
      )

      if run.dataContractId is None:
-         raise Exception("Cannot publish run results, as data contract ID is unknown")
+         raise Exception("Cannot publish run results for unknown data contract ID")

      headers = {"Content-Type": "application/json", "x-api-key": api_key}
      request_body = run.model_dump_json()
datacontract/lint/resolve.py
@@ -5,11 +5,13 @@ import fastjsonschema
  import yaml
  from fastjsonschema import JsonSchemaValueException

- from datacontract.lint.files import read_file
+ from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
+ from datacontract.lint.resources import read_resource
  from datacontract.lint.schema import fetch_schema
  from datacontract.lint.urls import fetch_resource
  from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Quality
  from datacontract.model.exceptions import DataContractException
+ from datacontract.model.odcs import is_open_data_contract_standard


  def resolve_data_contract(
@@ -41,10 +43,7 @@ def resolve_data_contract(
  def resolve_data_contract_from_location(
      location, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
  ) -> DataContractSpecification:
-     if location.startswith("http://") or location.startswith("https://"):
-         data_contract_str = fetch_resource(location)
-     else:
-         data_contract_str = read_file(location)
+     data_contract_str = read_resource(location)
      return _resolve_data_contract_from_str(data_contract_str, schema_location, inline_definitions, inline_quality)


@@ -114,7 +113,16 @@ def _resolve_definition_ref(ref, spec) -> Definition:

  def _find_by_path_in_spec(definition_path: str, spec: DataContractSpecification):
      path_elements = definition_path.split("/")
-     definition = spec.definitions[path_elements[2]]
+     definition_key = path_elements[2]
+     if definition_key not in spec.definitions:
+         raise DataContractException(
+             type="lint",
+             result="failed",
+             name="Check that data contract YAML is valid",
+             reason=f"Cannot resolve definition {definition_key}",
+             engine="datacontract",
+         )
+     definition = spec.definitions[definition_key]
      definition = _find_subfield_in_definition(definition, path_elements[3:])
      return definition

@@ -187,10 +195,16 @@ def _get_quality_ref_file(quality_spec: str | object) -> str | object:
  def _resolve_data_contract_from_str(
      data_contract_str, schema_location: str = None, inline_definitions: bool = False, inline_quality: bool = False
  ) -> DataContractSpecification:
-     data_contract_yaml_dict = _to_yaml(data_contract_str)
-     _validate(data_contract_yaml_dict, schema_location)
+     yaml_dict = _to_yaml(data_contract_str)
+
+     if is_open_data_contract_standard(yaml_dict):
+         # if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
+         data_contract_specification = DataContractSpecification(dataContractSpecification="0.9.3")
+         return import_odcs_v3_from_str(data_contract_specification, source_str=data_contract_str)

-     spec = DataContractSpecification(**data_contract_yaml_dict)
+     _validate_data_contract_specification_schema(yaml_dict, schema_location)
+     data_contract_specification = yaml_dict
+     spec = DataContractSpecification(**data_contract_specification)

      if inline_definitions:
          inline_definitions_into_data_contract(spec)
@@ -215,7 +229,7 @@ def _to_yaml(data_contract_str):
      )


- def _validate(data_contract_yaml, schema_location: str = None):
+ def _validate_data_contract_specification_schema(data_contract_yaml, schema_location: str = None):
      schema = fetch_schema(schema_location)
      try:
          fastjsonschema.validate(schema, data_contract_yaml)
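
The net effect of the resolve.py changes: a document handed to the resolver may now be either a native Data Contract Specification YAML or an ODCS document; ODCS input is detected via is_open_data_contract_standard and routed through the new ODCS v3 importer instead of being validated against the Data Contract Specification JSON schema. A condensed, illustrative sketch of that dispatch (load_contract is a hypothetical helper; the real _resolve_data_contract_from_str additionally validates the schema and can inline definitions and quality):

import yaml

from datacontract.imports.odcs_v3_importer import import_odcs_v3_from_str
from datacontract.model.data_contract_specification import DataContractSpecification
from datacontract.model.odcs import is_open_data_contract_standard


def load_contract(contract_str: str) -> DataContractSpecification:
    # Mirrors the branch added to _resolve_data_contract_from_str above.
    yaml_dict = yaml.safe_load(contract_str)
    if is_open_data_contract_standard(yaml_dict):
        # ODCS documents are converted through the importer.
        spec = DataContractSpecification(dataContractSpecification="0.9.3")
        return import_odcs_v3_from_str(spec, source_str=contract_str)
    # Everything else is treated as a native Data Contract Specification document.
    return DataContractSpecification(**yaml_dict)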