datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +12 -5
  3. datacontract/catalog/catalog.py +5 -3
  4. datacontract/cli.py +119 -13
  5. datacontract/data_contract.py +145 -67
  6. datacontract/engines/data_contract_checks.py +366 -60
  7. datacontract/engines/data_contract_test.py +50 -4
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  10. datacontract/engines/soda/check_soda_execute.py +27 -3
  11. datacontract/engines/soda/connections/athena.py +79 -0
  12. datacontract/engines/soda/connections/duckdb_connection.py +65 -6
  13. datacontract/engines/soda/connections/kafka.py +4 -2
  14. datacontract/engines/soda/connections/oracle.py +50 -0
  15. datacontract/export/avro_converter.py +20 -3
  16. datacontract/export/bigquery_converter.py +1 -1
  17. datacontract/export/dbt_converter.py +36 -7
  18. datacontract/export/dqx_converter.py +126 -0
  19. datacontract/export/duckdb_type_converter.py +57 -0
  20. datacontract/export/excel_exporter.py +923 -0
  21. datacontract/export/exporter.py +3 -0
  22. datacontract/export/exporter_factory.py +17 -1
  23. datacontract/export/great_expectations_converter.py +55 -5
  24. datacontract/export/{html_export.py → html_exporter.py} +31 -20
  25. datacontract/export/markdown_converter.py +134 -5
  26. datacontract/export/mermaid_exporter.py +110 -0
  27. datacontract/export/odcs_v3_exporter.py +193 -149
  28. datacontract/export/protobuf_converter.py +163 -69
  29. datacontract/export/rdf_converter.py +2 -2
  30. datacontract/export/sodacl_converter.py +9 -1
  31. datacontract/export/spark_converter.py +31 -4
  32. datacontract/export/sql_converter.py +6 -2
  33. datacontract/export/sql_type_converter.py +124 -8
  34. datacontract/imports/avro_importer.py +63 -12
  35. datacontract/imports/csv_importer.py +111 -57
  36. datacontract/imports/excel_importer.py +1112 -0
  37. datacontract/imports/importer.py +16 -3
  38. datacontract/imports/importer_factory.py +17 -0
  39. datacontract/imports/json_importer.py +325 -0
  40. datacontract/imports/odcs_importer.py +2 -2
  41. datacontract/imports/odcs_v3_importer.py +367 -151
  42. datacontract/imports/protobuf_importer.py +264 -0
  43. datacontract/imports/spark_importer.py +117 -13
  44. datacontract/imports/sql_importer.py +32 -16
  45. datacontract/imports/unity_importer.py +84 -38
  46. datacontract/init/init_template.py +1 -1
  47. datacontract/integration/entropy_data.py +126 -0
  48. datacontract/lint/resolve.py +112 -23
  49. datacontract/lint/schema.py +24 -15
  50. datacontract/lint/urls.py +17 -3
  51. datacontract/model/data_contract_specification/__init__.py +1 -0
  52. datacontract/model/odcs.py +13 -0
  53. datacontract/model/run.py +3 -0
  54. datacontract/output/junit_test_results.py +3 -3
  55. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  56. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  57. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  58. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  59. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  60. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  61. datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
  62. datacontract/templates/datacontract.html +54 -3
  63. datacontract/templates/datacontract_odcs.html +685 -0
  64. datacontract/templates/index.html +5 -2
  65. datacontract/templates/partials/server.html +2 -0
  66. datacontract/templates/style/output.css +319 -145
  67. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
  68. datacontract_cli-0.10.40.dist-info/RECORD +121 -0
  69. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
  70. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
  71. datacontract/export/csv_type_converter.py +0 -36
  72. datacontract/integration/datamesh_manager.py +0 -72
  73. datacontract/lint/lint.py +0 -142
  74. datacontract/lint/linters/description_linter.py +0 -35
  75. datacontract/lint/linters/field_pattern_linter.py +0 -34
  76. datacontract/lint/linters/field_reference_linter.py +0 -48
  77. datacontract/lint/linters/notice_period_linter.py +0 -55
  78. datacontract/lint/linters/quality_schema_linter.py +0 -52
  79. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  80. datacontract/model/data_contract_specification.py +0 -327
  81. datacontract_cli-0.10.23.dist-info/RECORD +0 -113
  82. /datacontract/{lint/linters → output}/__init__.py +0 -0
  83. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
  84. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,11 @@
1
1
  import datetime
2
2
  import logging
3
+ import re
3
4
  from typing import Any, Dict, List
4
5
  from venv import logger
5
6
 
6
- import yaml
7
+ from datacontract_specification.model import Quality
8
+ from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard, SchemaProperty
7
9
 
8
10
  from datacontract.imports.importer import Importer
9
11
  from datacontract.lint.resources import read_resource
@@ -14,9 +16,9 @@ from datacontract.model.data_contract_specification import (
14
16
  Field,
15
17
  Info,
16
18
  Model,
17
- Quality,
18
19
  Retention,
19
20
  Server,
21
+ ServerRole,
20
22
  ServiceLevel,
21
23
  Terms,
22
24
  )
@@ -27,19 +29,20 @@ class OdcsImporter(Importer):
27
29
  def import_source(
28
30
  self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
29
31
  ) -> DataContractSpecification:
30
- return import_odcs_v3(data_contract_specification, source)
32
+ return import_odcs_v3_as_dcs(data_contract_specification, source)
31
33
 
32
34
 
33
- def import_odcs_v3(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
35
+ def import_odcs_v3_as_dcs(
36
+ data_contract_specification: DataContractSpecification, source: str
37
+ ) -> DataContractSpecification:
34
38
  source_str = read_resource(source)
35
- return import_odcs_v3_from_str(data_contract_specification, source_str)
39
+ odcs = parse_odcs_v3_from_str(source_str)
40
+ return import_from_odcs(data_contract_specification, odcs)
36
41
 
37
42
 
38
- def import_odcs_v3_from_str(
39
- data_contract_specification: DataContractSpecification, source_str: str
40
- ) -> DataContractSpecification:
43
+ def parse_odcs_v3_from_str(source_str):
41
44
  try:
42
- odcs_contract = yaml.safe_load(source_str)
45
+ odcs = OpenDataContractStandard.from_string(source_str)
43
46
  except Exception as e:
44
47
  raise DataContractException(
45
48
  type="schema",
@@ -48,130 +51,145 @@ def import_odcs_v3_from_str(
48
51
  engine="datacontract",
49
52
  original_exception=e,
50
53
  )
54
+ return odcs
51
55
 
52
- data_contract_specification.id = odcs_contract["id"]
53
- data_contract_specification.info = import_info(odcs_contract)
54
- data_contract_specification.servers = import_servers(odcs_contract)
55
- data_contract_specification.terms = import_terms(odcs_contract)
56
- data_contract_specification.servicelevels = import_servicelevels(odcs_contract)
57
- data_contract_specification.models = import_models(odcs_contract)
58
- data_contract_specification.tags = import_tags(odcs_contract)
59
56
 
57
+ def import_from_odcs(data_contract_specification: DataContractSpecification, odcs: OpenDataContractStandard):
58
+ data_contract_specification.id = odcs.id
59
+ data_contract_specification.info = import_info(odcs)
60
+ data_contract_specification.servers = import_servers(odcs)
61
+ data_contract_specification.terms = import_terms(odcs)
62
+ data_contract_specification.servicelevels = import_servicelevels(odcs)
63
+ data_contract_specification.models = import_models(odcs)
64
+ data_contract_specification.tags = import_tags(odcs)
60
65
  return data_contract_specification
61
66
 
62
67
 
63
- def import_info(odcs_contract: Dict[str, Any]) -> Info:
68
+ def import_info(odcs: Any) -> Info:
64
69
  info = Info()
65
70
 
66
- info.title = odcs_contract.get("name") if odcs_contract.get("name") is not None else ""
71
+ info.title = odcs.name if odcs.name is not None else ""
67
72
 
68
- if odcs_contract.get("version") is not None:
69
- info.version = odcs_contract.get("version")
73
+ if odcs.version is not None:
74
+ info.version = odcs.version
70
75
 
71
76
  # odcs.description.purpose => datacontract.description
72
- if odcs_contract.get("description") is not None and odcs_contract.get("description").get("purpose") is not None:
73
- info.description = odcs_contract.get("description").get("purpose")
77
+ if odcs.description is not None and odcs.description.purpose is not None:
78
+ info.description = odcs.description.purpose
74
79
 
75
80
  # odcs.domain => datacontract.owner
76
- if odcs_contract.get("domain") is not None:
77
- info.owner = odcs_contract.get("domain")
81
+ owner = get_owner(odcs.customProperties)
82
+ if owner is not None:
83
+ info.owner = owner
78
84
 
79
85
  # add dataProduct as custom property
80
- if odcs_contract.get("dataProduct") is not None:
81
- info.dataProduct = odcs_contract.get("dataProduct")
86
+ if odcs.dataProduct is not None:
87
+ info.dataProduct = odcs.dataProduct
82
88
 
83
89
  # add tenant as custom property
84
- if odcs_contract.get("tenant") is not None:
85
- info.tenant = odcs_contract.get("tenant")
90
+ if odcs.tenant is not None:
91
+ info.tenant = odcs.tenant
86
92
 
87
93
  return info
88
94
 
89
95
 
90
- def import_servers(odcs_contract: Dict[str, Any]) -> Dict[str, Server] | None:
91
- if odcs_contract.get("servers") is None:
96
+ def import_server_roles(roles: List[Dict]) -> List[ServerRole] | None:
97
+ if roles is None:
98
+ return None
99
+ result = []
100
+ for role in roles:
101
+ server_role = ServerRole()
102
+ server_role.name = role.role
103
+ server_role.description = role.description
104
+ result.append(server_role)
105
+
106
+
107
+ def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
108
+ if odcs.servers is None:
92
109
  return None
93
110
  servers = {}
94
- for odcs_server in odcs_contract.get("servers"):
95
- server_name = odcs_server.get("server")
111
+ for odcs_server in odcs.servers:
112
+ server_name = odcs_server.server
96
113
  if server_name is None:
97
114
  logger.warning("Server name is missing, skipping server")
98
115
  continue
99
116
 
100
117
  server = Server()
101
- server.type = odcs_server.get("type")
102
- server.description = odcs_server.get("description")
103
- server.environment = odcs_server.get("environment")
104
- server.format = odcs_server.get("format")
105
- server.project = odcs_server.get("project")
106
- server.dataset = odcs_server.get("dataset")
107
- server.path = odcs_server.get("path")
108
- server.delimiter = odcs_server.get("delimiter")
109
- server.endpointUrl = odcs_server.get("endpointUrl")
110
- server.location = odcs_server.get("location")
111
- server.account = odcs_server.get("account")
112
- server.database = odcs_server.get("database")
113
- server.schema_ = odcs_server.get("schema")
114
- server.host = odcs_server.get("host")
115
- server.port = odcs_server.get("port")
116
- server.catalog = odcs_server.get("catalog")
117
- server.topic = odcs_server.get("topic")
118
- server.http_path = odcs_server.get("http_path")
119
- server.token = odcs_server.get("token")
120
- server.dataProductId = odcs_server.get("dataProductId")
121
- server.outputPortId = odcs_server.get("outputPortId")
122
- server.driver = odcs_server.get("driver")
123
- server.roles = odcs_server.get("roles")
118
+ server.type = odcs_server.type
119
+ server.description = odcs_server.description
120
+ server.environment = odcs_server.environment
121
+ server.format = odcs_server.format
122
+ server.project = odcs_server.project
123
+ server.dataset = odcs_server.dataset
124
+ server.path = odcs_server.path
125
+ server.delimiter = odcs_server.delimiter
126
+ server.endpointUrl = odcs_server.endpointUrl
127
+ server.location = odcs_server.location
128
+ server.account = odcs_server.account
129
+ server.database = odcs_server.database
130
+ server.schema_ = odcs_server.schema_
131
+ server.service_name = odcs_server.serviceName
132
+ server.host = odcs_server.host
133
+ server.port = odcs_server.port
134
+ server.catalog = odcs_server.catalog
135
+ server.stagingDir = odcs_server.stagingDir
136
+ server.topic = getattr(odcs_server, "topic", None)
137
+ server.http_path = getattr(odcs_server, "http_path", None)
138
+ server.token = getattr(odcs_server, "token", None)
139
+ server.driver = getattr(odcs_server, "driver", None)
140
+ server.roles = import_server_roles(odcs_server.roles)
141
+ server.storageAccount = (
142
+ to_azure_storage_account(odcs_server.location)
143
+ if server.type == "azure" and "://" in server.location
144
+ else None
145
+ )
124
146
 
125
147
  servers[server_name] = server
126
148
  return servers
127
149
 
128
150
 
129
- def import_terms(odcs_contract: Dict[str, Any]) -> Terms | None:
130
- if odcs_contract.get("description") is None:
151
+ def import_terms(odcs: Any) -> Terms | None:
152
+ if odcs.description is None:
131
153
  return None
132
- if (
133
- odcs_contract.get("description").get("usage") is not None
134
- or odcs_contract.get("description").get("limitations") is not None
135
- or odcs_contract.get("price") is not None
136
- ):
154
+ if odcs.description.usage is not None or odcs.description.limitations is not None or odcs.price is not None:
137
155
  terms = Terms()
138
- if odcs_contract.get("description").get("usage") is not None:
139
- terms.usage = odcs_contract.get("description").get("usage")
140
- if odcs_contract.get("description").get("limitations") is not None:
141
- terms.limitations = odcs_contract.get("description").get("limitations")
142
- if odcs_contract.get("price") is not None:
143
- terms.billing = f"{odcs_contract.get('price').get('priceAmount')} {odcs_contract.get('price').get('priceCurrency')} / {odcs_contract.get('price').get('priceUnit')}"
156
+ if odcs.description.usage is not None:
157
+ terms.usage = odcs.description.usage
158
+ if odcs.description.limitations is not None:
159
+ terms.limitations = odcs.description.limitations
160
+ if odcs.price is not None:
161
+ terms.billing = f"{odcs.price.priceAmount} {odcs.price.priceCurrency} / {odcs.price.priceUnit}"
144
162
 
145
163
  return terms
146
164
  else:
147
165
  return None
148
166
 
149
167
 
150
- def import_servicelevels(odcs_contract: Dict[str, Any]) -> ServiceLevel:
168
+ def import_servicelevels(odcs: Any) -> ServiceLevel:
151
169
  # find the two properties we can map (based on the examples)
152
- sla_properties = odcs_contract.get("slaProperties") if odcs_contract.get("slaProperties") is not None else []
153
- availability = next((p for p in sla_properties if p["property"] == "generalAvailability"), None)
154
- retention = next((p for p in sla_properties if p["property"] == "retention"), None)
170
+ sla_properties = odcs.slaProperties if odcs.slaProperties is not None else []
171
+ availability = next((p for p in sla_properties if p.property == "generalAvailability"), None)
172
+ retention = next((p for p in sla_properties if p.property == "retention"), None)
155
173
 
156
174
  if availability is not None or retention is not None:
157
175
  servicelevel = ServiceLevel()
158
176
 
159
177
  if availability is not None:
160
- value = availability.get("value")
178
+ value = availability.value
161
179
  if isinstance(value, datetime.datetime):
162
180
  value = value.isoformat()
163
181
  servicelevel.availability = Availability(description=value)
164
182
 
165
183
  if retention is not None:
166
- servicelevel.retention = Retention(period=f"{retention.get('value')}{retention.get('unit')}")
184
+ servicelevel.retention = Retention(period=f"{retention.value}{retention.unit}")
167
185
 
168
186
  return servicelevel
169
187
  else:
170
188
  return None
171
189
 
172
190
 
173
- def get_server_type(odcs_contract: Dict[str, Any]) -> str | None:
174
- servers = import_servers(odcs_contract)
191
+ def get_server_type(odcs: OpenDataContractStandard) -> str | None:
192
+ servers = import_servers(odcs)
175
193
  if servers is None or len(servers) == 0:
176
194
  return None
177
195
  # get first server from map
@@ -179,49 +197,123 @@ def get_server_type(odcs_contract: Dict[str, Any]) -> str | None:
179
197
  return server.type
180
198
 
181
199
 
182
- def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
183
- custom_type_mappings = get_custom_type_mappings(odcs_contract.get("customProperties"))
200
+ def get_composite_primary_keys(properties: List[SchemaProperty]) -> list[str]:
201
+ primary_keys = [
202
+ (property.name, property.primaryKeyPosition)
203
+ for property in properties
204
+ if property.name and property.primaryKey is not None and property.primaryKey
205
+ ]
206
+
207
+ primary_keys.sort(key=lambda x: x[1] or -1)
208
+ return [name for name, _ in primary_keys]
209
+
184
210
 
185
- odcs_schemas = odcs_contract.get("schema") if odcs_contract.get("schema") is not None else []
211
+ def import_models(odcs: Any) -> Dict[str, Model]:
212
+ custom_type_mappings = get_custom_type_mappings(odcs.customProperties)
213
+
214
+ odcs_schemas = odcs.schema_ if odcs.schema_ is not None else []
186
215
  result = {}
187
216
 
188
217
  for odcs_schema in odcs_schemas:
189
- schema_name = odcs_schema.get("name")
190
- schema_physical_name = odcs_schema.get("physicalName")
191
- schema_description = odcs_schema.get("description") if odcs_schema.get("description") is not None else ""
218
+ schema_name = odcs_schema.name
219
+ schema_physical_name = odcs_schema.physicalName
220
+ schema_description = odcs_schema.description if odcs_schema.description is not None else ""
192
221
  model_name = schema_physical_name if schema_physical_name is not None else schema_name
193
- model = Model(description=" ".join(schema_description.splitlines()), type="table")
194
- model.fields = import_fields(
195
- odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
222
+ model = Model(
223
+ description=" ".join(schema_description.splitlines()) if schema_description else "",
224
+ type="table",
225
+ tags=odcs_schema.tags if odcs_schema.tags is not None else None,
196
226
  )
197
- if odcs_schema.get("quality") is not None:
198
- # convert dict to pydantic model
199
-
200
- model.quality = [Quality.model_validate(q) for q in odcs_schema.get("quality")]
227
+ model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs))
228
+ if has_composite_primary_key(odcs_properties=odcs_schema.properties):
229
+ model.primaryKey = get_composite_primary_keys(odcs_schema.properties)
230
+ if odcs_schema.quality is not None:
231
+ model.quality = convert_quality_list(odcs_schema.quality)
201
232
  model.title = schema_name
202
- if odcs_schema.get("dataGranularityDescription") is not None:
203
- model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
233
+ if odcs_schema.dataGranularityDescription is not None:
234
+ model.config = {"dataGranularityDescription": odcs_schema.dataGranularityDescription}
204
235
  result[model_name] = model
205
236
 
206
237
  return result
207
238
 
208
239
 
209
- def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict[str, Any]:
240
+ def convert_quality_list(odcs_quality_list):
241
+ """Convert a list of ODCS DataQuality objects to datacontract Quality objects"""
242
+ quality_list = []
243
+
244
+ if odcs_quality_list is not None:
245
+ for odcs_quality in odcs_quality_list:
246
+ quality = Quality(type=odcs_quality.type)
247
+
248
+ if odcs_quality.description is not None:
249
+ quality.description = odcs_quality.description
250
+ if odcs_quality.query is not None:
251
+ quality.query = odcs_quality.query
252
+ if odcs_quality.rule is not None:
253
+ quality.metric = odcs_quality.rule
254
+ if odcs_quality.mustBe is not None:
255
+ quality.mustBe = odcs_quality.mustBe
256
+ if odcs_quality.mustNotBe is not None:
257
+ quality.mustNotBe = odcs_quality.mustNotBe
258
+ if odcs_quality.mustBeGreaterThan is not None:
259
+ quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan
260
+ if odcs_quality.mustBeGreaterOrEqualTo is not None:
261
+ quality.mustBeGreaterOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
262
+ if odcs_quality.mustBeLessThan is not None:
263
+ quality.mustBeLessThan = odcs_quality.mustBeLessThan
264
+ if odcs_quality.mustBeLessOrEqualTo is not None:
265
+ quality.mustBeLessOrEqualTo = odcs_quality.mustBeLessOrEqualTo
266
+ if odcs_quality.mustBeBetween is not None:
267
+ quality.mustBeBetween = odcs_quality.mustBeBetween
268
+ if odcs_quality.mustNotBeBetween is not None:
269
+ quality.mustNotBeBetween = odcs_quality.mustNotBeBetween
270
+ if odcs_quality.engine is not None:
271
+ quality.engine = odcs_quality.engine
272
+ if odcs_quality.implementation is not None:
273
+ quality.implementation = odcs_quality.implementation
274
+ if odcs_quality.businessImpact is not None:
275
+ quality.model_extra["businessImpact"] = odcs_quality.businessImpact
276
+ if odcs_quality.dimension is not None:
277
+ quality.model_extra["dimension"] = odcs_quality.dimension
278
+ if odcs_quality.schedule is not None:
279
+ quality.model_extra["schedule"] = odcs_quality.schedule
280
+ if odcs_quality.scheduler is not None:
281
+ quality.model_extra["scheduler"] = odcs_quality.scheduler
282
+ if odcs_quality.severity is not None:
283
+ quality.model_extra["severity"] = odcs_quality.severity
284
+ if odcs_quality.method is not None:
285
+ quality.model_extra["method"] = odcs_quality.method
286
+ if odcs_quality.customProperties is not None:
287
+ quality.model_extra["customProperties"] = []
288
+ for item in odcs_quality.customProperties:
289
+ quality.model_extra["customProperties"].append(
290
+ {
291
+ "property": item.property,
292
+ "value": item.value,
293
+ }
294
+ )
295
+
296
+ quality_list.append(quality)
297
+
298
+ return quality_list
299
+
300
+
301
+ def import_field_config(odcs_property: SchemaProperty, server_type=None) -> dict[Any, Any] | None:
210
302
  config = {}
211
- if odcs_property.get("criticalDataElement") is not None:
212
- config["criticalDataElement"] = odcs_property.get("criticalDataElement")
213
- if odcs_property.get("encryptedName") is not None:
214
- config["encryptedName"] = odcs_property.get("encryptedName")
215
- if odcs_property.get("partitionKeyPosition") is not None:
216
- config["partitionKeyPosition"] = odcs_property.get("partitionKeyPosition")
217
- if odcs_property.get("partitioned") is not None:
218
- config["partitioned"] = odcs_property.get("partitioned")
219
-
220
- if odcs_property.get("customProperties") is not None and isinstance(odcs_property.get("customProperties"), list):
221
- for item in odcs_property.get("customProperties"):
222
- config[item["property"]] = item["value"]
223
-
224
- physical_type = odcs_property.get("physicalType")
303
+ if odcs_property.criticalDataElement is not None:
304
+ config["criticalDataElement"] = odcs_property.criticalDataElement
305
+ if odcs_property.encryptedName is not None:
306
+ config["encryptedName"] = odcs_property.encryptedName
307
+ if odcs_property.partitionKeyPosition is not None:
308
+ config["partitionKeyPosition"] = odcs_property.partitionKeyPosition
309
+ if odcs_property.partitioned is not None:
310
+ config["partitioned"] = odcs_property.partitioned
311
+
312
+ if odcs_property.customProperties is not None:
313
+ for item in odcs_property.customProperties:
314
+ config[item.property] = item.value
315
+
316
+ physical_type = odcs_property.physicalType
225
317
  if physical_type is not None:
226
318
  if server_type == "postgres" or server_type == "postgresql":
227
319
  config["postgresType"] = physical_type
@@ -235,82 +327,206 @@ def import_field_config(odcs_property: Dict[str, Any], server_type=None) -> Dict
235
327
  config["sqlserverType"] = physical_type
236
328
  elif server_type == "databricks":
237
329
  config["databricksType"] = physical_type
330
+ elif server_type == "oracle":
331
+ config["oracleType"] = physical_type
238
332
  else:
239
333
  config["physicalType"] = physical_type
240
334
 
335
+ if len(config) == 0:
336
+ return None
337
+
241
338
  return config
242
339
 
243
340
 
244
- def has_composite_primary_key(odcs_properties) -> bool:
245
- primary_keys = [prop for prop in odcs_properties if prop.get("primaryKey") is not None and prop.get("primaryKey")]
341
+ def has_composite_primary_key(odcs_properties: List[SchemaProperty]) -> bool:
342
+ primary_keys = [prop for prop in odcs_properties if prop.primaryKey is not None and prop.primaryKey]
246
343
  return len(primary_keys) > 1
247
344
 
248
345
 
249
346
  def import_fields(
250
- odcs_properties: Dict[str, Any], custom_type_mappings: Dict[str, str], server_type
347
+ odcs_properties: List[SchemaProperty], custom_type_mappings: Dict[str, str], server_type
251
348
  ) -> Dict[str, Field]:
252
- logger = logging.getLogger(__name__)
253
349
  result = {}
254
350
 
255
351
  if odcs_properties is None:
256
352
  return result
257
353
 
258
354
  for odcs_property in odcs_properties:
259
- mapped_type = map_type(odcs_property.get("logicalType"), custom_type_mappings)
260
- if mapped_type is not None:
261
- property_name = odcs_property["name"]
262
- description = odcs_property.get("description") if odcs_property.get("description") is not None else None
263
- field = Field(
264
- description=" ".join(description.splitlines()) if description is not None else None,
265
- type=mapped_type,
266
- title=odcs_property.get("businessName"),
267
- required=not odcs_property.get("nullable") if odcs_property.get("nullable") is not None else False,
268
- primaryKey=odcs_property.get("primaryKey")
269
- if not has_composite_primary_key(odcs_properties) and odcs_property.get("primaryKey") is not None
270
- else False,
271
- unique=odcs_property.get("unique"),
272
- examples=odcs_property.get("examples") if odcs_property.get("examples") is not None else None,
273
- classification=odcs_property.get("classification")
274
- if odcs_property.get("classification") is not None
275
- else "",
276
- tags=odcs_property.get("tags") if odcs_property.get("tags") is not None else None,
277
- quality=odcs_property.get("quality") if odcs_property.get("quality") is not None else [],
278
- config=import_field_config(odcs_property, server_type),
279
- )
280
- result[property_name] = field
281
- else:
282
- logger.info(
283
- f"Can't map {odcs_property.get('column')} to the Datacontract Mapping types, as there is no equivalent or special mapping. Consider introducing a customProperty 'dc_mapping_{odcs_property.get('logicalName')}' that defines your expected type as the 'value'"
284
- )
355
+ field = import_field(odcs_property, odcs_properties, custom_type_mappings, server_type)
356
+ if field is not None:
357
+ result[odcs_property.name] = field
285
358
 
286
359
  return result
287
360
 
288
361
 
289
- def map_type(odcs_type: str, custom_mappings: Dict[str, str]) -> str | None:
290
- if odcs_type is None:
362
+ def import_field(
363
+ odcs_property: SchemaProperty,
364
+ odcs_properties: List[SchemaProperty],
365
+ custom_type_mappings: Dict[str, str],
366
+ server_type: str,
367
+ ) -> Field | None:
368
+ """
369
+ Import a single ODCS property as a datacontract Field.
370
+ Returns None if the property cannot be mapped.
371
+ """
372
+ logger = logging.getLogger(__name__)
373
+
374
+ mapped_type = map_type(odcs_property.logicalType, custom_type_mappings, odcs_property.physicalType)
375
+
376
+ if mapped_type is None:
377
+ type_info = f"logicalType={odcs_property.logicalType}, physicalType={odcs_property.physicalType}"
378
+ logger.warning(
379
+ f"Can't map field '{odcs_property.name}' ({type_info}) to the datacontract mapping types. "
380
+ f"Both logicalType and physicalType are missing or unmappable. "
381
+ f"Consider introducing a customProperty 'dc_mapping_<type>' that defines your expected type as the 'value'"
382
+ )
291
383
  return None
292
- t = odcs_type.lower()
293
- if t in DATACONTRACT_TYPES:
294
- return t
295
- elif custom_mappings.get(t) is not None:
296
- return custom_mappings.get(t)
297
- else:
384
+
385
+ description = odcs_property.description if odcs_property.description is not None else None
386
+ field = Field(
387
+ description=" ".join(description.splitlines()) if description is not None else None,
388
+ type=mapped_type,
389
+ title=odcs_property.businessName,
390
+ required=odcs_property.required if odcs_property.required is not None else None,
391
+ primaryKey=to_primary_key(odcs_property, odcs_properties),
392
+ unique=odcs_property.unique if odcs_property.unique else None,
393
+ examples=odcs_property.examples if odcs_property.examples is not None else None,
394
+ classification=odcs_property.classification if odcs_property.classification is not None else None,
395
+ tags=odcs_property.tags if odcs_property.tags is not None else None,
396
+ quality=convert_quality_list(odcs_property.quality),
397
+ fields=import_fields(odcs_property.properties, custom_type_mappings, server_type)
398
+ if odcs_property.properties is not None
399
+ else {},
400
+ config=import_field_config(odcs_property, server_type),
401
+ format=getattr(odcs_property, "format", None),
402
+ )
403
+
404
+ # mapped_type is array
405
+ if field.type == "array" and odcs_property.items is not None:
406
+ field.items = import_field(odcs_property.items, [], custom_type_mappings, server_type)
407
+
408
+ # enum from quality validValues as enum
409
+ if field.type == "string":
410
+ for q in field.quality:
411
+ if hasattr(q, "validValues"):
412
+ field.enum = q.validValues
413
+
414
+ return field
415
+
416
+
417
+ def to_primary_key(odcs_property: SchemaProperty, odcs_properties: list[SchemaProperty]) -> bool | None:
418
+ if odcs_property.primaryKey is None:
298
419
  return None
420
+ if has_composite_primary_key(odcs_properties):
421
+ return None
422
+ return odcs_property.primaryKey
423
+
424
+
425
+ def map_type(odcs_logical_type: str, custom_mappings: Dict[str, str], physical_type: str = None) -> str | None:
426
+ # Try to map logicalType first
427
+ if odcs_logical_type is not None:
428
+ t = odcs_logical_type.lower()
429
+ if t in DATACONTRACT_TYPES:
430
+ return t
431
+ elif custom_mappings.get(t) is not None:
432
+ return custom_mappings.get(t)
433
+
434
+ # Fallback to physicalType if logicalType is not mapped
435
+ if physical_type is not None:
436
+ pt = physical_type.lower()
437
+ # Remove parameters from physical type (e.g., VARCHAR(50) -> varchar, DECIMAL(10,2) -> decimal)
438
+ pt_base = pt.split("(")[0].strip()
439
+
440
+ # Try direct mapping of physical type
441
+ if pt in DATACONTRACT_TYPES:
442
+ return pt
443
+ elif pt_base in DATACONTRACT_TYPES:
444
+ return pt_base
445
+ elif custom_mappings.get(pt) is not None:
446
+ return custom_mappings.get(pt)
447
+ elif custom_mappings.get(pt_base) is not None:
448
+ return custom_mappings.get(pt_base)
449
+ # Common physical type mappings
450
+ elif pt_base in ["varchar", "char", "nvarchar", "nchar", "text", "ntext", "string", "character varying"]:
451
+ return "string"
452
+ elif pt_base in ["int", "integer", "smallint", "tinyint", "mediumint", "int2", "int4", "int8"]:
453
+ return "int"
454
+ elif pt_base in ["bigint", "long", "int64"]:
455
+ return "long"
456
+ elif pt_base in ["float", "real", "float4", "float8"]:
457
+ return "float"
458
+ elif pt_base in ["double", "double precision"]:
459
+ return "double"
460
+ elif pt_base in ["decimal", "numeric", "number"]:
461
+ return "decimal"
462
+ elif pt_base in ["boolean", "bool", "bit"]:
463
+ return "boolean"
464
+ elif pt_base in ["timestamp", "datetime", "datetime2", "timestamptz", "timestamp with time zone"]:
465
+ return "timestamp"
466
+ elif pt_base in ["date"]:
467
+ return "date"
468
+ elif pt_base in ["time"]:
469
+ return "time"
470
+ elif pt_base in ["json", "jsonb"]:
471
+ return "json"
472
+ elif pt_base in ["array"]:
473
+ return "array"
474
+ elif pt_base in ["object", "struct", "record"]:
475
+ return "object"
476
+ elif pt_base in ["bytes", "binary", "varbinary", "blob", "bytea"]:
477
+ return "bytes"
478
+ else:
479
+ return None
480
+ return None
299
481
 
300
482
 
301
- def get_custom_type_mappings(odcs_custom_properties: List[Any]) -> Dict[str, str]:
483
+ def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]:
302
484
  result = {}
303
485
  if odcs_custom_properties is not None:
304
486
  for prop in odcs_custom_properties:
305
- if prop["property"].startswith("dc_mapping_"):
306
- odcs_type_name = prop["property"].substring(11)
307
- datacontract_type = prop["value"]
487
+ if prop.property.startswith("dc_mapping_"):
488
+ odcs_type_name = prop.property[11:] # Changed substring to slice
489
+ datacontract_type = prop.value
308
490
  result[odcs_type_name] = datacontract_type
309
491
 
310
492
  return result
311
493
 
312
494
 
313
- def import_tags(odcs_contract) -> List[str] | None:
314
- if odcs_contract.get("tags") is None:
495
+ def get_owner(odcs_custom_properties: List[CustomProperty]) -> str | None:
496
+ if odcs_custom_properties is not None:
497
+ for prop in odcs_custom_properties:
498
+ if prop.property == "owner":
499
+ return prop.value
500
+
501
+ return None
502
+
503
+
504
+ def import_tags(odcs: OpenDataContractStandard) -> List[str] | None:
505
+ if odcs.tags is None:
315
506
  return None
316
- return odcs_contract.get("tags")
507
+ return odcs.tags
508
+
509
+
510
+ def to_azure_storage_account(location: str) -> str | None:
511
+ """
512
+ Converts a storage location string to extract the storage account name.
513
+ ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI.
514
+
515
+ This function parses a storage location string to identify and return the
516
+ storage account name. It handles two primary patterns:
517
+ 1. Protocol://containerName@storageAccountName
518
+ 2. Protocol://storageAccountName
519
+
520
+ :param location: The storage location string to parse, typically following
521
+ the format protocol://containerName@storageAccountName. or
522
+ protocol://storageAccountName.
523
+ :return: The extracted storage account name if found, otherwise None
524
+ """
525
+ # to catch protocol://containerName@storageAccountName. pattern from location
526
+ match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE)
527
+ if match:
528
+ return match.group()
529
+ else:
530
+ # to catch protocol://storageAccountName. pattern from location
531
+ match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE)
532
+ return match.group() if match else None