datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,516 @@
1
+ import datetime
2
+ import logging
3
+ import re
4
+ from typing import Any, Dict, List
5
+ from venv import logger
6
+
7
+ from datacontract_specification.model import Quality
8
+ from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard, SchemaProperty
9
+
10
+ from datacontract.imports.importer import Importer
11
+ from datacontract.lint.resources import read_resource
12
+ from datacontract.model.data_contract_specification import (
13
+ DATACONTRACT_TYPES,
14
+ Availability,
15
+ DataContractSpecification,
16
+ Field,
17
+ Info,
18
+ Model,
19
+ Retention,
20
+ Server,
21
+ ServerRole,
22
+ ServiceLevel,
23
+ Terms,
24
+ )
25
+ from datacontract.model.exceptions import DataContractException
26
+
27
+
28
class OdcsImporter(Importer):
    """Importer that converts an ODCS v3 contract document into a DataContractSpecification."""

    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        # import_args is part of the Importer interface but is not used by this importer.
        return import_odcs_v3_as_dcs(data_contract_specification, source)
33
+
34
+
35
def import_odcs_v3_as_dcs(
    data_contract_specification: DataContractSpecification, source: str
) -> DataContractSpecification:
    """Read an ODCS v3 contract from *source* (file path or URL) and merge it into the given specification."""
    source_str = read_resource(source)
    odcs = parse_odcs_v3_from_str(source_str)
    return import_from_odcs(data_contract_specification, odcs)
41
+
42
+
43
def parse_odcs_v3_from_str(source_str: str) -> OpenDataContractStandard:
    """Parse an ODCS v3 contract from its string (YAML/JSON) representation.

    Raises:
        DataContractException: when the string cannot be parsed as an ODCS contract.
    """
    try:
        odcs = OpenDataContractStandard.from_string(source_str)
    except Exception as e:
        # Wrap any parser error in the project's exception type so callers
        # get a uniform error surface.
        raise DataContractException(
            type="schema",
            name="Parse ODCS contract",
            reason=f"Failed to parse odcs contract from {source_str}",
            engine="datacontract",
            original_exception=e,
        )
    return odcs
55
+
56
+
57
def import_from_odcs(data_contract_specification: DataContractSpecification, odcs: OpenDataContractStandard):
    """Populate the given specification from an already-parsed ODCS document.

    Mutates *data_contract_specification* in place and returns it.
    """
    data_contract_specification.id = odcs.id
    data_contract_specification.info = import_info(odcs)
    data_contract_specification.servers = import_servers(odcs)
    data_contract_specification.terms = import_terms(odcs)
    data_contract_specification.servicelevels = import_servicelevels(odcs)
    data_contract_specification.models = import_models(odcs)
    data_contract_specification.tags = import_tags(odcs)
    return data_contract_specification
66
+
67
+
68
def import_info(odcs: Any) -> Info:
    """Build the datacontract Info section from top-level ODCS attributes."""
    info = Info()

    # Title falls back to an empty string when the ODCS name is missing.
    info.title = odcs.name if odcs.name is not None else ""

    if odcs.version is not None:
        info.version = odcs.version

    # odcs.description.purpose => datacontract.description
    if odcs.description is not None and odcs.description.purpose is not None:
        info.description = odcs.description.purpose

    # odcs.domain => datacontract.owner
    owner = get_owner(odcs.customProperties)
    if owner is not None:
        info.owner = owner

    # add dataProduct as custom property
    if odcs.dataProduct is not None:
        info.dataProduct = odcs.dataProduct

    # add tenant as custom property
    if odcs.tenant is not None:
        info.tenant = odcs.tenant

    return info
94
+
95
+
96
+ def import_server_roles(roles: List[Dict]) -> List[ServerRole] | None:
97
+ if roles is None:
98
+ return None
99
+ result = []
100
+ for role in roles:
101
+ server_role = ServerRole()
102
+ server_role.name = role.role
103
+ server_role.description = role.description
104
+ result.append(server_role)
105
+
106
+
107
+ def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
108
+ if odcs.servers is None:
109
+ return None
110
+ servers = {}
111
+ for odcs_server in odcs.servers:
112
+ server_name = odcs_server.server
113
+ if server_name is None:
114
+ logger.warning("Server name is missing, skipping server")
115
+ continue
116
+
117
+ server = Server()
118
+ server.type = odcs_server.type
119
+ server.description = odcs_server.description
120
+ server.environment = odcs_server.environment
121
+ server.format = odcs_server.format
122
+ server.project = odcs_server.project
123
+ server.dataset = odcs_server.dataset
124
+ server.path = odcs_server.path
125
+ server.delimiter = odcs_server.delimiter
126
+ server.endpointUrl = odcs_server.endpointUrl
127
+ server.location = odcs_server.location
128
+ server.account = odcs_server.account
129
+ server.database = odcs_server.database
130
+ server.schema_ = odcs_server.schema_
131
+ server.host = odcs_server.host
132
+ server.port = odcs_server.port
133
+ server.catalog = odcs_server.catalog
134
+ server.stagingDir = odcs_server.stagingDir
135
+ server.topic = getattr(odcs_server, "topic", None)
136
+ server.http_path = getattr(odcs_server, "http_path", None)
137
+ server.token = getattr(odcs_server, "token", None)
138
+ server.driver = getattr(odcs_server, "driver", None)
139
+ server.roles = import_server_roles(odcs_server.roles)
140
+ server.storageAccount = (
141
+ to_azure_storage_account(odcs_server.location)
142
+ if server.type == "azure" and "://" in server.location
143
+ else None
144
+ )
145
+
146
+ servers[server_name] = server
147
+ return servers
148
+
149
+
150
+ def import_terms(odcs: Any) -> Terms | None:
151
+ if odcs.description is None:
152
+ return None
153
+ if odcs.description.usage is not None or odcs.description.limitations is not None or odcs.price is not None:
154
+ terms = Terms()
155
+ if odcs.description.usage is not None:
156
+ terms.usage = odcs.description.usage
157
+ if odcs.description.limitations is not None:
158
+ terms.limitations = odcs.description.limitations
159
+ if odcs.price is not None:
160
+ terms.billing = f"{odcs.price.priceAmount} {odcs.price.priceCurrency} / {odcs.price.priceUnit}"
161
+
162
+ return terms
163
+ else:
164
+ return None
165
+
166
+
167
+ def import_servicelevels(odcs: Any) -> ServiceLevel:
168
+ # find the two properties we can map (based on the examples)
169
+ sla_properties = odcs.slaProperties if odcs.slaProperties is not None else []
170
+ availability = next((p for p in sla_properties if p.property == "generalAvailability"), None)
171
+ retention = next((p for p in sla_properties if p.property == "retention"), None)
172
+
173
+ if availability is not None or retention is not None:
174
+ servicelevel = ServiceLevel()
175
+
176
+ if availability is not None:
177
+ value = availability.value
178
+ if isinstance(value, datetime.datetime):
179
+ value = value.isoformat()
180
+ servicelevel.availability = Availability(description=value)
181
+
182
+ if retention is not None:
183
+ servicelevel.retention = Retention(period=f"{retention.value}{retention.unit}")
184
+
185
+ return servicelevel
186
+ else:
187
+ return None
188
+
189
+
190
def get_server_type(odcs: OpenDataContractStandard) -> str | None:
    """Return the type of the first declared server, or None when no servers exist."""
    servers = import_servers(odcs)
    if not servers:
        # Covers both "no servers section" (None) and "empty servers map".
        return None
    first_server = next(iter(servers.values()))
    return first_server.type
197
+
198
+
199
def import_models(odcs: Any) -> Dict[str, Model]:
    """Convert ODCS schema objects into datacontract models.

    Models are keyed by the schema's physical name when present, otherwise by
    its logical name. Returns an empty dict when the contract has no schemas.
    """
    custom_type_mappings = get_custom_type_mappings(odcs.customProperties)
    # Hoisted out of the loop: get_server_type() rebuilds the whole server map,
    # and its result does not change between schemas.
    server_type = get_server_type(odcs)

    odcs_schemas = odcs.schema_ if odcs.schema_ is not None else []
    result = {}

    for odcs_schema in odcs_schemas:
        schema_name = odcs_schema.name
        schema_physical_name = odcs_schema.physicalName
        schema_description = odcs_schema.description if odcs_schema.description is not None else ""
        # Prefer the physical name as the model key, as it matches the table name.
        model_name = schema_physical_name if schema_physical_name is not None else schema_name
        model = Model(
            # Collapse multi-line descriptions into a single line.
            description=" ".join(schema_description.splitlines()) if schema_description else "",
            type="table",
            tags=odcs_schema.tags if odcs_schema.tags is not None else None,
        )
        model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=server_type)
        if odcs_schema.quality is not None:
            model.quality = convert_quality_list(odcs_schema.quality)
        model.title = schema_name
        if odcs_schema.dataGranularityDescription is not None:
            model.config = {"dataGranularityDescription": odcs_schema.dataGranularityDescription}
        result[model_name] = model

    return result
224
+
225
+
226
def convert_quality_list(odcs_quality_list):
    """Convert a list of ODCS DataQuality objects to datacontract Quality objects"""
    quality_list = []

    if odcs_quality_list is not None:
        for odcs_quality in odcs_quality_list:
            quality = Quality(type=odcs_quality.type)

            # Copy the directly-supported attributes onto the Quality model.
            if odcs_quality.description is not None:
                quality.description = odcs_quality.description
            if odcs_quality.query is not None:
                quality.query = odcs_quality.query
            # ODCS "rule" maps to the datacontract "metric" concept.
            if odcs_quality.rule is not None:
                quality.metric = odcs_quality.rule
            if odcs_quality.mustBe is not None:
                quality.mustBe = odcs_quality.mustBe
            if odcs_quality.mustNotBe is not None:
                quality.mustNotBe = odcs_quality.mustNotBe
            if odcs_quality.mustBeGreaterThan is not None:
                quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan
            if odcs_quality.mustBeGreaterOrEqualTo is not None:
                quality.mustBeGreaterOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
            if odcs_quality.mustBeLessThan is not None:
                quality.mustBeLessThan = odcs_quality.mustBeLessThan
            if odcs_quality.mustBeLessOrEqualTo is not None:
                quality.mustBeLessOrEqualTo = odcs_quality.mustBeLessOrEqualTo
            if odcs_quality.mustBeBetween is not None:
                quality.mustBeBetween = odcs_quality.mustBeBetween
            if odcs_quality.mustNotBeBetween is not None:
                quality.mustNotBeBetween = odcs_quality.mustNotBeBetween
            if odcs_quality.engine is not None:
                quality.engine = odcs_quality.engine
            if odcs_quality.implementation is not None:
                quality.implementation = odcs_quality.implementation
            # Attributes without a first-class datacontract counterpart are
            # stored in the pydantic model_extra dict so they are preserved
            # when the contract is serialized again.
            if odcs_quality.businessImpact is not None:
                quality.model_extra["businessImpact"] = odcs_quality.businessImpact
            if odcs_quality.dimension is not None:
                quality.model_extra["dimension"] = odcs_quality.dimension
            if odcs_quality.schedule is not None:
                quality.model_extra["schedule"] = odcs_quality.schedule
            if odcs_quality.scheduler is not None:
                quality.model_extra["scheduler"] = odcs_quality.scheduler
            if odcs_quality.severity is not None:
                quality.model_extra["severity"] = odcs_quality.severity
            if odcs_quality.method is not None:
                quality.model_extra["method"] = odcs_quality.method
            if odcs_quality.customProperties is not None:
                # Custom properties are flattened to plain dicts.
                quality.model_extra["customProperties"] = []
                for item in odcs_quality.customProperties:
                    quality.model_extra["customProperties"].append(
                        {
                            "property": item.property,
                            "value": item.value,
                        }
                    )

            quality_list.append(quality)

    return quality_list
285
+
286
+
287
+ def import_field_config(odcs_property: SchemaProperty, server_type=None) -> dict[Any, Any] | None:
288
+ config = {}
289
+ if odcs_property.criticalDataElement is not None:
290
+ config["criticalDataElement"] = odcs_property.criticalDataElement
291
+ if odcs_property.encryptedName is not None:
292
+ config["encryptedName"] = odcs_property.encryptedName
293
+ if odcs_property.partitionKeyPosition is not None:
294
+ config["partitionKeyPosition"] = odcs_property.partitionKeyPosition
295
+ if odcs_property.partitioned is not None:
296
+ config["partitioned"] = odcs_property.partitioned
297
+
298
+ if odcs_property.customProperties is not None:
299
+ for item in odcs_property.customProperties:
300
+ config[item.property] = item.value
301
+
302
+ physical_type = odcs_property.physicalType
303
+ if physical_type is not None:
304
+ if server_type == "postgres" or server_type == "postgresql":
305
+ config["postgresType"] = physical_type
306
+ elif server_type == "bigquery":
307
+ config["bigqueryType"] = physical_type
308
+ elif server_type == "snowflake":
309
+ config["snowflakeType"] = physical_type
310
+ elif server_type == "redshift":
311
+ config["redshiftType"] = physical_type
312
+ elif server_type == "sqlserver":
313
+ config["sqlserverType"] = physical_type
314
+ elif server_type == "databricks":
315
+ config["databricksType"] = physical_type
316
+ else:
317
+ config["physicalType"] = physical_type
318
+
319
+ if len(config) == 0:
320
+ return None
321
+
322
+ return config
323
+
324
+
325
+ def has_composite_primary_key(odcs_properties: List[SchemaProperty]) -> bool:
326
+ primary_keys = [prop for prop in odcs_properties if prop.primaryKey is not None and prop.primaryKey]
327
+ return len(primary_keys) > 1
328
+
329
+
330
+ def import_fields(
331
+ odcs_properties: List[SchemaProperty], custom_type_mappings: Dict[str, str], server_type
332
+ ) -> Dict[str, Field]:
333
+ result = {}
334
+
335
+ if odcs_properties is None:
336
+ return result
337
+
338
+ for odcs_property in odcs_properties:
339
+ field = import_field(odcs_property, odcs_properties, custom_type_mappings, server_type)
340
+ if field is not None:
341
+ result[odcs_property.name] = field
342
+
343
+ return result
344
+
345
+
346
def import_field(
    odcs_property: SchemaProperty,
    odcs_properties: List[SchemaProperty],
    custom_type_mappings: Dict[str, str],
    server_type: str,
) -> Field | None:
    """
    Import a single ODCS property as a datacontract Field.
    Returns None if the property cannot be mapped.
    """
    logger = logging.getLogger(__name__)

    mapped_type = map_type(odcs_property.logicalType, custom_type_mappings, odcs_property.physicalType)

    # Without a mappable type the property is dropped entirely (the caller
    # skips None results), so warn the user how to supply a custom mapping.
    if mapped_type is None:
        type_info = f"logicalType={odcs_property.logicalType}, physicalType={odcs_property.physicalType}"
        logger.warning(
            f"Can't map field '{odcs_property.name}' ({type_info}) to the datacontract mapping types. "
            f"Both logicalType and physicalType are missing or unmappable. "
            f"Consider introducing a customProperty 'dc_mapping_<type>' that defines your expected type as the 'value'"
        )
        return None

    description = odcs_property.description if odcs_property.description is not None else None
    field = Field(
        # Collapse multi-line descriptions into a single line.
        description=" ".join(description.splitlines()) if description is not None else None,
        type=mapped_type,
        title=odcs_property.businessName,
        required=odcs_property.required if odcs_property.required is not None else None,
        primaryKey=to_primary_key(odcs_property, odcs_properties),
        unique=odcs_property.unique if odcs_property.unique else None,
        examples=odcs_property.examples if odcs_property.examples is not None else None,
        classification=odcs_property.classification if odcs_property.classification is not None else None,
        tags=odcs_property.tags if odcs_property.tags is not None else None,
        quality=convert_quality_list(odcs_property.quality),
        # Nested object types carry their own properties (recursive import).
        fields=import_fields(odcs_property.properties, custom_type_mappings, server_type)
        if odcs_property.properties is not None
        else {},
        config=import_field_config(odcs_property, server_type),
        format=getattr(odcs_property, "format", None),
    )

    # mapped_type is array
    if field.type == "array" and odcs_property.items is not None:
        # The item is imported with an empty sibling list, so an array item
        # can never be considered part of a composite primary key.
        field.items = import_field(odcs_property.items, [], custom_type_mappings, server_type)

    # enum from quality validValues as enum
    if field.type == "string":
        for q in field.quality:
            # NOTE(review): hasattr is True even when validValues is None, and
            # the last matching quality wins — confirm this is intended.
            if hasattr(q, "validValues"):
                field.enum = q.validValues

    return field
399
+
400
+
401
+ def to_primary_key(odcs_property: SchemaProperty, odcs_properties: list[SchemaProperty]) -> bool | None:
402
+ if odcs_property.primaryKey is None:
403
+ return None
404
+ if has_composite_primary_key(odcs_properties):
405
+ return None
406
+ return odcs_property.primaryKey
407
+
408
+
409
+ def map_type(odcs_logical_type: str, custom_mappings: Dict[str, str], physical_type: str = None) -> str | None:
410
+ # Try to map logicalType first
411
+ if odcs_logical_type is not None:
412
+ t = odcs_logical_type.lower()
413
+ if t in DATACONTRACT_TYPES:
414
+ return t
415
+ elif custom_mappings.get(t) is not None:
416
+ return custom_mappings.get(t)
417
+
418
+ # Fallback to physicalType if logicalType is not mapped
419
+ if physical_type is not None:
420
+ pt = physical_type.lower()
421
+ # Remove parameters from physical type (e.g., VARCHAR(50) -> varchar, DECIMAL(10,2) -> decimal)
422
+ pt_base = pt.split("(")[0].strip()
423
+
424
+ # Try direct mapping of physical type
425
+ if pt in DATACONTRACT_TYPES:
426
+ return pt
427
+ elif pt_base in DATACONTRACT_TYPES:
428
+ return pt_base
429
+ elif custom_mappings.get(pt) is not None:
430
+ return custom_mappings.get(pt)
431
+ elif custom_mappings.get(pt_base) is not None:
432
+ return custom_mappings.get(pt_base)
433
+ # Common physical type mappings
434
+ elif pt_base in ["varchar", "char", "nvarchar", "nchar", "text", "ntext", "string", "character varying"]:
435
+ return "string"
436
+ elif pt_base in ["int", "integer", "smallint", "tinyint", "mediumint", "int2", "int4", "int8"]:
437
+ return "int"
438
+ elif pt_base in ["bigint", "long", "int64"]:
439
+ return "long"
440
+ elif pt_base in ["float", "real", "float4", "float8"]:
441
+ return "float"
442
+ elif pt_base in ["double", "double precision"]:
443
+ return "double"
444
+ elif pt_base in ["decimal", "numeric", "number"]:
445
+ return "decimal"
446
+ elif pt_base in ["boolean", "bool", "bit"]:
447
+ return "boolean"
448
+ elif pt_base in ["timestamp", "datetime", "datetime2", "timestamptz", "timestamp with time zone"]:
449
+ return "timestamp"
450
+ elif pt_base in ["date"]:
451
+ return "date"
452
+ elif pt_base in ["time"]:
453
+ return "time"
454
+ elif pt_base in ["json", "jsonb"]:
455
+ return "json"
456
+ elif pt_base in ["array"]:
457
+ return "array"
458
+ elif pt_base in ["object", "struct", "record"]:
459
+ return "object"
460
+ elif pt_base in ["bytes", "binary", "varbinary", "blob", "bytea"]:
461
+ return "bytes"
462
+ else:
463
+ return None
464
+ return None
465
+
466
+
467
+ def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]:
468
+ result = {}
469
+ if odcs_custom_properties is not None:
470
+ for prop in odcs_custom_properties:
471
+ if prop.property.startswith("dc_mapping_"):
472
+ odcs_type_name = prop.property[11:] # Changed substring to slice
473
+ datacontract_type = prop.value
474
+ result[odcs_type_name] = datacontract_type
475
+
476
+ return result
477
+
478
+
479
+ def get_owner(odcs_custom_properties: List[CustomProperty]) -> str | None:
480
+ if odcs_custom_properties is not None:
481
+ for prop in odcs_custom_properties:
482
+ if prop.property == "owner":
483
+ return prop.value
484
+
485
+ return None
486
+
487
+
488
+ def import_tags(odcs: OpenDataContractStandard) -> List[str] | None:
489
+ if odcs.tags is None:
490
+ return None
491
+ return odcs.tags
492
+
493
+
494
+ def to_azure_storage_account(location: str) -> str | None:
495
+ """
496
+ Converts a storage location string to extract the storage account name.
497
+ ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI.
498
+
499
+ This function parses a storage location string to identify and return the
500
+ storage account name. It handles two primary patterns:
501
+ 1. Protocol://containerName@storageAccountName
502
+ 2. Protocol://storageAccountName
503
+
504
+ :param location: The storage location string to parse, typically following
505
+ the format protocol://containerName@storageAccountName. or
506
+ protocol://storageAccountName.
507
+ :return: The extracted storage account name if found, otherwise None
508
+ """
509
+ # to catch protocol://containerName@storageAccountName. pattern from location
510
+ match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE)
511
+ if match:
512
+ return match.group()
513
+ else:
514
+ # to catch protocol://storageAccountName. pattern from location
515
+ match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE)
516
+ return match.group() if match else None
@@ -0,0 +1,81 @@
1
+ import os.path
2
+
3
+ import pyarrow
4
+ from pyarrow import parquet
5
+
6
+ from datacontract.imports.importer import Importer
7
+ from datacontract.model.data_contract_specification import (
8
+ DataContractSpecification,
9
+ Field,
10
+ Model,
11
+ )
12
+ from datacontract.model.exceptions import DataContractException
13
+
14
+
15
class ParquetImporter(Importer):
    """Importer that derives a data contract model from a parquet file's schema."""

    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        # import_args is part of the Importer interface but is not used by this importer.
        return import_parquet(data_contract_specification, source)
20
+
21
+
22
def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
    """Read the parquet file's schema and add it as a model to the specification.

    :param data_contract_specification: specification the model is added to (mutated and returned)
    :param source: path to the parquet file
    """
    # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
    schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")

    fields: dict[str, Field] = {}

    arrow_schema = parquet.read_schema(source)
    for field_name in arrow_schema.names:
        parquet_field = arrow_schema.field(field_name)

        field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")

        # Non-nullable parquet columns become required fields.
        if not parquet_field.nullable:
            field.required = True

        fields[field_name] = field

    data_contract_specification.models[schema_name] = Model(fields=fields)

    return data_contract_specification
42
+
43
+
44
def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
    """Map a single pyarrow schema field to a datacontract Field.

    The checks are ordered from specific to generic (e.g. int32/int64 before
    the is_integer fallback), so the order must not be changed casually.

    :raises DataContractException: for Arrow types without a datacontract counterpart.
    """
    if pyarrow.types.is_boolean(pyarrow_field.type):
        return Field(type="boolean")
    if pyarrow.types.is_int32(pyarrow_field.type):
        return Field(type="int")
    if pyarrow.types.is_int64(pyarrow_field.type):
        return Field(type="long")
    # Any other integer width (8/16-bit, unsigned) falls back to "number".
    if pyarrow.types.is_integer(pyarrow_field.type):
        return Field(type="number")
    if pyarrow.types.is_float32(pyarrow_field.type):
        return Field(type="float")
    if pyarrow.types.is_float64(pyarrow_field.type):
        return Field(type="double")
    if pyarrow.types.is_decimal(pyarrow_field.type):
        return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
    if pyarrow.types.is_timestamp(pyarrow_field.type):
        return Field(type="timestamp")
    if pyarrow.types.is_date(pyarrow_field.type):
        return Field(type="date")
    if pyarrow.types.is_null(pyarrow_field.type):
        return Field(type="null")
    if pyarrow.types.is_binary(pyarrow_field.type):
        return Field(type="bytes")
    if pyarrow.types.is_string(pyarrow_field.type):
        return Field(type="string")
    if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
        return Field(type="map")
    if pyarrow.types.is_struct(pyarrow_field.type):
        return Field(type="struct")
    if pyarrow.types.is_list(pyarrow_field.type):
        return Field(type="array")

    raise DataContractException(
        type="schema",
        name=f"Parse {file_format} schema",
        reason=f"{pyarrow_field.type} currently not supported.",
        engine="datacontract",
    )