datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registries. It is provided for informational purposes only.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/imports/odcs_v3_importer.py (added)
@@ -0,0 +1,516 @@

import datetime
import logging
import re
from typing import Any, Dict, List

from datacontract_specification.model import Quality
from open_data_contract_standard.model import CustomProperty, OpenDataContractStandard, SchemaProperty

from datacontract.imports.importer import Importer
from datacontract.lint.resources import read_resource
from datacontract.model.data_contract_specification import (
    DATACONTRACT_TYPES,
    Availability,
    DataContractSpecification,
    Field,
    Info,
    Model,
    Retention,
    Server,
    ServerRole,
    ServiceLevel,
    Terms,
)
from datacontract.model.exceptions import DataContractException

logger = logging.getLogger(__name__)


class OdcsImporter(Importer):
    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        return import_odcs_v3_as_dcs(data_contract_specification, source)


def import_odcs_v3_as_dcs(
    data_contract_specification: DataContractSpecification, source: str
) -> DataContractSpecification:
    source_str = read_resource(source)
    odcs = parse_odcs_v3_from_str(source_str)
    return import_from_odcs(data_contract_specification, odcs)


def parse_odcs_v3_from_str(source_str):
    try:
        odcs = OpenDataContractStandard.from_string(source_str)
    except Exception as e:
        raise DataContractException(
            type="schema",
            name="Parse ODCS contract",
            reason=f"Failed to parse odcs contract from {source_str}",
            engine="datacontract",
            original_exception=e,
        )
    return odcs


def import_from_odcs(data_contract_specification: DataContractSpecification, odcs: OpenDataContractStandard):
    data_contract_specification.id = odcs.id
    data_contract_specification.info = import_info(odcs)
    data_contract_specification.servers = import_servers(odcs)
    data_contract_specification.terms = import_terms(odcs)
    data_contract_specification.servicelevels = import_servicelevels(odcs)
    data_contract_specification.models = import_models(odcs)
    data_contract_specification.tags = import_tags(odcs)
    return data_contract_specification


def import_info(odcs: Any) -> Info:
    info = Info()

    info.title = odcs.name if odcs.name is not None else ""

    if odcs.version is not None:
        info.version = odcs.version

    # odcs.description.purpose => datacontract.description
    if odcs.description is not None and odcs.description.purpose is not None:
        info.description = odcs.description.purpose

    # customProperty "owner" => datacontract.info.owner
    owner = get_owner(odcs.customProperties)
    if owner is not None:
        info.owner = owner

    # odcs.dataProduct => datacontract.info.dataProduct
    if odcs.dataProduct is not None:
        info.dataProduct = odcs.dataProduct

    # odcs.tenant => datacontract.info.tenant
    if odcs.tenant is not None:
        info.tenant = odcs.tenant

    return info


def import_server_roles(roles: List[Dict]) -> List[ServerRole] | None:
    if roles is None:
        return None
    result = []
    for role in roles:
        server_role = ServerRole()
        server_role.name = role.role
        server_role.description = role.description
        result.append(server_role)
    return result


def import_servers(odcs: OpenDataContractStandard) -> Dict[str, Server] | None:
    if odcs.servers is None:
        return None
    servers = {}
    for odcs_server in odcs.servers:
        server_name = odcs_server.server
        if server_name is None:
            logger.warning("Server name is missing, skipping server")
            continue

        server = Server()
        server.type = odcs_server.type
        server.description = odcs_server.description
        server.environment = odcs_server.environment
        server.format = odcs_server.format
        server.project = odcs_server.project
        server.dataset = odcs_server.dataset
        server.path = odcs_server.path
        server.delimiter = odcs_server.delimiter
        server.endpointUrl = odcs_server.endpointUrl
        server.location = odcs_server.location
        server.account = odcs_server.account
        server.database = odcs_server.database
        server.schema_ = odcs_server.schema_
        server.host = odcs_server.host
        server.port = odcs_server.port
        server.catalog = odcs_server.catalog
        server.stagingDir = odcs_server.stagingDir
        server.topic = getattr(odcs_server, "topic", None)
        server.http_path = getattr(odcs_server, "http_path", None)
        server.token = getattr(odcs_server, "token", None)
        server.driver = getattr(odcs_server, "driver", None)
        server.roles = import_server_roles(odcs_server.roles)
        server.storageAccount = (
            to_azure_storage_account(odcs_server.location)
            if server.type == "azure" and server.location is not None and "://" in server.location
            else None
        )

        servers[server_name] = server
    return servers


def import_terms(odcs: Any) -> Terms | None:
    if odcs.description is None:
        return None
    if odcs.description.usage is not None or odcs.description.limitations is not None or odcs.price is not None:
        terms = Terms()
        if odcs.description.usage is not None:
            terms.usage = odcs.description.usage
        if odcs.description.limitations is not None:
            terms.limitations = odcs.description.limitations
        if odcs.price is not None:
            terms.billing = f"{odcs.price.priceAmount} {odcs.price.priceCurrency} / {odcs.price.priceUnit}"

        return terms
    else:
        return None


def import_servicelevels(odcs: Any) -> ServiceLevel | None:
    # find the two SLA properties we can map (based on the ODCS examples)
    sla_properties = odcs.slaProperties if odcs.slaProperties is not None else []
    availability = next((p for p in sla_properties if p.property == "generalAvailability"), None)
    retention = next((p for p in sla_properties if p.property == "retention"), None)

    if availability is not None or retention is not None:
        servicelevel = ServiceLevel()

        if availability is not None:
            value = availability.value
            if isinstance(value, datetime.datetime):
                value = value.isoformat()
            servicelevel.availability = Availability(description=value)

        if retention is not None:
            servicelevel.retention = Retention(period=f"{retention.value}{retention.unit}")

        return servicelevel
    else:
        return None


def get_server_type(odcs: OpenDataContractStandard) -> str | None:
    servers = import_servers(odcs)
    if servers is None or len(servers) == 0:
        return None
    # get the first server from the map
    server = next(iter(servers.values()))
    return server.type


def import_models(odcs: Any) -> Dict[str, Model]:
    custom_type_mappings = get_custom_type_mappings(odcs.customProperties)

    odcs_schemas = odcs.schema_ if odcs.schema_ is not None else []
    result = {}

    for odcs_schema in odcs_schemas:
        schema_name = odcs_schema.name
        schema_physical_name = odcs_schema.physicalName
        schema_description = odcs_schema.description if odcs_schema.description is not None else ""
        model_name = schema_physical_name if schema_physical_name is not None else schema_name
        model = Model(
            description=" ".join(schema_description.splitlines()) if schema_description else "",
            type="table",
            tags=odcs_schema.tags if odcs_schema.tags is not None else None,
        )
        model.fields = import_fields(odcs_schema.properties, custom_type_mappings, server_type=get_server_type(odcs))
        if odcs_schema.quality is not None:
            model.quality = convert_quality_list(odcs_schema.quality)
        model.title = schema_name
        if odcs_schema.dataGranularityDescription is not None:
            model.config = {"dataGranularityDescription": odcs_schema.dataGranularityDescription}
        result[model_name] = model

    return result


def convert_quality_list(odcs_quality_list):
    """Convert a list of ODCS DataQuality objects to datacontract Quality objects."""
    quality_list = []

    if odcs_quality_list is not None:
        for odcs_quality in odcs_quality_list:
            quality = Quality(type=odcs_quality.type)

            if odcs_quality.description is not None:
                quality.description = odcs_quality.description
            if odcs_quality.query is not None:
                quality.query = odcs_quality.query
            if odcs_quality.rule is not None:
                quality.metric = odcs_quality.rule
            if odcs_quality.mustBe is not None:
                quality.mustBe = odcs_quality.mustBe
            if odcs_quality.mustNotBe is not None:
                quality.mustNotBe = odcs_quality.mustNotBe
            if odcs_quality.mustBeGreaterThan is not None:
                quality.mustBeGreaterThan = odcs_quality.mustBeGreaterThan
            if odcs_quality.mustBeGreaterOrEqualTo is not None:
                quality.mustBeGreaterOrEqualTo = odcs_quality.mustBeGreaterOrEqualTo
            if odcs_quality.mustBeLessThan is not None:
                quality.mustBeLessThan = odcs_quality.mustBeLessThan
            if odcs_quality.mustBeLessOrEqualTo is not None:
                quality.mustBeLessOrEqualTo = odcs_quality.mustBeLessOrEqualTo
            if odcs_quality.mustBeBetween is not None:
                quality.mustBeBetween = odcs_quality.mustBeBetween
            if odcs_quality.mustNotBeBetween is not None:
                quality.mustNotBeBetween = odcs_quality.mustNotBeBetween
            if odcs_quality.engine is not None:
                quality.engine = odcs_quality.engine
            if odcs_quality.implementation is not None:
                quality.implementation = odcs_quality.implementation
            if odcs_quality.businessImpact is not None:
                quality.model_extra["businessImpact"] = odcs_quality.businessImpact
            if odcs_quality.dimension is not None:
                quality.model_extra["dimension"] = odcs_quality.dimension
            if odcs_quality.schedule is not None:
                quality.model_extra["schedule"] = odcs_quality.schedule
            if odcs_quality.scheduler is not None:
                quality.model_extra["scheduler"] = odcs_quality.scheduler
            if odcs_quality.severity is not None:
                quality.model_extra["severity"] = odcs_quality.severity
            if odcs_quality.method is not None:
                quality.model_extra["method"] = odcs_quality.method
            if odcs_quality.customProperties is not None:
                quality.model_extra["customProperties"] = []
                for item in odcs_quality.customProperties:
                    quality.model_extra["customProperties"].append(
                        {
                            "property": item.property,
                            "value": item.value,
                        }
                    )

            quality_list.append(quality)

    return quality_list


def import_field_config(odcs_property: SchemaProperty, server_type=None) -> dict[Any, Any] | None:
    config = {}
    if odcs_property.criticalDataElement is not None:
        config["criticalDataElement"] = odcs_property.criticalDataElement
    if odcs_property.encryptedName is not None:
        config["encryptedName"] = odcs_property.encryptedName
    if odcs_property.partitionKeyPosition is not None:
        config["partitionKeyPosition"] = odcs_property.partitionKeyPosition
    if odcs_property.partitioned is not None:
        config["partitioned"] = odcs_property.partitioned

    if odcs_property.customProperties is not None:
        for item in odcs_property.customProperties:
            config[item.property] = item.value

    # store the physical type under a server-specific config key
    physical_type = odcs_property.physicalType
    if physical_type is not None:
        if server_type == "postgres" or server_type == "postgresql":
            config["postgresType"] = physical_type
        elif server_type == "bigquery":
            config["bigqueryType"] = physical_type
        elif server_type == "snowflake":
            config["snowflakeType"] = physical_type
        elif server_type == "redshift":
            config["redshiftType"] = physical_type
        elif server_type == "sqlserver":
            config["sqlserverType"] = physical_type
        elif server_type == "databricks":
            config["databricksType"] = physical_type
        else:
            config["physicalType"] = physical_type

    if len(config) == 0:
        return None

    return config


def has_composite_primary_key(odcs_properties: List[SchemaProperty]) -> bool:
    primary_keys = [prop for prop in odcs_properties if prop.primaryKey is not None and prop.primaryKey]
    return len(primary_keys) > 1


def import_fields(
    odcs_properties: List[SchemaProperty], custom_type_mappings: Dict[str, str], server_type
) -> Dict[str, Field]:
    result = {}

    if odcs_properties is None:
        return result

    for odcs_property in odcs_properties:
        field = import_field(odcs_property, odcs_properties, custom_type_mappings, server_type)
        if field is not None:
            result[odcs_property.name] = field

    return result


def import_field(
    odcs_property: SchemaProperty,
    odcs_properties: List[SchemaProperty],
    custom_type_mappings: Dict[str, str],
    server_type: str,
) -> Field | None:
    """
    Import a single ODCS property as a datacontract Field.

    Returns None if the property cannot be mapped.
    """
    mapped_type = map_type(odcs_property.logicalType, custom_type_mappings, odcs_property.physicalType)

    if mapped_type is None:
        type_info = f"logicalType={odcs_property.logicalType}, physicalType={odcs_property.physicalType}"
        logger.warning(
            f"Can't map field '{odcs_property.name}' ({type_info}) to the datacontract mapping types. "
            f"Both logicalType and physicalType are missing or unmappable. "
            f"Consider introducing a customProperty 'dc_mapping_<type>' that defines your expected type as the 'value'"
        )
        return None

    description = odcs_property.description if odcs_property.description is not None else None
    field = Field(
        description=" ".join(description.splitlines()) if description is not None else None,
        type=mapped_type,
        title=odcs_property.businessName,
        required=odcs_property.required if odcs_property.required is not None else None,
        primaryKey=to_primary_key(odcs_property, odcs_properties),
        unique=odcs_property.unique if odcs_property.unique else None,
        examples=odcs_property.examples if odcs_property.examples is not None else None,
        classification=odcs_property.classification if odcs_property.classification is not None else None,
        tags=odcs_property.tags if odcs_property.tags is not None else None,
        quality=convert_quality_list(odcs_property.quality),
        fields=import_fields(odcs_property.properties, custom_type_mappings, server_type)
        if odcs_property.properties is not None
        else {},
        config=import_field_config(odcs_property, server_type),
        format=getattr(odcs_property, "format", None),
    )

    # arrays carry their element type in "items"
    if field.type == "array" and odcs_property.items is not None:
        field.items = import_field(odcs_property.items, [], custom_type_mappings, server_type)

    # derive enum values from a quality check's validValues
    if field.type == "string":
        for q in field.quality:
            if hasattr(q, "validValues"):
                field.enum = q.validValues

    return field


def to_primary_key(odcs_property: SchemaProperty, odcs_properties: list[SchemaProperty]) -> bool | None:
    if odcs_property.primaryKey is None:
        return None
    if has_composite_primary_key(odcs_properties):
        return None
    return odcs_property.primaryKey


def map_type(odcs_logical_type: str, custom_mappings: Dict[str, str], physical_type: str = None) -> str | None:
    # Try to map the logicalType first
    if odcs_logical_type is not None:
        t = odcs_logical_type.lower()
        if t in DATACONTRACT_TYPES:
            return t
        elif custom_mappings.get(t) is not None:
            return custom_mappings.get(t)

    # Fall back to the physicalType if the logicalType is not mapped
    if physical_type is not None:
        pt = physical_type.lower()
        # Remove parameters from the physical type (e.g., VARCHAR(50) -> varchar, DECIMAL(10,2) -> decimal)
        pt_base = pt.split("(")[0].strip()

        # Try direct mapping of the physical type
        if pt in DATACONTRACT_TYPES:
            return pt
        elif pt_base in DATACONTRACT_TYPES:
            return pt_base
        elif custom_mappings.get(pt) is not None:
            return custom_mappings.get(pt)
        elif custom_mappings.get(pt_base) is not None:
            return custom_mappings.get(pt_base)
        # Common physical type mappings
        elif pt_base in ["varchar", "char", "nvarchar", "nchar", "text", "ntext", "string", "character varying"]:
            return "string"
        elif pt_base in ["int", "integer", "smallint", "tinyint", "mediumint", "int2", "int4", "int8"]:
            return "int"
        elif pt_base in ["bigint", "long", "int64"]:
            return "long"
        elif pt_base in ["float", "real", "float4", "float8"]:
            return "float"
        elif pt_base in ["double", "double precision"]:
            return "double"
        elif pt_base in ["decimal", "numeric", "number"]:
            return "decimal"
        elif pt_base in ["boolean", "bool", "bit"]:
            return "boolean"
        elif pt_base in ["timestamp", "datetime", "datetime2", "timestamptz", "timestamp with time zone"]:
            return "timestamp"
        elif pt_base in ["date"]:
            return "date"
        elif pt_base in ["time"]:
            return "time"
        elif pt_base in ["json", "jsonb"]:
            return "json"
        elif pt_base in ["array"]:
            return "array"
        elif pt_base in ["object", "struct", "record"]:
            return "object"
        elif pt_base in ["bytes", "binary", "varbinary", "blob", "bytea"]:
            return "bytes"
        else:
            return None
    return None


def get_custom_type_mappings(odcs_custom_properties: List[CustomProperty]) -> Dict[str, str]:
    result = {}
    if odcs_custom_properties is not None:
        for prop in odcs_custom_properties:
            if prop.property.startswith("dc_mapping_"):
                odcs_type_name = prop.property[11:]  # strip the "dc_mapping_" prefix
                datacontract_type = prop.value
                result[odcs_type_name] = datacontract_type

    return result


def get_owner(odcs_custom_properties: List[CustomProperty]) -> str | None:
    if odcs_custom_properties is not None:
        for prop in odcs_custom_properties:
            if prop.property == "owner":
                return prop.value

    return None


def import_tags(odcs: OpenDataContractStandard) -> List[str] | None:
    if odcs.tags is None:
        return None
    return odcs.tags


def to_azure_storage_account(location: str) -> str | None:
    """
    Extract the storage account name from a storage location string.

    ODCS v3.0 has no explicit field for the storage account. It uses the location field, which is a URI.

    Two patterns are handled:
    1. protocol://containerName@storageAccountName.
    2. protocol://storageAccountName.

    :param location: The storage location string to parse.
    :return: The extracted storage account name if found, otherwise None.
    """
    # catch the protocol://containerName@storageAccountName. pattern
    match = re.search(r"(?<=@)([^.]*)", location, re.IGNORECASE)
    if match:
        return match.group()
    else:
        # catch the protocol://storageAccountName. pattern
        match = re.search(r"(?<=//)(?!@)([^.]*)", location, re.IGNORECASE)
        return match.group() if match else None
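The dc_mapping_ convention handled by get_custom_type_mappings and map_type above is easiest to see in action. A minimal sketch (the "money" type and its "decimal" mapping are illustrative values, not taken from the release):

from open_data_contract_standard.model import CustomProperty

from datacontract.imports.odcs_v3_importer import get_custom_type_mappings, map_type

# A contract-level customProperty "dc_mapping_money" maps the otherwise
# unknown ODCS logical type "money" (hypothetical) to the data contract
# type "decimal".
mappings = get_custom_type_mappings([CustomProperty(property="dc_mapping_money", value="decimal")])
assert mappings == {"money": "decimal"}
assert map_type("money", mappings) == "decimal"

# Without a mappable logicalType, map_type falls back to the physicalType and
# strips parameters before applying the common mappings: TINYINT -> tinyint -> "int".
assert map_type(None, {}, "TINYINT") == "int"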
datacontract/imports/parquet_importer.py (added)
@@ -0,0 +1,81 @@

import os.path

import pyarrow
from pyarrow import parquet

from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Model,
)
from datacontract.model.exceptions import DataContractException


class ParquetImporter(Importer):
    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        return import_parquet(data_contract_specification, source)


def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
    # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
    schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")

    fields: dict[str, Field] = {}

    arrow_schema = parquet.read_schema(source)
    for field_name in arrow_schema.names:
        parquet_field = arrow_schema.field(field_name)

        field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")

        if not parquet_field.nullable:
            field.required = True

        fields[field_name] = field

    data_contract_specification.models[schema_name] = Model(fields=fields)

    return data_contract_specification


def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
    if pyarrow.types.is_boolean(pyarrow_field.type):
        return Field(type="boolean")
    if pyarrow.types.is_int32(pyarrow_field.type):
        return Field(type="int")
    if pyarrow.types.is_int64(pyarrow_field.type):
        return Field(type="long")
    if pyarrow.types.is_integer(pyarrow_field.type):
        return Field(type="number")
    if pyarrow.types.is_float32(pyarrow_field.type):
        return Field(type="float")
    if pyarrow.types.is_float64(pyarrow_field.type):
        return Field(type="double")
    if pyarrow.types.is_decimal(pyarrow_field.type):
        return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
    if pyarrow.types.is_timestamp(pyarrow_field.type):
        return Field(type="timestamp")
    if pyarrow.types.is_date(pyarrow_field.type):
        return Field(type="date")
    if pyarrow.types.is_null(pyarrow_field.type):
        return Field(type="null")
    if pyarrow.types.is_binary(pyarrow_field.type):
        return Field(type="bytes")
    if pyarrow.types.is_string(pyarrow_field.type):
        return Field(type="string")
    if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
        return Field(type="map")
    if pyarrow.types.is_struct(pyarrow_field.type):
        return Field(type="struct")
    if pyarrow.types.is_list(pyarrow_field.type):
        return Field(type="array")

    raise DataContractException(
        type="schema",
        name=f"Parse {file_format} schema",
        reason=f"{pyarrow_field.type} currently not supported.",
        engine="datacontract",
    )
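And a quick usage sketch for the new Parquet importer (the file name and data are illustrative; assumes pyarrow is installed and that an empty DataContractSpecification can be constructed directly):

import pyarrow as pa
from pyarrow import parquet

from datacontract.imports.parquet_importer import import_parquet
from datacontract.model.data_contract_specification import DataContractSpecification

# Write a small Parquet file, then derive a model from its schema.
table = pa.table({
    "order_id": pa.array([1, 2], type=pa.int64()),
    "amount": pa.array([9.99, 19.99], type=pa.float64()),
})
parquet.write_table(table, "orders.parquet")

spec = import_parquet(DataContractSpecification(), "orders.parquet")
fields = spec.models["orders"].fields  # model name is derived from the file name
assert fields["order_id"].type == "long"  # int64 -> long
assert fields["amount"].type == "double"  # float64 -> double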