datacontract-cli 0.10.13__py3-none-any.whl → 0.10.15__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release: this version of datacontract-cli has been flagged as potentially problematic.
- datacontract/breaking/breaking.py +227 -9
- datacontract/breaking/breaking_rules.py +24 -0
- datacontract/catalog/catalog.py +1 -1
- datacontract/cli.py +104 -32
- datacontract/data_contract.py +35 -5
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
- datacontract/engines/soda/check_soda_execute.py +5 -3
- datacontract/engines/soda/connections/duckdb.py +1 -0
- datacontract/engines/soda/connections/kafka.py +38 -17
- datacontract/export/avro_converter.py +8 -1
- datacontract/export/avro_idl_converter.py +2 -2
- datacontract/export/bigquery_converter.py +4 -3
- datacontract/export/data_caterer_converter.py +1 -1
- datacontract/export/dbml_converter.py +2 -4
- datacontract/export/dbt_converter.py +2 -3
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/exporter.py +5 -2
- datacontract/export/exporter_factory.py +16 -3
- datacontract/export/go_converter.py +3 -2
- datacontract/export/great_expectations_converter.py +202 -40
- datacontract/export/html_export.py +1 -1
- datacontract/export/jsonschema_converter.py +3 -2
- datacontract/export/{odcs_converter.py → odcs_v2_exporter.py} +5 -5
- datacontract/export/odcs_v3_exporter.py +294 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +1 -1
- datacontract/export/rdf_converter.py +4 -5
- datacontract/export/sodacl_converter.py +86 -2
- datacontract/export/spark_converter.py +10 -7
- datacontract/export/sql_converter.py +1 -2
- datacontract/export/sql_type_converter.py +55 -11
- datacontract/export/sqlalchemy_converter.py +1 -2
- datacontract/export/terraform_converter.py +1 -1
- datacontract/imports/avro_importer.py +1 -1
- datacontract/imports/bigquery_importer.py +1 -1
- datacontract/imports/dbml_importer.py +2 -2
- datacontract/imports/dbt_importer.py +3 -2
- datacontract/imports/glue_importer.py +5 -3
- datacontract/imports/iceberg_importer.py +161 -0
- datacontract/imports/importer.py +2 -0
- datacontract/imports/importer_factory.py +12 -1
- datacontract/imports/jsonschema_importer.py +3 -2
- datacontract/imports/odcs_importer.py +25 -168
- datacontract/imports/odcs_v2_importer.py +177 -0
- datacontract/imports/odcs_v3_importer.py +309 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/spark_importer.py +2 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/imports/unity_importer.py +3 -3
- datacontract/integration/datamesh_manager.py +1 -1
- datacontract/integration/opentelemetry.py +0 -1
- datacontract/lint/lint.py +2 -1
- datacontract/lint/linters/description_linter.py +1 -0
- datacontract/lint/linters/example_model_linter.py +1 -0
- datacontract/lint/linters/field_pattern_linter.py +1 -0
- datacontract/lint/linters/field_reference_linter.py +1 -0
- datacontract/lint/linters/notice_period_linter.py +1 -0
- datacontract/lint/linters/quality_schema_linter.py +1 -0
- datacontract/lint/linters/valid_constraints_linter.py +1 -0
- datacontract/lint/resolve.py +14 -9
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +1 -1
- datacontract/lint/urls.py +4 -2
- datacontract/model/data_contract_specification.py +83 -13
- datacontract/model/odcs.py +11 -0
- datacontract/model/run.py +21 -12
- datacontract/templates/index.html +6 -6
- datacontract/web.py +2 -3
- {datacontract_cli-0.10.13.dist-info → datacontract_cli-0.10.15.dist-info}/METADATA +176 -93
- datacontract_cli-0.10.15.dist-info/RECORD +105 -0
- {datacontract_cli-0.10.13.dist-info → datacontract_cli-0.10.15.dist-info}/WHEEL +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract_cli-0.10.13.dist-info/RECORD +0 -97
- {datacontract_cli-0.10.13.dist-info → datacontract_cli-0.10.15.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.13.dist-info → datacontract_cli-0.10.15.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.13.dist-info → datacontract_cli-0.10.15.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datacontract/export/odcs_v3_exporter.py
@@ -0,0 +1,294 @@
+from typing import Dict
+
+import yaml
+
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+
+
+class OdcsV3Exporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_odcs_v3_yaml(data_contract)
+
+
+def to_odcs_v3_yaml(data_contract_spec: DataContractSpecification) -> str:
+    odcs = {
+        "apiVersion": "v3.0.0",
+        "kind": "DataContract",
+        "id": data_contract_spec.id,
+        "name": data_contract_spec.info.title,
+        "version": data_contract_spec.info.version,
+        "domain": data_contract_spec.info.owner,
+        "status": data_contract_spec.info.status,
+    }
+
+    if data_contract_spec.terms is not None:
+        odcs["description"] = {
+            "purpose": data_contract_spec.terms.description.strip()
+            if data_contract_spec.terms.description is not None
+            else None,
+            "usage": data_contract_spec.terms.usage.strip() if data_contract_spec.terms.usage is not None else None,
+            "limitations": data_contract_spec.terms.limitations.strip()
+            if data_contract_spec.terms.limitations is not None
+            else None,
+        }
+
+    odcs["schema"] = []
+    for model_key, model_value in data_contract_spec.models.items():
+        odcs_schema = to_odcs_schema(model_key, model_value)
+        odcs["schema"].append(odcs_schema)
+
+    if data_contract_spec.servicelevels is not None:
+        slas = []
+        if data_contract_spec.servicelevels.availability is not None:
+            slas.append(
+                {
+                    "property": "generalAvailability",
+                    "value": data_contract_spec.servicelevels.availability.description,
+                }
+            )
+        if data_contract_spec.servicelevels.retention is not None:
+            slas.append({"property": "retention", "value": data_contract_spec.servicelevels.retention.period})
+
+        if len(slas) > 0:
+            odcs["slaProperties"] = slas
+
+    if data_contract_spec.info.contact is not None:
+        support = []
+        if data_contract_spec.info.contact.email is not None:
+            support.append(
+                {
+                    "channel": "email",
+                    "url": "mailto:" + data_contract_spec.info.contact.email,
+                }
+            )
+        if data_contract_spec.info.contact.url is not None:
+            support.append(
+                {
+                    "channel": "other",
+                    "url": data_contract_spec.info.contact.url,
+                }
+            )
+        if len(support) > 0:
+            odcs["support"] = support
+
+    if data_contract_spec.servers is not None and len(data_contract_spec.servers) > 0:
+        servers = []
+
+        for server_key, server_value in data_contract_spec.servers.items():
+            server_dict = {}
+            server_dict["server"] = server_key
+            if server_value.type is not None:
+                server_dict["type"] = server_value.type
+            if server_value.environment is not None:
+                server_dict["environment"] = server_value.environment
+            if server_value.account is not None:
+                server_dict["account"] = server_value.account
+            if server_value.database is not None:
+                server_dict["database"] = server_value.database
+            if server_value.schema_ is not None:
+                server_dict["schema"] = server_value.schema_
+            if server_value.format is not None:
+                server_dict["format"] = server_value.format
+            if server_value.project is not None:
+                server_dict["project"] = server_value.project
+            if server_value.dataset is not None:
+                server_dict["dataset"] = server_value.dataset
+            if server_value.path is not None:
+                server_dict["path"] = server_value.path
+            if server_value.delimiter is not None:
+                server_dict["delimiter"] = server_value.delimiter
+            if server_value.endpointUrl is not None:
+                server_dict["endpointUrl"] = server_value.endpointUrl
+            if server_value.location is not None:
+                server_dict["location"] = server_value.location
+            if server_value.host is not None:
+                server_dict["host"] = server_value.host
+            if server_value.port is not None:
+                server_dict["port"] = server_value.port
+            if server_value.catalog is not None:
+                server_dict["catalog"] = server_value.catalog
+            if server_value.topic is not None:
+                server_dict["topic"] = server_value.topic
+            if server_value.http_path is not None:
+                server_dict["http_path"] = server_value.http_path
+            if server_value.token is not None:
+                server_dict["token"] = server_value.token
+            if server_value.driver is not None:
+                server_dict["driver"] = server_value.driver
+            if server_value.roles is not None:
+                server_dict["roles"] = [
+                    {"name": role.name, "description": role.description} for role in server_value.roles
+                ]
+            servers.append(server_dict)
+
+        if len(servers) > 0:
+            odcs["servers"] = servers
+
+    odcs["customProperties"] = []
+    if data_contract_spec.info.model_extra is not None:
+        for key, value in data_contract_spec.info.model_extra.items():
+            odcs["customProperties"].append({"property": key, "value": value})
+    if len(odcs["customProperties"]) == 0:
+        del odcs["customProperties"]
+
+    return yaml.dump(odcs, indent=2, sort_keys=False, allow_unicode=True)
+
+
+def to_odcs_schema(model_key, model_value: Model) -> dict:
+    odcs_table = {
+        "name": model_key,
+        "physicalName": model_key,
+        "logicalType": "object",
+        "physicalType": model_value.type,
+    }
+    if model_value.description is not None:
+        odcs_table["description"] = model_value.description
+    properties = to_properties(model_value.fields)
+    if properties:
+        odcs_table["properties"] = properties
+
+    odcs_table["customProperties"] = []
+    if model_value.model_extra is not None:
+        for key, value in model_value.model_extra.items():
+            odcs_table["customProperties"].append({"property": key, "value": value})
+    if len(odcs_table["customProperties"]) == 0:
+        del odcs_table["customProperties"]
+
+    return odcs_table
+
+
+def to_properties(fields: Dict[str, Field]) -> list:
+    properties = []
+    for field_name, field in fields.items():
+        property = to_property(field_name, field)
+        properties.append(property)
+    return properties
+
+
+def to_logical_type(type: str) -> str | None:
+    if type is None:
+        return None
+    if type.lower() in ["string", "varchar", "text"]:
+        return "string"
+    if type.lower() in ["timestamp", "timestamp_tz"]:
+        return "date"
+    if type.lower() in ["timestamp_ntz"]:
+        return "date"
+    if type.lower() in ["date"]:
+        return "date"
+    if type.lower() in ["time"]:
+        return "string"
+    if type.lower() in ["number", "decimal", "numeric"]:
+        return "number"
+    if type.lower() in ["float", "double"]:
+        return "number"
+    if type.lower() in ["integer", "int", "long", "bigint"]:
+        return "integer"
+    if type.lower() in ["boolean"]:
+        return "boolean"
+    if type.lower() in ["object", "record", "struct"]:
+        return "object"
+    if type.lower() in ["bytes"]:
+        return "array"
+    if type.lower() in ["array"]:
+        return "array"
+    if type.lower() in ["null"]:
+        return None
+    return None
+
+
+def to_physical_type(type: str) -> str | None:
+    # TODO: to we need to do a server mapping here?
+    return type
+
+
+def to_property(field_name: str, field: Field) -> dict:
+    property = {"name": field_name}
+    if field.title is not None:
+        property["businessName"] = field.title
+    if field.type is not None:
+        property["logicalType"] = to_logical_type(field.type)
+        property["physicalType"] = to_physical_type(field.type)
+    if field.description is not None:
+        property["description"] = field.description
+    if field.required is not None:
+        property["isNullable"] = not field.required
+    if field.unique is not None:
+        property["isUnique"] = field.unique
+    if field.classification is not None:
+        property["classification"] = field.classification
+    if field.examples is not None:
+        property["examples"] = field.examples
+    if field.example is not None:
+        property["examples"] = [field.example]
+
+    property["customProperties"] = []
+    if field.model_extra is not None:
+        for key, value in field.model_extra.items():
+            property["customProperties"].append({"property": key, "value": value})
+    if field.pii is not None:
+        property["customProperties"].append({"property": "pii", "value": field.pii})
+    if property.get("customProperties") is not None and len(property["customProperties"]) == 0:
+        del property["customProperties"]
+
+    property["tags"] = []
+    if field.tags is not None:
+        property["tags"].extend(field.tags)
+    if not property["tags"]:
+        del property["tags"]
+
+    property["logicalTypeOptions"] = {}
+    if field.minLength is not None:
+        property["logicalTypeOptions"]["minLength"] = field.minLength
+    if field.maxLength is not None:
+        property["logicalTypeOptions"]["maxLength"] = field.maxLength
+    if field.pattern is not None:
+        property["logicalTypeOptions"]["pattern"] = field.pattern
+    if field.minimum is not None:
+        property["logicalTypeOptions"]["minimum"] = field.minimum
+    if field.maximum is not None:
+        property["logicalTypeOptions"]["maximum"] = field.maximum
+    if field.exclusiveMinimum is not None:
+        property["logicalTypeOptions"]["exclusiveMinimum"] = field.exclusiveMinimum
+    if field.exclusiveMaximum is not None:
+        property["logicalTypeOptions"]["exclusiveMaximum"] = field.exclusiveMaximum
+    if property["logicalTypeOptions"] == {}:
+        del property["logicalTypeOptions"]
+
+    if field.quality is not None:
+        quality_property = []
+        for quality in field.quality:
+            quality_dict = {"type": quality.type}
+            if quality.description is not None:
+                quality_dict["description"] = quality.description
+            if quality.query is not None:
+                quality_dict["query"] = quality.query
+            # dialect is not supported in v3.0.0
+            if quality.mustBe is not None:
+                quality_dict["mustBe"] = quality.mustBe
+            if quality.mustNotBe is not None:
+                quality_dict["mustNotBe"] = quality.mustNotBe
+            if quality.mustBeGreaterThan is not None:
+                quality_dict["mustBeGreaterThan"] = quality.mustBeGreaterThan
+            if quality.mustBeGreaterThanOrEqualTo is not None:
+                quality_dict["mustBeGreaterThanOrEqualTo"] = quality.mustBeGreaterThanOrEqualTo
+            if quality.mustBeLessThan is not None:
+                quality_dict["mustBeLessThan"] = quality.mustBeLessThan
+            if quality.mustBeLessThanOrEqualTo is not None:
+                quality_dict["mustBeLessThanOrEqualTo"] = quality.mustBeLessThanOrEqualTo
+            if quality.mustBeBetween is not None:
+                quality_dict["mustBeBetween"] = quality.mustBeBetween
+            if quality.mustNotBeBetween is not None:
+                quality_dict["mustNotBeBetween"] = quality.mustNotBeBetween
+            if quality.engine is not None:
+                quality_dict["engine"] = quality.engine
+            if quality.implementation is not None:
+                quality_dict["implementation"] = quality.implementation
+            quality_property.append(quality_dict)
+        if len(quality_property) > 0:
+            property["quality"] = quality_property
+
+    # todo enum
+
+    return property
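For orientation, a minimal usage sketch (not part of the release) that feeds a hand-built contract through the new exporter. The constructor fields of DataContractSpecification, Info, Model, and Field are assumed from the model attributes the exporter reads above:

# Hypothetical usage sketch; constructor fields assumed from the pydantic models above.
from datacontract.export.odcs_v3_exporter import to_odcs_v3_yaml
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

spec = DataContractSpecification(
    id="orders-v1",
    info=Info(title="Orders", version="1.0.0", owner="checkout", status="active"),
    models={
        "orders": Model(
            type="table",
            description="One row per order.",
            fields={"order_id": Field(type="string", required=True, unique=True)},
        )
    },
)
print(to_odcs_v3_yaml(spec))  # apiVersion: v3.0.0 / kind: DataContract / schema: ...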
--- /dev/null
+++ b/datacontract/export/pandas_type_converter.py
@@ -0,0 +1,40 @@
+"""
+Module for converting data contract field types to corresponding pandas data types.
+"""
+
+from datacontract.model.data_contract_specification import Field
+
+
+def convert_to_pandas_type(field: Field) -> str:
+    """
+    Convert a data contract field type to the equivalent pandas data type.
+
+    Parameters:
+    ----------
+    field : Field
+        A Field object containing metadata about the data type of the field.
+
+    Returns:
+    -------
+    str
+        The corresponding pandas data type as a string.
+    """
+    field_type = field.type
+
+    if field_type in ["string", "varchar", "text"]:
+        return "str"
+    if field_type in ["integer", "int"]:
+        return "int32"
+    if field_type == "long":
+        return "int64"
+    if field_type == "float":
+        return "float32"
+    if field_type in ["number", "decimal", "numeric", "double"]:
+        return "float64"
+    if field_type == "boolean":
+        return "bool"
+    if field_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
+        return "datetime64[ns]"
+    if field_type == "bytes":
+        return "object"
+    return "object"
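A few illustrative calls against the mapping above; the expected values are read directly from the branches shown:

# Quick sanity checks of the new pandas mapping (illustrative only).
from datacontract.export.pandas_type_converter import convert_to_pandas_type
from datacontract.model.data_contract_specification import Field

assert convert_to_pandas_type(Field(type="long")) == "int64"
assert convert_to_pandas_type(Field(type="number")) == "float64"
assert convert_to_pandas_type(Field(type="timestamp_ntz")) == "datetime64[ns]"
assert convert_to_pandas_type(Field(type="uuid")) == "object"  # unmapped types fall back to object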
--- a/datacontract/export/rdf_converter.py
+++ b/datacontract/export/rdf_converter.py
@@ -1,9 +1,8 @@
 from pydantic import BaseModel
-from rdflib import
-
-from datacontract.model.data_contract_specification import DataContractSpecification
+from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef

 from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification


 class RdfExporter(Exporter):
@@ -58,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))

-    dc = Namespace("https://datacontract.com/DataContractSpecification/
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/Extension/")

     g.bind("dc", dc)
     g.bind("dcx", dcx)
--- a/datacontract/export/sodacl_converter.py
+++ b/datacontract/export/sodacl_converter.py
@@ -1,8 +1,11 @@
+from typing import List
+from venv import logger
+
 import yaml

-from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.export.exporter import Exporter
+from datacontract.export.sql_type_converter import convert_to_sql_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Quality


 class SodaExporter(Exporter):
@@ -58,9 +61,18 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
             checks.append(check_field_regex(field_name, field.pattern, quote_field_name))
         if field.enum is not None and len(field.enum) > 0:
             checks.append(check_field_enum(field_name, field.enum, quote_field_name))
+        if field.quality is not None and len(field.quality) > 0:
+            quality_list = check_quality_list(model_key, field_name, field.quality)
+            if (quality_list is not None) and len(quality_list) > 0:
+                checks.append(quality_list)
         # TODO references: str = None
         # TODO format

+    if model_value.quality is not None and len(model_value.quality) > 0:
+        quality_list = check_quality_list(model_key, None, model_value.quality)
+        if (quality_list is not None) and len(quality_list) > 0:
+            checks.append(quality_list)
+
     checks_for_model_key = f"checks for {model_key}"

     if quote_field_name:
@@ -181,6 +193,78 @@ def check_field_regex(field_name, pattern, quote_field_name: bool = False):
     }


+def check_quality_list(model_name, field_name, quality_list: List[Quality]):
+    checks = {}
+
+    count = 0
+    for quality in quality_list:
+        if quality.type == "sql":
+            if field_name is None:
+                metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
+            else:
+                metric_name = f"{model_name}_quality_sql_{count}"
+            threshold = to_sodacl_threshold(quality)
+            query = prepare_query(quality, model_name, field_name)
+            if query is None:
+                logger.warning(f"Quality check {metric_name} has no query")
+                continue
+            if threshold is None:
+                logger.warning(f"Quality check {metric_name} has no valid threshold")
+                continue
+            checks[f"{metric_name} {threshold}"] = {f"{metric_name} query": query}
+            count += 1
+
+    return checks
+
+
+def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> str | None:
+    if quality.query is None:
+        return None
+    if quality.query == "":
+        return None
+
+    query = quality.query
+
+    query = query.replace("{model}", model_name)
+    query = query.replace("{table}", model_name)
+
+    if field_name is not None:
+        query = query.replace("{field}", field_name)
+        query = query.replace("{column}", field_name)
+
+    return query
+
+
+def to_sodacl_threshold(quality: Quality) -> str | None:
+    if quality.mustBe is not None:
+        return f"= {quality.mustBe}"
+    if quality.mustNotBe is not None:
+        return f"!= {quality.mustNotBe}"
+    if quality.mustBeGreaterThan is not None:
+        return f"> {quality.mustBeGreaterThan}"
+    if quality.mustBeGreaterThanOrEqualTo is not None:
+        return f">= {quality.mustBeGreaterThanOrEqualTo}"
+    if quality.mustBeLessThan is not None:
+        return f"< {quality.mustBeLessThan}"
+    if quality.mustBeLessThanOrEqualTo is not None:
+        return f"<= {quality.mustBeLessThanOrEqualTo}"
+    if quality.mustBeBetween is not None:
+        if len(quality.mustBeBetween) != 2:
+            logger.warning(
+                f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {quality.mustBeBetween}"
+            )
+            return None
+        return f"between {quality.mustBeBetween[0]} and {quality.mustBeBetween[1]}"
+    if quality.mustNotBeBetween is not None:
+        if len(quality.mustNotBeBetween) != 2:
+            logger.warning(
+                f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {quality.mustNotBeBetween}"
+            )
+            return None
+        return f"not between {quality.mustNotBeBetween[0]} and {quality.mustNotBeBetween[1]}"
+    return None
+
+
 def add_quality_checks(sodacl, data_contract_spec):
     if data_contract_spec.quality is None:
         return
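An illustrative walk-through of how one SQL quality check is translated into a SodaCL user-defined metric; the Quality values are invented for the example:

# Illustrative walk-through of one SQL quality check (values invented).
from datacontract.export.sodacl_converter import prepare_query, to_sodacl_threshold
from datacontract.model.data_contract_specification import Quality

quality = Quality(
    type="sql",
    query="SELECT COUNT(*) FROM {model} WHERE {field} IS NULL",
    mustBeLessThan=10,
)
to_sodacl_threshold(quality)                  # "< 10"
prepare_query(quality, "orders", "order_id")  # "SELECT COUNT(*) FROM orders WHERE order_id IS NULL"
# check_quality_list() then emits a check of the form
# {"<metric_name> < 10": {"<metric_name> query": "<rendered query>"}}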
--- a/datacontract/export/spark_converter.py
+++ b/datacontract/export/spark_converter.py
@@ -1,10 +1,11 @@
 from pyspark.sql import types
+
+from datacontract.export.exporter import Exporter
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
-    Model,
     Field,
+    Model,
 )
-from datacontract.export.exporter import Exporter


 class SparkExporter(Exporter):
@@ -102,11 +103,11 @@ def to_struct_field(field: Field, field_name: str) -> types.StructField:
     Returns:
         types.StructField: The corresponding Spark StructField.
     """
-    data_type =
+    data_type = to_spark_data_type(field)
     return types.StructField(name=field_name, dataType=data_type, nullable=not field.required)


-def
+def to_spark_data_type(field: Field) -> types.DataType:
     """
     Convert a field to a Spark DataType.

@@ -120,15 +121,17 @@ def to_data_type(field: Field) -> types.DataType:
     if field_type is None or field_type in ["null"]:
         return types.NullType()
     if field_type == "array":
-        return types.ArrayType(
+        return types.ArrayType(to_spark_data_type(field.items))
     if field_type in ["object", "record", "struct"]:
         return types.StructType(to_struct_type(field.fields))
     if field_type == "map":
-        return types.MapType(
+        return types.MapType(to_spark_data_type(field.keys), to_spark_data_type(field.values))
     if field_type in ["string", "varchar", "text"]:
         return types.StringType()
     if field_type in ["number", "decimal", "numeric"]:
-
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return types.DecimalType(precision=precision, scale=scale)
     if field_type in ["integer", "int"]:
         return types.IntegerType()
     if field_type == "long":
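Illustrative calls against the renamed converter (requires pyspark; the precision, scale, and items attributes of Field are assumed from the diff):

# Sketch: to_spark_data_type now resolves decimals with precision/scale defaults.
from datacontract.export.spark_converter import to_spark_data_type
from datacontract.model.data_contract_specification import Field

to_spark_data_type(Field(type="decimal", precision=10, scale=2))     # DecimalType(10,2)
to_spark_data_type(Field(type="decimal"))                            # DecimalType(38,0) via the new defaults
to_spark_data_type(Field(type="array", items=Field(type="string")))  # ArrayType(StringType())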
--- a/datacontract/export/sql_converter.py
+++ b/datacontract/export/sql_converter.py
@@ -1,8 +1,7 @@
+from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
 from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import DataContractSpecification, Model

-from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
-

 class SqlExporter(Exporter):
     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
--- a/datacontract/export/sql_type_converter.py
+++ b/datacontract/export/sql_type_converter.py
@@ -7,6 +7,8 @@ def convert_to_sql_type(field: Field, server_type: str) -> str:
         return convert_to_snowflake(field)
     elif server_type == "postgres":
         return convert_type_to_postgres(field)
+    elif server_type == "dataframe":
+        return convert_to_dataframe(field)
     elif server_type == "databricks":
         return convert_to_databricks(field)
     elif server_type == "local" or server_type == "s3":
@@ -108,6 +110,46 @@ def convert_type_to_postgres(field: Field) -> None | str:
     return None


+# dataframe data types:
+# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
+def convert_to_dataframe(field: Field) -> None | str:
+    if field.config and "dataframeType" in field.config:
+        return field.config["dataframeType"]
+    type = field.type
+    if type is None:
+        return None
+    if type.lower() in ["string", "varchar", "text"]:
+        return "STRING"
+    if type.lower() in ["timestamp", "timestamp_tz"]:
+        return "TIMESTAMP"
+    if type.lower() in ["timestamp_ntz"]:
+        return "TIMESTAMP_NTZ"
+    if type.lower() in ["date"]:
+        return "DATE"
+    if type.lower() in ["time"]:
+        return "STRING"
+    if type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "DECIMAL"
+    if type.lower() in ["float"]:
+        return "FLOAT"
+    if type.lower() in ["double"]:
+        return "DOUBLE"
+    if type.lower() in ["integer", "int"]:
+        return "INT"
+    if type.lower() in ["long", "bigint"]:
+        return "BIGINT"
+    if type.lower() in ["boolean"]:
+        return "BOOLEAN"
+    if type.lower() in ["object", "record", "struct"]:
+        return "STRUCT"
+    if type.lower() in ["bytes"]:
+        return "BINARY"
+    if type.lower() in ["array"]:
+        return "ARRAY"
+    return None
+
+
 # databricks data types:
 # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
 def convert_to_databricks(field: Field) -> None | str:
@@ -186,7 +228,7 @@ def convert_to_duckdb(field: Field) -> None | str:
         "time": "TIME",
         "timestamp": "TIMESTAMP WITH TIME ZONE",
         "timestamp_tz": "TIMESTAMP WITH TIME ZONE",
-        "timestamp_ntz": "
+        "timestamp_ntz": "TIMESTAMP",
     }

     # Convert simple mappings
@@ -281,25 +323,27 @@ def get_type_config(field: Field, config_attr: str) -> dict[str, str] | None:

 def convert_type_to_trino(field: Field) -> None | str:
     """Convert from supported datacontract types to equivalent trino types"""
-
+    if field.config and "trinoType" in field.config:
+        return field.config["trinoType"]

-
+    field_type = field.type.lower()
+    if field_type in ["string", "text", "varchar"]:
         return "varchar"
     # tinyint, smallint not supported by data contract
-    if field_type
+    if field_type in ["number", "decimal", "numeric"]:
         # precision and scale not supported by data contract
         return "decimal"
-    if field_type
+    if field_type in ["int", "integer"]:
         return "integer"
-    if field_type
+    if field_type in ["long", "bigint"]:
         return "bigint"
-    if field_type
+    if field_type in ["float"]:
         return "real"
-    if field_type
+    if field_type in ["timestamp", "timestamp_tz"]:
         return "timestamp(3) with time zone"
-    if field_type
+    if field_type in ["timestamp_ntz"]:
         return "timestamp(3)"
-    if field_type
+    if field_type in ["bytes"]:
         return "varbinary"
-    if field_type
+    if field_type in ["object", "record", "struct"]:
         return "json"
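Illustrative calls for the new "dataframe" server type mapping above; values are invented, and the config override follows the first branch of convert_to_dataframe:

# Sketch of the new "dataframe" server type dispatch (values invented).
from datacontract.export.sql_type_converter import convert_to_sql_type
from datacontract.model.data_contract_specification import Field

convert_to_sql_type(Field(type="timestamp_ntz"), "dataframe")  # "TIMESTAMP_NTZ"
convert_to_sql_type(Field(type="long"), "dataframe")           # "BIGINT"
convert_to_sql_type(
    Field(type="string", config={"dataframeType": "VARCHAR(10)"}), "dataframe"
)  # config override wins: "VARCHAR(10)"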
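Similarly, illustrative calls for the repaired trino mapping, including the new trinoType config override; values are invented:

# Sketch of the repaired trino mapping (values invented).
from datacontract.export.sql_type_converter import convert_type_to_trino
from datacontract.model.data_contract_specification import Field

convert_type_to_trino(Field(type="timestamp"))  # "timestamp(3) with time zone"
convert_type_to_trino(Field(type="bytes"))      # "varbinary"
convert_type_to_trino(Field(type="varchar", config={"trinoType": "char(3)"}))  # "char(3)"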
--- a/datacontract/export/sqlalchemy_converter.py
+++ b/datacontract/export/sqlalchemy_converter.py
@@ -2,8 +2,7 @@ import ast
 import typing

 import datacontract.model.data_contract_specification as spec
-from datacontract.export.exporter import Exporter
-from datacontract.export.exporter import _determine_sql_server_type
+from datacontract.export.exporter import Exporter, _determine_sql_server_type


 class SQLAlchemyExporter(Exporter):
--- a/datacontract/export/terraform_converter.py
+++ b/datacontract/export/terraform_converter.py
@@ -1,7 +1,7 @@
 import re

-from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Server


 class TerraformExporter(Exporter):
--- a/datacontract/imports/avro_importer.py
+++ b/datacontract/imports/avro_importer.py
@@ -3,7 +3,7 @@ from typing import Dict, List
 import avro.schema

 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
