datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +12 -5
- datacontract/catalog/catalog.py +5 -3
- datacontract/cli.py +119 -13
- datacontract/data_contract.py +145 -67
- datacontract/engines/data_contract_checks.py +366 -60
- datacontract/engines/data_contract_test.py +50 -4
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +27 -3
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +65 -6
- datacontract/engines/soda/connections/kafka.py +4 -2
- datacontract/engines/soda/connections/oracle.py +50 -0
- datacontract/export/avro_converter.py +20 -3
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/dbt_converter.py +36 -7
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +3 -0
- datacontract/export/exporter_factory.py +17 -1
- datacontract/export/great_expectations_converter.py +55 -5
- datacontract/export/{html_export.py → html_exporter.py} +31 -20
- datacontract/export/markdown_converter.py +134 -5
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +193 -149
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/sodacl_converter.py +9 -1
- datacontract/export/spark_converter.py +31 -4
- datacontract/export/sql_converter.py +6 -2
- datacontract/export/sql_type_converter.py +124 -8
- datacontract/imports/avro_importer.py +63 -12
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +1112 -0
- datacontract/imports/importer.py +16 -3
- datacontract/imports/importer_factory.py +17 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/odcs_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +367 -151
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +117 -13
- datacontract/imports/sql_importer.py +32 -16
- datacontract/imports/unity_importer.py +84 -38
- datacontract/init/init_template.py +1 -1
- datacontract/integration/entropy_data.py +126 -0
- datacontract/lint/resolve.py +112 -23
- datacontract/lint/schema.py +24 -15
- datacontract/lint/urls.py +17 -3
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/odcs.py +13 -0
- datacontract/model/run.py +3 -0
- datacontract/output/junit_test_results.py +3 -3
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
- datacontract/templates/datacontract.html +54 -3
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +5 -2
- datacontract/templates/partials/server.html +2 -0
- datacontract/templates/style/output.css +319 -145
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
- datacontract_cli-0.10.40.dist-info/RECORD +121 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/integration/datamesh_manager.py +0 -72
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/description_linter.py +0 -35
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -48
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- datacontract/model/data_contract_specification.py +0 -327
- datacontract_cli-0.10.23.dist-info/RECORD +0 -113
- /datacontract/{lint/linters → output}/__init__.py +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import re
|
|
1
2
|
import uuid
|
|
3
|
+
from dataclasses import dataclass
|
|
2
4
|
from typing import List
|
|
3
5
|
from venv import logger
|
|
4
6
|
|
|
@@ -9,6 +11,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
|
|
|
9
11
|
from datacontract.model.run import Check
|
|
10
12
|
|
|
11
13
|
|
|
14
|
+
@dataclass
|
|
15
|
+
class QuotingConfig:
|
|
16
|
+
quote_field_name: bool = False
|
|
17
|
+
quote_model_name: bool = False
|
|
18
|
+
quote_model_name_with_backticks: bool = False
|
|
19
|
+
|
|
20
|
+
|
|
12
21
|
def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
|
|
13
22
|
checks: List[Check] = []
|
|
14
23
|
for model_key, model_value in data_contract_spec.models.items():
|
|
@@ -26,37 +35,44 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
|
|
|
26
35
|
fields = model_value.fields
|
|
27
36
|
|
|
28
37
|
check_types = is_check_types(server)
|
|
29
|
-
|
|
38
|
+
|
|
39
|
+
type1 = server.type if server and server.type else None
|
|
40
|
+
config = QuotingConfig(
|
|
41
|
+
quote_field_name=type1 in ["postgres", "sqlserver"],
|
|
42
|
+
quote_model_name=type1 in ["postgres", "sqlserver"],
|
|
43
|
+
quote_model_name_with_backticks=type1 == "bigquery",
|
|
44
|
+
)
|
|
45
|
+
quoting_config = config
|
|
30
46
|
|
|
31
47
|
for field_name, field in fields.items():
|
|
32
|
-
checks.append(check_field_is_present(model_name, field_name,
|
|
48
|
+
checks.append(check_field_is_present(model_name, field_name, quoting_config))
|
|
33
49
|
if check_types and field.type is not None:
|
|
34
|
-
sql_type = convert_to_sql_type(field, server_type)
|
|
35
|
-
checks.append(check_field_type(model_name, field_name, sql_type,
|
|
50
|
+
sql_type: str = convert_to_sql_type(field, server_type)
|
|
51
|
+
checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
|
|
36
52
|
if field.required:
|
|
37
|
-
checks.append(check_field_required(model_name, field_name,
|
|
53
|
+
checks.append(check_field_required(model_name, field_name, quoting_config))
|
|
38
54
|
if field.unique:
|
|
39
|
-
checks.append(check_field_unique(model_name, field_name,
|
|
55
|
+
checks.append(check_field_unique(model_name, field_name, quoting_config))
|
|
40
56
|
if field.minLength is not None:
|
|
41
|
-
checks.append(check_field_min_length(model_name, field_name, field.minLength,
|
|
57
|
+
checks.append(check_field_min_length(model_name, field_name, field.minLength, quoting_config))
|
|
42
58
|
if field.maxLength is not None:
|
|
43
|
-
checks.append(check_field_max_length(model_name, field_name, field.maxLength,
|
|
59
|
+
checks.append(check_field_max_length(model_name, field_name, field.maxLength, quoting_config))
|
|
44
60
|
if field.minimum is not None:
|
|
45
|
-
checks.append(check_field_minimum(model_name, field_name, field.minimum,
|
|
61
|
+
checks.append(check_field_minimum(model_name, field_name, field.minimum, quoting_config))
|
|
46
62
|
if field.maximum is not None:
|
|
47
|
-
checks.append(check_field_maximum(model_name, field_name, field.maximum,
|
|
63
|
+
checks.append(check_field_maximum(model_name, field_name, field.maximum, quoting_config))
|
|
48
64
|
if field.exclusiveMinimum is not None:
|
|
49
|
-
checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum,
|
|
50
|
-
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum,
|
|
65
|
+
checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum, quoting_config))
|
|
66
|
+
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum, quoting_config))
|
|
51
67
|
if field.exclusiveMaximum is not None:
|
|
52
|
-
checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum,
|
|
53
|
-
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum,
|
|
68
|
+
checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum, quoting_config))
|
|
69
|
+
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum, quoting_config))
|
|
54
70
|
if field.pattern is not None:
|
|
55
|
-
checks.append(check_field_regex(model_name, field_name, field.pattern,
|
|
71
|
+
checks.append(check_field_regex(model_name, field_name, field.pattern, quoting_config))
|
|
56
72
|
if field.enum is not None and len(field.enum) > 0:
|
|
57
|
-
checks.append(check_field_enum(model_name, field_name, field.enum,
|
|
73
|
+
checks.append(check_field_enum(model_name, field_name, field.enum, quoting_config))
|
|
58
74
|
if field.quality is not None and len(field.quality) > 0:
|
|
59
|
-
quality_list = check_quality_list(model_name, field_name, field.quality)
|
|
75
|
+
quality_list = check_quality_list(model_name, field_name, field.quality, quoting_config)
|
|
60
76
|
if (quality_list is not None) and len(quality_list) > 0:
|
|
61
77
|
checks.extend(quality_list)
|
|
62
78
|
# TODO references: str = None
|
|
@@ -70,9 +86,11 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
|
|
|
70
86
|
return checks
|
|
71
87
|
|
|
72
88
|
|
|
73
|
-
def checks_for(model_name,
|
|
74
|
-
if
|
|
89
|
+
def checks_for(model_name: str, quoting_config: QuotingConfig, check_type: str) -> str:
|
|
90
|
+
if quoting_config.quote_model_name:
|
|
75
91
|
return f'checks for "{model_name}"'
|
|
92
|
+
elif quoting_config.quote_model_name_with_backticks and check_type not in ["field_is_present", "field_type"]:
|
|
93
|
+
return f"checks for `{model_name}`"
|
|
76
94
|
return f"checks for {model_name}"
|
|
77
95
|
|
|
78
96
|
|
|
@@ -98,11 +116,11 @@ def to_model_name(model_key, model_value, server_type):
|
|
|
98
116
|
return model_key
|
|
99
117
|
|
|
100
118
|
|
|
101
|
-
def check_field_is_present(model_name, field_name,
|
|
119
|
+
def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check:
|
|
102
120
|
check_type = "field_is_present"
|
|
103
121
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
104
122
|
sodacl_check_dict = {
|
|
105
|
-
checks_for(model_name,
|
|
123
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
106
124
|
{
|
|
107
125
|
"schema": {
|
|
108
126
|
"name": check_key,
|
|
@@ -127,11 +145,13 @@ def check_field_is_present(model_name, field_name, quote_field_name: bool) -> Ch
|
|
|
127
145
|
)
|
|
128
146
|
|
|
129
147
|
|
|
130
|
-
def check_field_type(
|
|
148
|
+
def check_field_type(
|
|
149
|
+
model_name: str, field_name: str, expected_type: str, quoting_config: QuotingConfig = QuotingConfig()
|
|
150
|
+
):
|
|
131
151
|
check_type = "field_type"
|
|
132
152
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
133
153
|
sodacl_check_dict = {
|
|
134
|
-
checks_for(model_name,
|
|
154
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
135
155
|
{
|
|
136
156
|
"schema": {
|
|
137
157
|
"name": check_key,
|
|
@@ -158,8 +178,8 @@ def check_field_type(model_name: str, field_name: str, expected_type: str, quote
|
|
|
158
178
|
)
|
|
159
179
|
|
|
160
180
|
|
|
161
|
-
def check_field_required(model_name: str, field_name: str,
|
|
162
|
-
if quote_field_name:
|
|
181
|
+
def check_field_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
182
|
+
if quoting_config.quote_field_name:
|
|
163
183
|
field_name_for_soda = f'"{field_name}"'
|
|
164
184
|
else:
|
|
165
185
|
field_name_for_soda = field_name
|
|
@@ -167,7 +187,7 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
|
|
|
167
187
|
check_type = "field_required"
|
|
168
188
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
169
189
|
sodacl_check_dict = {
|
|
170
|
-
checks_for(model_name,
|
|
190
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
171
191
|
{
|
|
172
192
|
f"missing_count({field_name_for_soda}) = 0": {
|
|
173
193
|
"name": check_key,
|
|
@@ -189,8 +209,8 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
|
|
|
189
209
|
)
|
|
190
210
|
|
|
191
211
|
|
|
192
|
-
def check_field_unique(model_name: str, field_name: str,
|
|
193
|
-
if quote_field_name:
|
|
212
|
+
def check_field_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
213
|
+
if quoting_config.quote_field_name:
|
|
194
214
|
field_name_for_soda = f'"{field_name}"'
|
|
195
215
|
else:
|
|
196
216
|
field_name_for_soda = field_name
|
|
@@ -198,7 +218,7 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
|
|
|
198
218
|
check_type = "field_unique"
|
|
199
219
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
200
220
|
sodacl_check_dict = {
|
|
201
|
-
checks_for(model_name,
|
|
221
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
202
222
|
{
|
|
203
223
|
f"duplicate_count({field_name_for_soda}) = 0": {
|
|
204
224
|
"name": check_key,
|
|
@@ -220,8 +240,10 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
|
|
|
220
240
|
)
|
|
221
241
|
|
|
222
242
|
|
|
223
|
-
def check_field_min_length(
|
|
224
|
-
|
|
243
|
+
def check_field_min_length(
|
|
244
|
+
model_name: str, field_name: str, min_length: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
245
|
+
):
|
|
246
|
+
if quoting_config.quote_field_name:
|
|
225
247
|
field_name_for_soda = f'"{field_name}"'
|
|
226
248
|
else:
|
|
227
249
|
field_name_for_soda = field_name
|
|
@@ -229,7 +251,7 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
|
|
|
229
251
|
check_type = "field_min_length"
|
|
230
252
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
231
253
|
sodacl_check_dict = {
|
|
232
|
-
checks_for(model_name,
|
|
254
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
233
255
|
{
|
|
234
256
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
235
257
|
"name": check_key,
|
|
@@ -252,8 +274,10 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
|
|
|
252
274
|
)
|
|
253
275
|
|
|
254
276
|
|
|
255
|
-
def check_field_max_length(
|
|
256
|
-
|
|
277
|
+
def check_field_max_length(
|
|
278
|
+
model_name: str, field_name: str, max_length: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
279
|
+
):
|
|
280
|
+
if quoting_config.quote_field_name:
|
|
257
281
|
field_name_for_soda = f'"{field_name}"'
|
|
258
282
|
else:
|
|
259
283
|
field_name_for_soda = field_name
|
|
@@ -261,7 +285,7 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
|
|
|
261
285
|
check_type = "field_max_length"
|
|
262
286
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
263
287
|
sodacl_check_dict = {
|
|
264
|
-
checks_for(model_name,
|
|
288
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
265
289
|
{
|
|
266
290
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
267
291
|
"name": check_key,
|
|
@@ -284,8 +308,10 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
|
|
|
284
308
|
)
|
|
285
309
|
|
|
286
310
|
|
|
287
|
-
def check_field_minimum(
|
|
288
|
-
|
|
311
|
+
def check_field_minimum(
|
|
312
|
+
model_name: str, field_name: str, minimum: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
313
|
+
):
|
|
314
|
+
if quoting_config.quote_field_name:
|
|
289
315
|
field_name_for_soda = f'"{field_name}"'
|
|
290
316
|
else:
|
|
291
317
|
field_name_for_soda = field_name
|
|
@@ -293,7 +319,7 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
|
|
|
293
319
|
check_type = "field_minimum"
|
|
294
320
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
295
321
|
sodacl_check_dict = {
|
|
296
|
-
checks_for(model_name,
|
|
322
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
297
323
|
{
|
|
298
324
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
299
325
|
"name": check_key,
|
|
@@ -316,8 +342,10 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
|
|
|
316
342
|
)
|
|
317
343
|
|
|
318
344
|
|
|
319
|
-
def check_field_maximum(
|
|
320
|
-
|
|
345
|
+
def check_field_maximum(
|
|
346
|
+
model_name: str, field_name: str, maximum: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
347
|
+
):
|
|
348
|
+
if quoting_config.quote_field_name:
|
|
321
349
|
field_name_for_soda = f'"{field_name}"'
|
|
322
350
|
else:
|
|
323
351
|
field_name_for_soda = field_name
|
|
@@ -325,7 +353,7 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
|
|
|
325
353
|
check_type = "field_maximum"
|
|
326
354
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
327
355
|
sodacl_check_dict = {
|
|
328
|
-
checks_for(model_name,
|
|
356
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
329
357
|
{
|
|
330
358
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
331
359
|
"name": check_key,
|
|
@@ -348,8 +376,10 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
|
|
|
348
376
|
)
|
|
349
377
|
|
|
350
378
|
|
|
351
|
-
def check_field_not_equal(
|
|
352
|
-
|
|
379
|
+
def check_field_not_equal(
|
|
380
|
+
model_name: str, field_name: str, value: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
381
|
+
):
|
|
382
|
+
if quoting_config.quote_field_name:
|
|
353
383
|
field_name_for_soda = f'"{field_name}"'
|
|
354
384
|
else:
|
|
355
385
|
field_name_for_soda = field_name
|
|
@@ -357,7 +387,7 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
|
|
|
357
387
|
check_type = "field_not_equal"
|
|
358
388
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
359
389
|
sodacl_check_dict = {
|
|
360
|
-
checks_for(model_name,
|
|
390
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
361
391
|
{
|
|
362
392
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
363
393
|
"name": check_key,
|
|
@@ -380,8 +410,8 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
|
|
|
380
410
|
)
|
|
381
411
|
|
|
382
412
|
|
|
383
|
-
def check_field_enum(model_name: str, field_name: str, enum: list,
|
|
384
|
-
if quote_field_name:
|
|
413
|
+
def check_field_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()):
|
|
414
|
+
if quoting_config.quote_field_name:
|
|
385
415
|
field_name_for_soda = f'"{field_name}"'
|
|
386
416
|
else:
|
|
387
417
|
field_name_for_soda = field_name
|
|
@@ -389,7 +419,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
|
|
|
389
419
|
check_type = "field_enum"
|
|
390
420
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
391
421
|
sodacl_check_dict = {
|
|
392
|
-
checks_for(model_name,
|
|
422
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
393
423
|
{
|
|
394
424
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
395
425
|
"name": check_key,
|
|
@@ -412,8 +442,8 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
|
|
|
412
442
|
)
|
|
413
443
|
|
|
414
444
|
|
|
415
|
-
def check_field_regex(model_name: str, field_name: str, pattern: str,
|
|
416
|
-
if quote_field_name:
|
|
445
|
+
def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
446
|
+
if quoting_config.quote_field_name:
|
|
417
447
|
field_name_for_soda = f'"{field_name}"'
|
|
418
448
|
else:
|
|
419
449
|
field_name_for_soda = field_name
|
|
@@ -421,7 +451,7 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
|
|
|
421
451
|
check_type = "field_regex"
|
|
422
452
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
423
453
|
sodacl_check_dict = {
|
|
424
|
-
checks_for(model_name,
|
|
454
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
425
455
|
{
|
|
426
456
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
427
457
|
"name": check_key,
|
|
@@ -444,7 +474,215 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
|
|
|
444
474
|
)
|
|
445
475
|
|
|
446
476
|
|
|
447
|
-
def
|
|
477
|
+
def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
478
|
+
check_type = "row_count"
|
|
479
|
+
check_key = f"{model_name}__{check_type}"
|
|
480
|
+
sodacl_check_dict = {
|
|
481
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
482
|
+
{
|
|
483
|
+
f"row_count {threshold}": {"name": check_key},
|
|
484
|
+
}
|
|
485
|
+
],
|
|
486
|
+
}
|
|
487
|
+
return Check(
|
|
488
|
+
id=str(uuid.uuid4()),
|
|
489
|
+
key=check_key,
|
|
490
|
+
category="schema",
|
|
491
|
+
type=check_type,
|
|
492
|
+
name=f"Check that model {model_name} has row_count {threshold}",
|
|
493
|
+
model=model_name,
|
|
494
|
+
field=None,
|
|
495
|
+
engine="soda",
|
|
496
|
+
language="sodacl",
|
|
497
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def check_model_duplicate_values(
|
|
502
|
+
model_name: str, cols: list[str], threshold: str, quoting_config: QuotingConfig = QuotingConfig()
|
|
503
|
+
):
|
|
504
|
+
check_type = "model_duplicate_values"
|
|
505
|
+
check_key = f"{model_name}__{check_type}"
|
|
506
|
+
col_joined = ", ".join(cols)
|
|
507
|
+
sodacl_check_dict = {
|
|
508
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
509
|
+
{
|
|
510
|
+
f"duplicate_count({col_joined}) {threshold}": {"name": check_key},
|
|
511
|
+
}
|
|
512
|
+
],
|
|
513
|
+
}
|
|
514
|
+
return Check(
|
|
515
|
+
id=str(uuid.uuid4()),
|
|
516
|
+
key=check_key,
|
|
517
|
+
category="quality",
|
|
518
|
+
type=check_type,
|
|
519
|
+
name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}",
|
|
520
|
+
model=model_name,
|
|
521
|
+
field=None,
|
|
522
|
+
engine="soda",
|
|
523
|
+
language="sodacl",
|
|
524
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
525
|
+
)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def check_field_duplicate_values(
|
|
529
|
+
model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
|
|
530
|
+
):
|
|
531
|
+
if quoting_config.quote_field_name:
|
|
532
|
+
field_name_for_soda = f'"{field_name}"'
|
|
533
|
+
else:
|
|
534
|
+
field_name_for_soda = field_name
|
|
535
|
+
|
|
536
|
+
check_type = "field_duplicate_values"
|
|
537
|
+
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
538
|
+
sodacl_check_dict = {
|
|
539
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
540
|
+
{
|
|
541
|
+
f"duplicate_count({field_name_for_soda}) {threshold}": {
|
|
542
|
+
"name": check_key,
|
|
543
|
+
},
|
|
544
|
+
}
|
|
545
|
+
],
|
|
546
|
+
}
|
|
547
|
+
return Check(
|
|
548
|
+
id=str(uuid.uuid4()),
|
|
549
|
+
key=check_key,
|
|
550
|
+
category="quality",
|
|
551
|
+
type=check_type,
|
|
552
|
+
name=f"Check that field {field_name} has duplicate_count {threshold}",
|
|
553
|
+
model=model_name,
|
|
554
|
+
field=field_name,
|
|
555
|
+
engine="soda",
|
|
556
|
+
language="sodacl",
|
|
557
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def check_field_null_values(
|
|
562
|
+
model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
|
|
563
|
+
):
|
|
564
|
+
if quoting_config.quote_field_name:
|
|
565
|
+
field_name_for_soda = f'"{field_name}"'
|
|
566
|
+
else:
|
|
567
|
+
field_name_for_soda = field_name
|
|
568
|
+
|
|
569
|
+
check_type = "field_null_values"
|
|
570
|
+
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
571
|
+
sodacl_check_dict = {
|
|
572
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
573
|
+
{
|
|
574
|
+
f"missing_count({field_name_for_soda}) {threshold}": {
|
|
575
|
+
"name": check_key,
|
|
576
|
+
},
|
|
577
|
+
}
|
|
578
|
+
],
|
|
579
|
+
}
|
|
580
|
+
return Check(
|
|
581
|
+
id=str(uuid.uuid4()),
|
|
582
|
+
key=check_key,
|
|
583
|
+
category="quality",
|
|
584
|
+
type=check_type,
|
|
585
|
+
name=f"Check that field {field_name} has missing_count {threshold}",
|
|
586
|
+
model=model_name,
|
|
587
|
+
field=field_name,
|
|
588
|
+
engine="soda",
|
|
589
|
+
language="sodacl",
|
|
590
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def check_field_invalid_values(
|
|
595
|
+
model_name: str,
|
|
596
|
+
field_name: str,
|
|
597
|
+
threshold: str,
|
|
598
|
+
valid_values: list = None,
|
|
599
|
+
quoting_config: QuotingConfig = QuotingConfig(),
|
|
600
|
+
):
|
|
601
|
+
if quoting_config.quote_field_name:
|
|
602
|
+
field_name_for_soda = f'"{field_name}"'
|
|
603
|
+
else:
|
|
604
|
+
field_name_for_soda = field_name
|
|
605
|
+
|
|
606
|
+
check_type = "field_invalid_values"
|
|
607
|
+
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
608
|
+
|
|
609
|
+
sodacl_check_config = {
|
|
610
|
+
"name": check_key,
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
if valid_values is not None:
|
|
614
|
+
sodacl_check_config["valid values"] = valid_values
|
|
615
|
+
|
|
616
|
+
sodacl_check_dict = {
|
|
617
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
618
|
+
{
|
|
619
|
+
f"invalid_count({field_name_for_soda}) {threshold}": sodacl_check_config,
|
|
620
|
+
}
|
|
621
|
+
],
|
|
622
|
+
}
|
|
623
|
+
return Check(
|
|
624
|
+
id=str(uuid.uuid4()),
|
|
625
|
+
key=check_key,
|
|
626
|
+
category="quality",
|
|
627
|
+
type=check_type,
|
|
628
|
+
name=f"Check that field {field_name} has invalid_count {threshold}",
|
|
629
|
+
model=model_name,
|
|
630
|
+
field=field_name,
|
|
631
|
+
engine="soda",
|
|
632
|
+
language="sodacl",
|
|
633
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
634
|
+
)
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def check_field_missing_values(
|
|
638
|
+
model_name: str,
|
|
639
|
+
field_name: str,
|
|
640
|
+
threshold: str,
|
|
641
|
+
missing_values: list = None,
|
|
642
|
+
quoting_config: QuotingConfig = QuotingConfig(),
|
|
643
|
+
):
|
|
644
|
+
if quoting_config.quote_field_name:
|
|
645
|
+
field_name_for_soda = f'"{field_name}"'
|
|
646
|
+
else:
|
|
647
|
+
field_name_for_soda = field_name
|
|
648
|
+
|
|
649
|
+
check_type = "field_missing_values"
|
|
650
|
+
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
651
|
+
|
|
652
|
+
sodacl_check_config = {
|
|
653
|
+
"name": check_key,
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
if missing_values is not None:
|
|
657
|
+
# Filter out null/None values as SodaCL handles these automatically
|
|
658
|
+
filtered_missing_values = [v for v in missing_values if v is not None]
|
|
659
|
+
if filtered_missing_values:
|
|
660
|
+
sodacl_check_config["missing values"] = filtered_missing_values
|
|
661
|
+
|
|
662
|
+
sodacl_check_dict = {
|
|
663
|
+
checks_for(model_name, quoting_config, check_type): [
|
|
664
|
+
{
|
|
665
|
+
f"missing_count({field_name_for_soda}) {threshold}": sodacl_check_config,
|
|
666
|
+
}
|
|
667
|
+
],
|
|
668
|
+
}
|
|
669
|
+
return Check(
|
|
670
|
+
id=str(uuid.uuid4()),
|
|
671
|
+
key=check_key,
|
|
672
|
+
category="quality",
|
|
673
|
+
type=check_type,
|
|
674
|
+
name=f"Check that field {field_name} has missing_count {threshold}",
|
|
675
|
+
model=model_name,
|
|
676
|
+
field=field_name,
|
|
677
|
+
engine="soda",
|
|
678
|
+
language="sodacl",
|
|
679
|
+
implementation=yaml.dump(sodacl_check_dict),
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def check_quality_list(
|
|
684
|
+
model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig()
|
|
685
|
+
) -> List[Check]:
|
|
448
686
|
checks: List[Check] = []
|
|
449
687
|
|
|
450
688
|
count = 0
|
|
@@ -457,15 +695,20 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
|
|
|
457
695
|
check_key = f"{model_name}__{field_name}__quality_sql_{count}"
|
|
458
696
|
check_type = "model_quality_sql"
|
|
459
697
|
threshold = to_sodacl_threshold(quality)
|
|
460
|
-
query = prepare_query(quality, model_name, field_name)
|
|
698
|
+
query = prepare_query(quality, model_name, field_name, quoting_config)
|
|
461
699
|
if query is None:
|
|
462
700
|
logger.warning(f"Quality check {check_key} has no query")
|
|
463
701
|
continue
|
|
464
702
|
if threshold is None:
|
|
465
703
|
logger.warning(f"Quality check {check_key} has no valid threshold")
|
|
466
704
|
continue
|
|
705
|
+
|
|
706
|
+
if quoting_config.quote_model_name:
|
|
707
|
+
model_name_for_soda = f'"{model_name}"'
|
|
708
|
+
else:
|
|
709
|
+
model_name_for_soda = model_name
|
|
467
710
|
sodacl_check_dict = {
|
|
468
|
-
f"checks for {
|
|
711
|
+
f"checks for {model_name_for_soda}": [
|
|
469
712
|
{
|
|
470
713
|
f"{check_key} {threshold}": {
|
|
471
714
|
f"{check_key} query": query,
|
|
@@ -488,12 +731,57 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
|
|
|
488
731
|
implementation=yaml.dump(sodacl_check_dict),
|
|
489
732
|
)
|
|
490
733
|
)
|
|
734
|
+
elif quality.metric is not None:
|
|
735
|
+
threshold = to_sodacl_threshold(quality)
|
|
736
|
+
|
|
737
|
+
if threshold is None:
|
|
738
|
+
logger.warning(f"Quality metric {quality.metric} has no valid threshold")
|
|
739
|
+
continue
|
|
740
|
+
|
|
741
|
+
if quality.metric == "rowCount":
|
|
742
|
+
checks.append(check_row_count(model_name, threshold, quoting_config))
|
|
743
|
+
elif quality.metric == "duplicateValues":
|
|
744
|
+
if field_name is None:
|
|
745
|
+
# TODO check that quality.arguments.get("properties") is a list of strings and contains at least one property
|
|
746
|
+
checks.append(
|
|
747
|
+
check_model_duplicate_values(
|
|
748
|
+
model_name, quality.arguments.get("properties"), threshold, quoting_config
|
|
749
|
+
)
|
|
750
|
+
)
|
|
751
|
+
else:
|
|
752
|
+
checks.append(check_field_duplicate_values(model_name, field_name, threshold, quoting_config))
|
|
753
|
+
elif quality.metric == "nullValues":
|
|
754
|
+
if field_name is not None:
|
|
755
|
+
checks.append(check_field_null_values(model_name, field_name, threshold, quoting_config))
|
|
756
|
+
else:
|
|
757
|
+
logger.warning("Quality check nullValues is only supported at field level")
|
|
758
|
+
elif quality.metric == "invalidValues":
|
|
759
|
+
if field_name is not None:
|
|
760
|
+
valid_values = quality.arguments.get("validValues") if quality.arguments else None
|
|
761
|
+
checks.append(
|
|
762
|
+
check_field_invalid_values(model_name, field_name, threshold, valid_values, quoting_config)
|
|
763
|
+
)
|
|
764
|
+
else:
|
|
765
|
+
logger.warning("Quality check invalidValues is only supported at field level")
|
|
766
|
+
elif quality.metric == "missingValues":
|
|
767
|
+
if field_name is not None:
|
|
768
|
+
missing_values = quality.arguments.get("missingValues") if quality.arguments else None
|
|
769
|
+
checks.append(
|
|
770
|
+
check_field_missing_values(model_name, field_name, threshold, missing_values, quoting_config)
|
|
771
|
+
)
|
|
772
|
+
else:
|
|
773
|
+
logger.warning("Quality check missingValues is only supported at field level")
|
|
774
|
+
else:
|
|
775
|
+
logger.warning(f"Quality check {quality.metric} is not yet supported")
|
|
776
|
+
|
|
491
777
|
count += 1
|
|
492
778
|
|
|
493
779
|
return checks
|
|
494
780
|
|
|
495
781
|
|
|
496
|
-
def prepare_query(
|
|
782
|
+
def prepare_query(
|
|
783
|
+
quality: Quality, model_name: str, field_name: str = None, quoting_config: QuotingConfig = QuotingConfig()
|
|
784
|
+
) -> str | None:
|
|
497
785
|
if quality.query is None:
|
|
498
786
|
return None
|
|
499
787
|
if quality.query == "":
|
|
@@ -501,12 +789,26 @@ def prepare_query(quality: Quality, model_name: str, field_name: str = None) ->
|
|
|
501
789
|
|
|
502
790
|
query = quality.query
|
|
503
791
|
|
|
504
|
-
|
|
505
|
-
|
|
792
|
+
if quoting_config.quote_field_name:
|
|
793
|
+
field_name_for_soda = f'"{field_name}"'
|
|
794
|
+
else:
|
|
795
|
+
field_name_for_soda = field_name
|
|
796
|
+
|
|
797
|
+
if quoting_config.quote_model_name:
|
|
798
|
+
model_name_for_soda = f'"{model_name}"'
|
|
799
|
+
elif quoting_config.quote_model_name_with_backticks:
|
|
800
|
+
model_name_for_soda = f"`{model_name}`"
|
|
801
|
+
else:
|
|
802
|
+
model_name_for_soda = model_name
|
|
803
|
+
|
|
804
|
+
query = re.sub(r'["\']?\{model}["\']?', model_name_for_soda, query)
|
|
805
|
+
query = re.sub(r'["\']?{schema}["\']?', model_name_for_soda, query)
|
|
806
|
+
query = re.sub(r'["\']?{table}["\']?', model_name_for_soda, query)
|
|
506
807
|
|
|
507
808
|
if field_name is not None:
|
|
508
|
-
query =
|
|
509
|
-
query =
|
|
809
|
+
query = re.sub(r'["\']?{field}["\']?', field_name_for_soda, query)
|
|
810
|
+
query = re.sub(r'["\']?{column}["\']?', field_name_for_soda, query)
|
|
811
|
+
query = re.sub(r'["\']?{property}["\']?', field_name_for_soda, query)
|
|
510
812
|
|
|
511
813
|
return query
|
|
512
814
|
|
|
@@ -518,10 +820,14 @@ def to_sodacl_threshold(quality: Quality) -> str | None:
|
|
|
518
820
|
return f"!= {quality.mustNotBe}"
|
|
519
821
|
if quality.mustBeGreaterThan is not None:
|
|
520
822
|
return f"> {quality.mustBeGreaterThan}"
|
|
823
|
+
if quality.mustBeGreaterOrEqualTo is not None:
|
|
824
|
+
return f">= {quality.mustBeGreaterOrEqualTo}"
|
|
521
825
|
if quality.mustBeGreaterThanOrEqualTo is not None:
|
|
522
826
|
return f">= {quality.mustBeGreaterThanOrEqualTo}"
|
|
523
827
|
if quality.mustBeLessThan is not None:
|
|
524
828
|
return f"< {quality.mustBeLessThan}"
|
|
829
|
+
if quality.mustBeLessOrEqualTo is not None:
|
|
830
|
+
return f"<= {quality.mustBeLessOrEqualTo}"
|
|
525
831
|
if quality.mustBeLessThanOrEqualTo is not None:
|
|
526
832
|
return f"<= {quality.mustBeLessThanOrEqualTo}"
|
|
527
833
|
if quality.mustBeBetween is not None:
|
|
@@ -594,7 +900,7 @@ def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecificatio
|
|
|
594
900
|
check_key = "servicelevel_freshness"
|
|
595
901
|
|
|
596
902
|
sodacl_check_dict = {
|
|
597
|
-
checks_for(model_name,
|
|
903
|
+
checks_for(model_name, QuotingConfig(), check_type): [
|
|
598
904
|
{
|
|
599
905
|
f"freshness({field_name}) < {threshold}": {
|
|
600
906
|
"name": check_key,
|
|
@@ -646,7 +952,7 @@ def to_servicelevel_retention_check(data_contract_spec) -> Check | None:
|
|
|
646
952
|
check_type = "servicelevel_retention"
|
|
647
953
|
check_key = "servicelevel_retention"
|
|
648
954
|
sodacl_check_dict = {
|
|
649
|
-
checks_for(model_name,
|
|
955
|
+
checks_for(model_name, QuotingConfig(), check_type): [
|
|
650
956
|
{
|
|
651
957
|
f"orders_servicelevel_retention < {period_in_seconds}": {
|
|
652
958
|
"orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
|