datacontract-cli 0.10.20__py3-none-any.whl → 0.10.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/{web.py → api.py} +55 -3
- datacontract/breaking/breaking.py +1 -1
- datacontract/breaking/breaking_rules.py +1 -1
- datacontract/cli.py +32 -10
- datacontract/data_contract.py +14 -100
- datacontract/engines/data_contract_checks.py +735 -0
- datacontract/engines/data_contract_test.py +51 -0
- datacontract/engines/soda/check_soda_execute.py +36 -30
- datacontract/engines/soda/connections/kafka.py +8 -3
- datacontract/export/avro_converter.py +2 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/exporter.py +1 -2
- datacontract/export/exporter_factory.py +4 -12
- datacontract/export/sodacl_converter.py +22 -294
- datacontract/export/sql_type_converter.py +7 -2
- datacontract/imports/odcs_importer.py +6 -3
- datacontract/imports/odcs_v3_importer.py +2 -0
- datacontract/imports/sql_importer.py +229 -29
- datacontract/lint/urls.py +4 -4
- datacontract/model/data_contract_specification.py +130 -129
- datacontract/model/exceptions.py +4 -1
- datacontract/model/run.py +25 -18
- datacontract/templates/datacontract.html +16 -2
- datacontract/templates/partials/definition.html +3 -95
- datacontract/templates/partials/model_field.html +13 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/style/output.css +151 -152
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +238 -184
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +34 -34
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/export/odcs_v2_exporter.py +0 -124
- datacontract/imports/odcs_v2_importer.py +0 -177
- datacontract/lint/linters/example_model_linter.py +0 -91
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.20.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,735 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from typing import List
|
|
3
|
+
from venv import logger
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
|
|
7
|
+
from datacontract.export.sql_type_converter import convert_to_sql_type
|
|
8
|
+
from datacontract.model.data_contract_specification import DataContractSpecification, Quality, Server
|
|
9
|
+
from datacontract.model.run import Check
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
    """Collect every check derived from the data contract for the given server.

    The result contains, in order: the checks of each model, the
    service-level checks, and the root-level quality check. Entries that
    are ``None`` are dropped before returning.
    """
    collected: List[Check] = []
    for name, model in data_contract_spec.models.items():
        collected += to_model_checks(name, model, server)
    collected += to_servicelevel_checks(data_contract_spec)
    # to_quality_check may return None; it is filtered out below.
    collected.append(to_quality_check(data_contract_spec))
    return list(filter(lambda entry: entry is not None, collected))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
    """Derive all schema and quality checks for a single model.

    For every field this emits a presence check, a type check (only when
    ``is_check_types`` allows it and the field declares a type), one check
    per declared constraint (required, unique, length bounds, value bounds,
    pattern, enum) and the field-level quality checks. Model-level quality
    checks are appended last.
    """
    server_type = server.type if server and server.type else None
    model_name = to_model_name(model_key, model_value, server_type)
    with_type_checks = is_check_types(server)
    # Names are wrapped in double quotes only for these server types.
    quoted = server_type in ["postgres", "sqlserver"]

    result: List[Check] = []
    for field_name, field in model_value.fields.items():
        result.append(check_field_is_present(model_name, field_name, quoted))

        if with_type_checks and field.type is not None:
            expected_type = convert_to_sql_type(field, server_type)
            result.append(check_field_type(model_name, field_name, expected_type, quoted))
        if field.required:
            result.append(check_field_required(model_name, field_name, quoted))
        if field.unique:
            result.append(check_field_unique(model_name, field_name, quoted))

        # Length and inclusive value bounds all follow the same shape.
        for declared, build in (
            (field.minLength, check_field_min_length),
            (field.maxLength, check_field_max_length),
            (field.minimum, check_field_minimum),
            (field.maximum, check_field_maximum),
        ):
            if declared is not None:
                result.append(build(model_name, field_name, declared, quoted))

        # An exclusive bound is expressed as the inclusive bound check plus
        # a "not equal to the bound" check.
        lower_exclusive = field.exclusiveMinimum
        if lower_exclusive is not None:
            result.append(check_field_minimum(model_name, field_name, lower_exclusive, quoted))
            result.append(check_field_not_equal(model_name, field_name, lower_exclusive, quoted))
        upper_exclusive = field.exclusiveMaximum
        if upper_exclusive is not None:
            result.append(check_field_maximum(model_name, field_name, upper_exclusive, quoted))
            result.append(check_field_not_equal(model_name, field_name, upper_exclusive, quoted))

        if field.pattern is not None:
            result.append(check_field_regex(model_name, field_name, field.pattern, quoted))
        if field.enum is not None and len(field.enum) > 0:
            result.append(check_field_enum(model_name, field_name, field.enum, quoted))
        if field.quality is not None and len(field.quality) > 0:
            field_quality = check_quality_list(model_name, field_name, field.quality)
            if field_quality:
                result.extend(field_quality)
        # TODO references: str = None
        # TODO format

    if model_value.quality is not None and len(model_value.quality) > 0:
        model_quality = check_quality_list(model_name, None, model_value.quality)
        if model_quality:
            result.extend(model_quality)

    return result
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def checks_for(model_name, quote_field_name):
    """Return the SodaCL section header ``checks for <model>``.

    The model name is wrapped in double quotes when ``quote_field_name``
    is truthy.
    """
    target = f'"{model_name}"' if quote_field_name else model_name
    return f"checks for {target}"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def is_check_types(server: Server) -> bool:
    """Tell whether column type checks should be generated for ``server``.

    Returns ``True`` when no server is given; otherwise ``True`` unless the
    server format is ``json``, ``csv`` or ``avro``.
    """
    if server is None:
        return True
    return server.format not in ("json", "csv", "avro")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def to_model_name(model_key, model_value, server_type):
    """Resolve the table name to use for a model on the given server type.

    If the model's ``config`` holds the server-specific table override
    (``databricksTable``, ``snowflakeTable``, ``sqlserverTable`` or
    ``postgresTable``), that value is returned; otherwise ``model_key``.
    """
    override_key_by_server = {
        "databricks": "databricksTable",
        "snowflake": "snowflakeTable",
        "sqlserver": "sqlserverTable",
        "postgres": "postgresTable",
        "postgresql": "postgresTable",
    }
    override_key = override_key_by_server.get(server_type)
    if override_key is None:
        return model_key

    config = model_value.config
    if config is not None and override_key in config:
        return config[override_key]
    return model_key
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def check_field_is_present(model_name, field_name, quote_field_name: bool) -> Check:
    """Build a SodaCL schema check that fails when the column is missing."""
    kind = "field_is_present"
    key = f"{model_name}__{field_name}__{kind}"
    schema_rule = {
        "name": key,
        "fail": {
            "when required column missing": [field_name],
        },
    }
    sodacl = {checks_for(model_name, quote_field_name): [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field '{field_name}' is present",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def check_field_type(model_name: str, field_name: str, expected_type: str, quote_field_name: bool = False):
    """Build a SodaCL schema check that fails when the column has the wrong type."""
    kind = "field_type"
    key = f"{model_name}__{field_name}__{kind}"
    schema_rule = {
        "name": key,
        "fail": {
            "when wrong column type": {field_name: expected_type},
        },
    }
    sodacl = {checks_for(model_name, quote_field_name): [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has type {expected_type}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def check_field_required(model_name: str, field_name: str, quote_field_name: bool = False):
    """Build a SodaCL check asserting ``missing_count(<field>) = 0``.

    The field name is wrapped in double quotes inside the metric when
    ``quote_field_name`` is set.
    """
    kind = "field_required"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    metric_check = {f"missing_count({column}) = 0": {"name": key}}
    sodacl = {checks_for(model_name, quote_field_name): [metric_check]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has no missing values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def check_field_unique(model_name: str, field_name: str, quote_field_name: bool = False):
    """Build a SodaCL check asserting ``duplicate_count(<field>) = 0``.

    The field name is wrapped in double quotes inside the metric when
    ``quote_field_name`` is set.
    """
    kind = "field_unique"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    metric_check = {f"duplicate_count({column}) = 0": {"name": key}}
    sodacl = {checks_for(model_name, quote_field_name): [metric_check]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that unique field {field_name} has no duplicate values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def check_field_min_length(model_name: str, field_name: str, min_length: int, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid min length`` set to ``min_length``."""
    kind = "field_min_length"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid min length": min_length,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a min length of {min_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def check_field_max_length(model_name: str, field_name: str, max_length: int, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid max length`` set to ``max_length``."""
    kind = "field_max_length"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid max length": max_length,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a max length of {max_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid min`` set to ``minimum``."""
    kind = "field_minimum"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid min": minimum,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a minimum of {minimum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid max`` set to ``maximum``."""
    kind = "field_maximum"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid max": maximum,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a maximum of {maximum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def check_field_not_equal(model_name: str, field_name: str, value: int, quote_field_name: bool = False):
    """Build a SodaCL validity check that lists ``value`` as the only invalid value."""
    kind = "field_not_equal"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "invalid values": [value],
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} is not equal to {value}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid values`` set to ``enum``."""
    kind = "field_enum"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid values": enum,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} only contains enum values {enum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def check_field_regex(model_name: str, field_name: str, pattern: str, quote_field_name: bool = False):
    """Build a SodaCL validity check with ``valid regex`` set to ``pattern``."""
    kind = "field_regex"
    key = f"{model_name}__{field_name}__{kind}"
    column = f'"{field_name}"' if quote_field_name else field_name
    validity = {
        "name": key,
        "valid regex": pattern,
    }
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} matches regex pattern {pattern}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> List[Check]:
    """Turn the ``sql`` quality entries of a model or field into SodaCL checks.

    Args:
        model_name: Name of the model the quality entries belong to.
        field_name: Name of the field, or ``None`` for model-level entries.
        quality_list: Quality entries to convert; entries whose ``type`` is
            not ``"sql"`` produce no check.

    Returns:
        One ``Check`` per ``sql`` entry that has both a query and a valid
        threshold. Entries lacking either are skipped with a warning.
    """
    checks: List[Check] = []

    # NOTE(review): the counter is advanced once per loop iteration that is
    # not skipped via ``continue``; the original nesting of ``count += 1``
    # should be confirmed against the upstream file.
    count = 0
    for quality in quality_list:
        if quality.type == "sql":
            # Fix: the type labels were swapped. A missing field name means a
            # model-level check; a field name means a field-level check.
            if field_name is None:
                check_key = f"{model_name}__quality_sql_{count}"
                check_type = "model_quality_sql"
            else:
                check_key = f"{model_name}__{field_name}__quality_sql_{count}"
                check_type = "field_quality_sql"
            threshold = to_sodacl_threshold(quality)
            query = prepare_query(quality, model_name, field_name)
            if query is None:
                logger.warning(f"Quality check {check_key} has no query")
                continue
            if threshold is None:
                logger.warning(f"Quality check {check_key} has no valid threshold")
                continue
            # The check key doubles as the metric name: "<key> <threshold>"
            # is the check and "<key> query" supplies its SQL.
            sodacl_check_dict = {
                f"checks for {model_name}": [
                    {
                        f"{check_key} {threshold}": {
                            f"{check_key} query": query,
                            "name": check_key,
                        },
                    }
                ]
            }
            checks.append(
                Check(
                    id=str(uuid.uuid4()),
                    key=check_key,
                    category="quality",
                    type=check_type,
                    name=quality.description if quality.description is not None else "Quality Check",
                    model=model_name,
                    field=field_name,
                    engine="soda",
                    language="sodacl",
                    implementation=yaml.dump(sodacl_check_dict),
                )
            )
        count += 1

    return checks
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> str | None:
    """Fill the placeholders of a quality query.

    ``{model}`` and ``{table}`` are replaced by ``model_name``; ``{field}``
    and ``{column}`` are replaced by ``field_name`` when one is given.
    Returns ``None`` when the quality entry has no query or an empty one.
    """
    template = quality.query
    if template is None or template == "":
        return None

    substitutions = [("{model}", model_name), ("{table}", model_name)]
    if field_name is not None:
        substitutions += [("{field}", field_name), ("{column}", field_name)]

    for placeholder, replacement in substitutions:
        template = template.replace(placeholder, replacement)
    return template
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def to_sodacl_threshold(quality: Quality) -> str | None:
    """Translate the first threshold set on ``quality`` into a SodaCL comparison.

    Thresholds are examined in a fixed order (``mustBe``, ``mustNotBe``,
    greater/less variants, then the two range variants); the first one that
    is not ``None`` wins. Returns ``None`` when no threshold is set or when
    a range threshold does not hold exactly two values.
    """
    single_value_operators = (
        ("mustBe", "="),
        ("mustNotBe", "!="),
        ("mustBeGreaterThan", ">"),
        ("mustBeGreaterThanOrEqualTo", ">="),
        ("mustBeLessThan", "<"),
        ("mustBeLessThanOrEqualTo", "<="),
    )
    for attribute, operator in single_value_operators:
        bound = getattr(quality, attribute)
        if bound is not None:
            return f"{operator} {bound}"

    inside = quality.mustBeBetween
    if inside is not None:
        if len(inside) == 2:
            return f"between {inside[0]} and {inside[1]}"
        logger.warning(
            f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {inside}"
        )
        return None

    outside = quality.mustNotBeBetween
    if outside is not None:
        if len(outside) == 2:
            return f"not between {outside[0]} and {outside[1]}"
        logger.warning(
            f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {outside}"
        )
        return None

    return None
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def to_servicelevel_checks(data_contract_spec: DataContractSpecification) -> List[Check]:
    """Build the freshness and retention checks declared under ``servicelevels``.

    Returns an empty list when the contract has no service levels. Checks
    that the individual builders decline to create (``None``) are omitted.
    """
    service_levels = data_contract_spec.servicelevels
    if service_levels is None:
        return []

    candidates: List[Check] = []
    if service_levels.freshness is not None:
        candidates.append(to_servicelevel_freshness_check(data_contract_spec))
    if service_levels.retention is not None:
        candidates.append(to_servicelevel_retention_check(data_contract_spec))
    # The builders return None when a check cannot be derived.
    return [candidate for candidate in candidates if candidate is not None]
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecification) -> Check | None:
    """Build the SodaCL ``freshness(<field>) < <threshold>`` check.

    Returns ``None`` (logging the reason in most cases) when the timestamp
    field or threshold is missing, the threshold contains none of the
    letters d/h/m in either case, the timestamp field is not of the form
    ``<model>.<field>``, or the model is not part of the contract.
    """
    freshness = data_contract_spec.servicelevels.freshness
    qualified_field = freshness.timestampField
    if qualified_field is None:
        return None

    raw_threshold = freshness.threshold
    if raw_threshold is None:
        logger.info("servicelevel.freshness.threshold is not defined")
        return None
    if not any(unit in raw_threshold for unit in ("d", "D", "h", "H", "m", "M")):
        logger.info("servicelevel.freshness.threshold must be in days, hours, or minutes (e.g., PT1H, or 1h)")
        return None

    if "." not in qualified_field:
        logger.info("servicelevel.freshness.timestampField is not fully qualified, skipping freshness check")
        return None
    if qualified_field.count(".") > 1:
        logger.info(
            "servicelevel.freshness.timestampField contains multiple dots, which is currently not supported, skipping freshness check"
        )
        return None

    # Exactly one dot is guaranteed here, so the split yields two parts.
    model_name, field_name = qualified_field.split(".")
    # Strip the "P"/"T" markers and lower-case, e.g. "PT1H" becomes "1h".
    threshold = raw_threshold.replace("P", "").replace("T", "").lower()

    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping freshness check")
        return None

    kind = "servicelevel_freshness"
    key = "servicelevel_freshness"
    sodacl = {checks_for(model_name, False): [{f"freshness({field_name}) < {threshold}": {"name": key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="servicelevel",
        type=kind,
        name="Freshness",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def to_servicelevel_retention_check(data_contract_spec) -> Check | None:
    """Build the SodaCL retention check from ``servicelevels.retention``.

    The check compares the age in seconds of the oldest timestamp
    (``TIMESTAMPDIFF(SECOND, MIN(<field>), CURRENT_TIMESTAMP)``) against the
    retention period converted to seconds.

    Returns:
        The retention ``Check``, or ``None`` when retention is absent or
        unlimited, the timestamp field or period is missing, the period
        cannot be converted to seconds, the timestamp field is not of the
        form ``<model>.<field>``, or the model is not part of the contract.
    """
    retention = data_contract_spec.servicelevels.retention
    if retention is None:
        return None
    if retention.unlimited is True:
        return None
    if retention.timestampField is None:
        logger.info("servicelevel.retention.timestampField is not defined")
        return None
    if retention.period is None:
        logger.info("servicelevel.retention.period is not defined")
        return None

    timestamp_field_fully_qualified = retention.timestampField
    if "." not in timestamp_field_fully_qualified:
        logger.info("servicelevel.retention.timestampField is not fully qualified, skipping retention check")
        return None
    if timestamp_field_fully_qualified.count(".") > 1:
        logger.info(
            "servicelevel.retention.timestampField contains multiple dots, which is currently not supported, skipping retention check"
        )
        return None

    # Exactly one dot is guaranteed here, so the split yields two parts.
    model_name, field_name = timestamp_field_fully_qualified.split(".")
    period = retention.period
    period_in_seconds = period_to_seconds(period)
    if period_in_seconds is None:
        # Fix: an unparsable period used to be interpolated as "< None",
        # producing an invalid SodaCL check. Skip the check instead.
        logger.info(
            f"servicelevel.retention.period {period} is not a supported duration, skipping retention check"
        )
        return None
    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping retention check")
        return None

    check_type = "servicelevel_retention"
    check_key = "servicelevel_retention"
    # NOTE(review): the metric name is hard-coded with an "orders_" prefix
    # regardless of the model; kept as-is because it is part of the emitted
    # SodaCL.
    sodacl_check_dict = {
        checks_for(model_name, False): [
            {
                f"orders_servicelevel_retention < {period_in_seconds}": {
                    "orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
                    "name": check_key,
                }
            },
        ]
    }
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="servicelevel",
        type=check_type,
        name=f"Retention: Oldest entry has a max age of {period}",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl_check_dict),
    )
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def period_to_seconds(period: str) -> int | None:
    """Convert a retention period to a number of seconds.

    Two notations are accepted (surrounding whitespace is ignored):

    * short form: digits followed by ``d``, ``h`` or ``m`` (``"30d"``,
      ``"24h"``, ``"60m"``);
    * ISO 8601 duration with years, months, days, hours, minutes and
      seconds (``"P1Y"``, ``"P1DT12H"``). Years count as 365 days and
      months as 30 days.

    Args:
        period: The period string.

    Returns:
        The period in seconds, or ``None`` when ``period`` is not a string,
        does not match either notation in full, or is an ISO duration
        without any component (such as a bare ``"P"``).
    """
    import re

    # The original raised TypeError on None; honour the declared return type.
    if not isinstance(period, str):
        return None
    period = period.strip()

    # Short form, e.g. "30d", "24h" or "60m".
    short_form = re.fullmatch(r"(\d+)([dhm])", period)
    if short_form:
        seconds_per_unit = {"d": 86400, "h": 3600, "m": 60}
        return int(short_form.group(1)) * seconds_per_unit[short_form.group(2)]

    # ISO period format (do not use isodate, can also be years).
    # fullmatch rejects trailing garbage that an unanchored match would
    # silently ignore (e.g. "P1Dxyz").
    iso_period = re.fullmatch(
        r"P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?"
        r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?",
        period,
    )
    if iso_period is None:
        return None

    components = iso_period.groupdict()
    # Every group is optional, so "P" or "PT" would otherwise yield 0 seconds.
    if all(value is None for value in components.values()):
        return None

    years = int(components["years"] or 0)
    months = int(components["months"] or 0)
    days = int(components["days"] or 0)
    hours = int(components["hours"] or 0)
    minutes = int(components["minutes"] or 0)
    seconds = int(components["seconds"] or 0)

    return (
        years * 365 * 86400  # Approximate conversion of years to seconds
        + months * 30 * 86400  # Approximate conversion of months to seconds
        + days * 86400
        + hours * 3600
        + minutes * 60
        + seconds
    )
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
# These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
|
|
713
|
+
def to_quality_check(data_contract_spec) -> Check | None:
    """Wrap the root-level SodaCL quality specification in a single check.

    Returns ``None`` when the contract has no root-level quality section,
    the section has no type, or the type is not ``sodacl`` (compared
    case-insensitively). A specification given as a string is parsed as
    YAML before being dumped into the check implementation.
    """
    quality = data_contract_spec.quality
    if quality is None or quality.type is None or quality.type.lower() != "sodacl":
        return None

    specification = quality.specification
    if isinstance(specification, str):
        specification = yaml.safe_load(specification)

    return Check(
        id=str(uuid.uuid4()),
        key="quality__sodacl",
        category="quality",
        type="quality",
        name="Quality Check",
        model=None,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(specification),
    )
|