datacontract-cli 0.10.21__py3-none-any.whl → 0.10.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacontract-cli might be problematic. Click here for more details.

Files changed (29)
  1. datacontract/breaking/breaking.py +1 -1
  2. datacontract/breaking/breaking_rules.py +1 -1
  3. datacontract/cli.py +5 -5
  4. datacontract/data_contract.py +14 -100
  5. datacontract/engines/data_contract_checks.py +735 -0
  6. datacontract/engines/data_contract_test.py +51 -0
  7. datacontract/engines/soda/check_soda_execute.py +36 -30
  8. datacontract/engines/soda/connections/kafka.py +8 -3
  9. datacontract/export/avro_converter.py +2 -0
  10. datacontract/export/exporter.py +0 -2
  11. datacontract/export/exporter_factory.py +0 -12
  12. datacontract/export/sodacl_converter.py +22 -294
  13. datacontract/export/sql_type_converter.py +7 -2
  14. datacontract/imports/odcs_importer.py +6 -3
  15. datacontract/imports/odcs_v3_importer.py +2 -0
  16. datacontract/imports/sql_importer.py +229 -29
  17. datacontract/model/exceptions.py +4 -1
  18. datacontract/model/run.py +11 -4
  19. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/METADATA +139 -166
  20. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/RECORD +25 -27
  21. datacontract/engines/soda/connections/dask.py +0 -28
  22. datacontract/export/odcs_v2_exporter.py +0 -124
  23. datacontract/imports/odcs_v2_importer.py +0 -177
  24. datacontract/lint/linters/example_model_linter.py +0 -91
  25. /datacontract/{model → breaking}/breaking_change.py +0 -0
  26. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/LICENSE +0 -0
  27. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/WHEEL +0 -0
  28. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/entry_points.txt +0 -0
  29. {datacontract_cli-0.10.21.dist-info → datacontract_cli-0.10.22.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,735 @@
1
+ import uuid
2
+ from typing import List
3
+ from venv import logger
4
+
5
+ import yaml
6
+
7
+ from datacontract.export.sql_type_converter import convert_to_sql_type
8
+ from datacontract.model.data_contract_specification import DataContractSpecification, Quality, Server
9
+ from datacontract.model.run import Check
10
+
11
+
12
def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
    """Collect every check derived from a data contract.

    Gathers the per-model checks, then the service-level checks, then the
    root-level quality check, and drops any entry that is ``None``.

    Args:
        data_contract_spec: The contract whose models, service levels and
            root-level quality section are turned into checks.
        server: Server definition forwarded to the per-model check builder.

    Returns:
        The list of checks, in the order described above, without ``None`` entries.
    """
    candidates: List[Check] = [
        model_check
        for model_key, model_value in data_contract_spec.models.items()
        for model_check in to_model_checks(model_key, model_value, server)
    ]
    candidates += to_servicelevel_checks(data_contract_spec)
    # to_quality_check may return None; it is filtered out below.
    candidates.append(to_quality_check(data_contract_spec))
    return list(filter(lambda candidate: candidate is not None, candidates))
20
+
21
+
22
def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
    """Build all field-level and model-level checks for a single model.

    Args:
        model_key: Key of the model inside the contract's ``models`` mapping.
        model_value: Model definition; its ``fields``, ``quality`` and
            ``config`` attributes are read.
        server: Server definition (may be ``None``); its ``type`` selects the
            SQL dialect and whether names are quoted.

    Returns:
        Checks in field order; for each field the presence check comes first,
        followed by the checks for every constraint that is set. Model-level
        quality checks come last.
    """
    server_type = (server.type or None) if server else None
    table = to_model_name(model_key, model_value, server_type)
    with_types = is_check_types(server)
    quoted = server_type in ["postgres", "sqlserver"]

    result: List[Check] = []
    for column, spec in model_value.fields.items():
        result.append(check_field_is_present(table, column, quoted))
        if with_types and spec.type is not None:
            result.append(check_field_type(table, column, convert_to_sql_type(spec, server_type), quoted))
        if spec.required:
            result.append(check_field_required(table, column, quoted))
        if spec.unique:
            result.append(check_field_unique(table, column, quoted))
        if spec.minLength is not None:
            result.append(check_field_min_length(table, column, spec.minLength, quoted))
        if spec.maxLength is not None:
            result.append(check_field_max_length(table, column, spec.maxLength, quoted))
        if spec.minimum is not None:
            result.append(check_field_minimum(table, column, spec.minimum, quoted))
        if spec.maximum is not None:
            result.append(check_field_maximum(table, column, spec.maximum, quoted))
        # An exclusive bound is expressed as an inclusive bound plus a
        # "must not equal the bound" check.
        if spec.exclusiveMinimum is not None:
            result += [
                check_field_minimum(table, column, spec.exclusiveMinimum, quoted),
                check_field_not_equal(table, column, spec.exclusiveMinimum, quoted),
            ]
        if spec.exclusiveMaximum is not None:
            result += [
                check_field_maximum(table, column, spec.exclusiveMaximum, quoted),
                check_field_not_equal(table, column, spec.exclusiveMaximum, quoted),
            ]
        if spec.pattern is not None:
            result.append(check_field_regex(table, column, spec.pattern, quoted))
        if spec.enum:
            result.append(check_field_enum(table, column, spec.enum, quoted))
        if spec.quality:
            result.extend(check_quality_list(table, column, spec.quality) or [])
        # TODO references: str = None
        # TODO format

    if model_value.quality:
        result.extend(check_quality_list(table, None, model_value.quality) or [])

    return result
71
+
72
+
73
def checks_for(model_name, quote_field_name):
    """Return the SodaCL section header for a model.

    The model name is wrapped in double quotes when ``quote_field_name`` is truthy.
    """
    target = f'"{model_name}"' if quote_field_name else f"{model_name}"
    return "checks for " + target
77
+
78
+
79
def is_check_types(server: Server) -> bool:
    """Tell whether column type checks should be generated for this server.

    Returns ``True`` when no server is given, otherwise ``True`` unless the
    server's ``format`` is ``json``, ``csv`` or ``avro``.
    """
    if server is None:
        return True
    return server.format not in ("json", "csv", "avro")
83
+
84
+
85
def to_model_name(model_key, model_value, server_type):
    """Resolve the dataset name to use for a model on the given server type.

    For databricks, snowflake, sqlserver and postgres/postgresql a
    server-specific table name can be supplied in ``model_value.config``; when
    it is present it wins, otherwise the model key is used.
    """
    config_key_by_server_type = {
        "databricks": "databricksTable",
        "snowflake": "snowflakeTable",
        "sqlserver": "sqlserverTable",
        "postgres": "postgresTable",
        "postgresql": "postgresTable",
    }
    config_key = config_key_by_server_type.get(server_type)
    if config_key is not None:
        # Only touch the model config for server types that support an override.
        overrides = model_value.config
        if overrides is not None and config_key in overrides:
            return overrides[config_key]
    return model_key
99
+
100
+
101
def check_field_is_present(model_name, field_name, quote_field_name: bool) -> Check:
    """Build the schema check that fails when a required column is missing.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column that must exist.
        quote_field_name: Whether the dataset name is double-quoted in the header.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL.
    """
    kind = "field_is_present"
    key = f"{model_name}__{field_name}__{kind}"
    schema_rule = {
        "name": key,
        "fail": {"when required column missing": [field_name]},
    }
    sodacl = {checks_for(model_name, quote_field_name): [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field '{field_name}' is present",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
128
+
129
+
130
def check_field_type(model_name: str, field_name: str, expected_type: str, quote_field_name: bool = False):
    """Build the schema check that fails when a column has the wrong type.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose type is verified.
        expected_type: Type the column is expected to have.
        quote_field_name: Whether the dataset name is double-quoted in the header.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL.
    """
    kind = "field_type"
    key = f"{model_name}__{field_name}__{kind}"
    schema_rule = {
        "name": key,
        "fail": {"when wrong column type": {field_name: expected_type}},
    }
    sodacl = {checks_for(model_name, quote_field_name): [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has type {expected_type}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
159
+
160
+
161
def check_field_required(model_name: str, field_name: str, quote_field_name: bool = False):
    """Build the check asserting that a column has no missing values.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column that must not contain missing values.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``missing_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_required"
    key = f"{model_name}__{field_name}__{kind}"
    metric_check = {f"missing_count({column}) = 0": {"name": key}}
    sodacl = {checks_for(model_name, quote_field_name): [metric_check]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has no missing values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
190
+
191
+
192
def check_field_unique(model_name: str, field_name: str, quote_field_name: bool = False):
    """Build the check asserting that a column has no duplicate values.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column that must be unique.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``duplicate_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_unique"
    key = f"{model_name}__{field_name}__{kind}"
    metric_check = {f"duplicate_count({column}) = 0": {"name": key}}
    sodacl = {checks_for(model_name, quote_field_name): [metric_check]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that unique field {field_name} has no duplicate values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
221
+
222
+
223
def check_field_min_length(model_name: str, field_name: str, min_length: int, quote_field_name: bool = False):
    """Build the validity check enforcing a minimum length on a column.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        min_length: Value written to the ``valid min length`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_min_length"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid min length": min_length}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a min length of {min_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
253
+
254
+
255
def check_field_max_length(model_name: str, field_name: str, max_length: int, quote_field_name: bool = False):
    """Build the validity check enforcing a maximum length on a column.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        max_length: Value written to the ``valid max length`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_max_length"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid max length": max_length}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a max length of {max_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
285
+
286
+
287
def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_field_name: bool = False):
    """Build the validity check enforcing a lower bound on a column.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        minimum: Value written to the ``valid min`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_minimum"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid min": minimum}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a minimum of {minimum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
317
+
318
+
319
def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_field_name: bool = False):
    """Build the validity check enforcing an upper bound on a column.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        maximum: Value written to the ``valid max`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_maximum"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid max": maximum}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} has a maximum of {maximum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
349
+
350
+
351
def check_field_not_equal(model_name: str, field_name: str, value: int, quote_field_name: bool = False):
    """Build the validity check rejecting one specific value in a column.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        value: The single entry written to the ``invalid values`` list.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_not_equal"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "invalid values": [value]}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} is not equal to {value}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
381
+
382
+
383
def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_name: bool = False):
    """Build the validity check restricting a column to a set of allowed values.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        enum: List written to the ``valid values`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_enum"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid values": enum}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} only contains enum values {enum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
413
+
414
+
415
def check_field_regex(model_name: str, field_name: str, pattern: str, quote_field_name: bool = False):
    """Build the validity check requiring column values to match a regex.

    Args:
        model_name: Dataset name used in the SodaCL ``checks for`` header.
        field_name: Column whose values are validated.
        pattern: Regular expression written to the ``valid regex`` setting.
        quote_field_name: Whether dataset and column names are double-quoted.

    Returns:
        A ``Check`` whose ``implementation`` is the YAML-serialized SodaCL
        ``invalid_count(...) = 0`` check.
    """
    column = f'"{field_name}"' if quote_field_name else field_name
    kind = "field_regex"
    key = f"{model_name}__{field_name}__{kind}"
    validity = {"name": key, "valid regex": pattern}
    sodacl = {checks_for(model_name, quote_field_name): [{f"invalid_count({column}) = 0": validity}]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="schema",
        type=kind,
        name=f"Check that field {field_name} matches regex pattern {pattern}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
445
+
446
+
447
def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> List[Check]:
    """Turn SQL quality definitions into Soda checks.

    Only entries whose ``type`` is ``"sql"`` are converted. An entry is skipped
    with a warning when it has no query or no usable threshold.

    Args:
        model_name: Dataset name the checks are attached to.
        field_name: Column the quality definitions belong to, or ``None`` when
            they are defined at model level.
        quality_list: Quality definitions to convert.

    Returns:
        One ``Check`` per convertible quality definition, in input order.
    """
    checks: List[Check] = []

    count = 0
    for quality in quality_list:
        if quality.type == "sql":
            # No field name means the definition lives on the model itself.
            # (The two type labels used to be swapped.)
            if field_name is None:
                check_key = f"{model_name}__quality_sql_{count}"
                check_type = "model_quality_sql"
            else:
                check_key = f"{model_name}__{field_name}__quality_sql_{count}"
                check_type = "field_quality_sql"
            threshold = to_sodacl_threshold(quality)
            query = prepare_query(quality, model_name, field_name)
            if query is None:
                logger.warning(f"Quality check {check_key} has no query")
                continue
            if threshold is None:
                logger.warning(f"Quality check {check_key} has no valid threshold")
                continue
            # The check key doubles as the name of the user-defined SodaCL metric.
            sodacl_check_dict = {
                f"checks for {model_name}": [
                    {
                        f"{check_key} {threshold}": {
                            f"{check_key} query": query,
                            "name": check_key,
                        },
                    }
                ]
            }
            checks.append(
                Check(
                    id=str(uuid.uuid4()),
                    key=check_key,
                    category="quality",
                    type=check_type,
                    name=quality.description if quality.description is not None else "Quality Check",
                    model=model_name,
                    field=field_name,
                    engine="soda",
                    language="sodacl",
                    implementation=yaml.dump(sodacl_check_dict),
                )
            )
        # NOTE(review): entries skipped via `continue` above do not advance the
        # counter, so the numeric suffix counts emitted/non-SQL entries only.
        count += 1

    return checks
494
+
495
+
496
def prepare_query(quality: Quality, model_name: str, field_name: str = None) -> str | None:
    """Fill the placeholders of a quality definition's SQL query.

    ``{model}`` and ``{table}`` are replaced by ``model_name``; ``{field}`` and
    ``{column}`` are replaced by ``field_name`` when one is given.

    Returns:
        The query with placeholders substituted, or ``None`` when the quality
        definition has no query or an empty one.
    """
    template = quality.query
    if template is None or template == "":
        return None

    # Insertion order matters: replacements are applied one after another.
    substitutions = {"{model}": model_name, "{table}": model_name}
    if field_name is not None:
        substitutions["{field}"] = field_name
        substitutions["{column}"] = field_name

    for placeholder, replacement in substitutions.items():
        template = template.replace(placeholder, replacement)
    return template
512
+
513
+
514
def to_sodacl_threshold(quality: Quality) -> str | None:
    """Render the threshold of a quality definition as a SodaCL comparison.

    The first threshold attribute that is set wins, checked in this order:
    ``mustBe``, ``mustNotBe``, ``mustBeGreaterThan``,
    ``mustBeGreaterThanOrEqualTo``, ``mustBeLessThan``,
    ``mustBeLessThanOrEqualTo``, ``mustBeBetween``, ``mustNotBeBetween``.

    Returns:
        The comparison text (e.g. ``"> 5"`` or ``"between 1 and 10"``), or
        ``None`` when no threshold is set or a range does not have exactly two
        entries (a warning is logged in the latter case).
    """
    single_value_thresholds = (
        ("=", "mustBe"),
        ("!=", "mustNotBe"),
        (">", "mustBeGreaterThan"),
        (">=", "mustBeGreaterThanOrEqualTo"),
        ("<", "mustBeLessThan"),
        ("<=", "mustBeLessThanOrEqualTo"),
    )
    for operator, attribute in single_value_thresholds:
        bound = getattr(quality, attribute)
        if bound is not None:
            return f"{operator} {bound}"

    range_thresholds = (
        ("between", "mustBeBetween"),
        ("not between", "mustNotBeBetween"),
    )
    for keyword, attribute in range_thresholds:
        bounds = getattr(quality, attribute)
        if bounds is None:
            continue
        if len(bounds) != 2:
            logger.warning(
                f"Quality check has invalid {attribute}, must have exactly 2 integers in an array: {bounds}"
            )
            return None
        return f"{keyword} {bounds[0]} and {bounds[1]}"

    return None
542
+
543
+
544
def to_servicelevel_checks(data_contract_spec: DataContractSpecification) -> List[Check]:
    """Build the checks derived from the contract's service levels.

    Returns:
        The freshness and retention checks (in that order) that could be
        built; an empty list when the contract defines no service levels.
    """
    service_levels = data_contract_spec.servicelevels
    if service_levels is None:
        return []

    candidates: List[Check] = []
    if service_levels.freshness is not None:
        candidates.append(to_servicelevel_freshness_check(data_contract_spec))
    if service_levels.retention is not None:
        candidates.append(to_servicelevel_retention_check(data_contract_spec))
    # The builders return None when a check cannot be derived.
    return [candidate for candidate in candidates if candidate is not None]
554
+
555
+
556
def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecification) -> Check | None:
    """Build the Soda freshness check from ``servicelevels.freshness``.

    The timestamp field must be given as ``<model>.<field>`` and the model must
    exist in the contract. The threshold is normalized by stripping ``P`` and
    ``T`` and lower-casing it (``PT1H`` becomes ``1h``).

    Returns:
        The freshness ``Check``, or ``None`` when the timestamp field or
        threshold is missing or unsupported (the reason is logged, except for
        a missing timestamp field).
    """
    freshness = data_contract_spec.servicelevels.freshness
    qualified_field = freshness.timestampField
    if qualified_field is None:
        return None

    raw_threshold = freshness.threshold
    if raw_threshold is None:
        logger.info("servicelevel.freshness.threshold is not defined")
        return None
    # Require at least one day/hour/minute unit letter, in either case.
    if not any(unit in raw_threshold for unit in "dDhHmM"):
        logger.info("servicelevel.freshness.threshold must be in days, hours, or minutes (e.g., PT1H, or 1h)")
        return None

    dot_count = qualified_field.count(".")
    if dot_count == 0:
        logger.info("servicelevel.freshness.timestampField is not fully qualified, skipping freshness check")
        return None
    if dot_count > 1:
        logger.info(
            "servicelevel.freshness.timestampField contains multiple dots, which is currently not supported, skipping freshness check"
        )
        return None

    model_name, field_name = qualified_field.split(".")
    soda_threshold = raw_threshold.replace("P", "").replace("T", "").lower()
    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping freshness check")
        return None

    kind = "servicelevel_freshness"
    key = "servicelevel_freshness"
    freshness_rule = {f"freshness({field_name}) < {soda_threshold}": {"name": key}}
    sodacl = {checks_for(model_name, False): [freshness_rule]}
    return Check(
        id=str(uuid.uuid4()),
        key=key,
        category="servicelevel",
        type=kind,
        name="Freshness",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
616
+
617
+
618
def to_servicelevel_retention_check(data_contract_spec) -> Check | None:
    """Build the Soda retention check from ``servicelevels.retention``.

    The check compares the age in seconds of the oldest value of the timestamp
    field against the retention period. The timestamp field must be given as
    ``<model>.<field>`` and the model must exist in the contract.

    Returns:
        The retention ``Check``, or ``None`` when retention is undefined or
        unlimited, a required setting is missing or unsupported, or the period
        cannot be converted to seconds (the reason is logged in most cases).
    """
    retention = data_contract_spec.servicelevels.retention
    if retention is None:
        return None
    if retention.unlimited is True:
        return None
    if retention.timestampField is None:
        logger.info("servicelevel.retention.timestampField is not defined")
        return None
    if retention.period is None:
        logger.info("servicelevel.retention.period is not defined")
        return None
    timestamp_field_fully_qualified = retention.timestampField
    if "." not in timestamp_field_fully_qualified:
        logger.info("servicelevel.retention.timestampField is not fully qualified, skipping retention check")
        return None
    if timestamp_field_fully_qualified.count(".") > 1:
        logger.info(
            "servicelevel.retention.timestampField contains multiple dots, which is currently not supported, skipping retention check"
        )
        return None

    model_name, field_name = timestamp_field_fully_qualified.split(".")
    period = retention.period
    period_in_seconds = period_to_seconds(period)
    if period_in_seconds is None:
        # Without this guard the literal text "None" would end up in the SodaCL threshold.
        logger.info(f"servicelevel.retention.period '{period}' could not be parsed, skipping retention check")
        return None
    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping retention check")
        return None
    check_type = "servicelevel_retention"
    check_key = "servicelevel_retention"
    # NOTE(review): the metric name is hard-coded with an "orders_" prefix
    # regardless of the model; kept unchanged here to avoid altering output.
    sodacl_check_dict = {
        checks_for(model_name, False): [
            {
                f"orders_servicelevel_retention < {period_in_seconds}": {
                    "orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
                    "name": check_key,
                }
            },
        ]
    }
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="servicelevel",
        type=check_type,
        name=f"Retention: Oldest entry has a max age of {period}",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl_check_dict),
    )
669
+
670
+
671
+ def period_to_seconds(period: str) -> int | None:
672
+ import re
673
+
674
+ # if period is None:
675
+ # return None
676
+ # if period is in form "30d" or "24h" or "60m"
677
+ if re.match(r"^\d+[dhm]$", period):
678
+ if period[-1] == "d":
679
+ return int(period[:-1]) * 86400
680
+ if period[-1] == "h":
681
+ return int(period[:-1]) * 3600
682
+ if period[-1] == "m":
683
+ return int(period[:-1]) * 60
684
+ # if it is in iso period format (do not use isodate, can also be years)
685
+ iso_period_regex = re.compile(
686
+ r"P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?"
687
+ r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?"
688
+ )
689
+ match = iso_period_regex.match(period)
690
+ if match:
691
+ years = int(match.group("years") or 0)
692
+ months = int(match.group("months") or 0)
693
+ days = int(match.group("days") or 0)
694
+ hours = int(match.group("hours") or 0)
695
+ minutes = int(match.group("minutes") or 0)
696
+ seconds = int(match.group("seconds") or 0)
697
+
698
+ # Convert everything to seconds
699
+ total_seconds = (
700
+ years * 365 * 86400 # Approximate conversion of years to seconds
701
+ + months * 30 * 86400 # Approximate conversion of months to seconds
702
+ + days * 86400
703
+ + hours * 3600
704
+ + minutes * 60
705
+ + seconds
706
+ )
707
+ return total_seconds
708
+
709
+ return None
710
+
711
+
712
+ # These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
713
def to_quality_check(data_contract_spec) -> Check | None:
    """Wrap the root-level SodaCL quality specification in a single check.

    Returns:
        A ``Check`` carrying the YAML-serialized specification, or ``None``
        when the contract has no root-level quality section or its type is not
        ``sodacl`` (compared case-insensitively).
    """
    quality = data_contract_spec.quality
    if quality is None or quality.type is None or quality.type.lower() != "sodacl":
        return None

    specification = quality.specification
    if isinstance(specification, str):
        # A string specification is parsed so it is re-serialized uniformly below.
        specification = yaml.safe_load(specification)

    return Check(
        id=str(uuid.uuid4()),
        key="quality__sodacl",
        category="quality",
        type="quality",
        name="Quality Check",
        model=None,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(specification),
    )