datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +260 -0
  3. datacontract/breaking/breaking.py +242 -12
  4. datacontract/breaking/breaking_rules.py +37 -1
  5. datacontract/catalog/catalog.py +80 -0
  6. datacontract/cli.py +387 -117
  7. datacontract/data_contract.py +216 -353
  8. datacontract/engines/data_contract_checks.py +1041 -0
  9. datacontract/engines/data_contract_test.py +113 -0
  10. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
  11. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  12. datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
  13. datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
  14. datacontract/engines/soda/check_soda_execute.py +100 -56
  15. datacontract/engines/soda/connections/athena.py +79 -0
  16. datacontract/engines/soda/connections/bigquery.py +8 -1
  17. datacontract/engines/soda/connections/databricks.py +12 -3
  18. datacontract/engines/soda/connections/duckdb_connection.py +241 -0
  19. datacontract/engines/soda/connections/kafka.py +206 -113
  20. datacontract/engines/soda/connections/snowflake.py +8 -5
  21. datacontract/engines/soda/connections/sqlserver.py +43 -0
  22. datacontract/engines/soda/connections/trino.py +26 -0
  23. datacontract/export/avro_converter.py +72 -8
  24. datacontract/export/avro_idl_converter.py +31 -25
  25. datacontract/export/bigquery_converter.py +130 -0
  26. datacontract/export/custom_converter.py +40 -0
  27. datacontract/export/data_caterer_converter.py +161 -0
  28. datacontract/export/dbml_converter.py +148 -0
  29. datacontract/export/dbt_converter.py +141 -54
  30. datacontract/export/dcs_exporter.py +6 -0
  31. datacontract/export/dqx_converter.py +126 -0
  32. datacontract/export/duckdb_type_converter.py +57 -0
  33. datacontract/export/excel_exporter.py +923 -0
  34. datacontract/export/exporter.py +100 -0
  35. datacontract/export/exporter_factory.py +216 -0
  36. datacontract/export/go_converter.py +105 -0
  37. datacontract/export/great_expectations_converter.py +257 -36
  38. datacontract/export/html_exporter.py +86 -0
  39. datacontract/export/iceberg_converter.py +188 -0
  40. datacontract/export/jsonschema_converter.py +71 -16
  41. datacontract/export/markdown_converter.py +337 -0
  42. datacontract/export/mermaid_exporter.py +110 -0
  43. datacontract/export/odcs_v3_exporter.py +375 -0
  44. datacontract/export/pandas_type_converter.py +40 -0
  45. datacontract/export/protobuf_converter.py +168 -68
  46. datacontract/export/pydantic_converter.py +6 -0
  47. datacontract/export/rdf_converter.py +13 -6
  48. datacontract/export/sodacl_converter.py +36 -188
  49. datacontract/export/spark_converter.py +245 -0
  50. datacontract/export/sql_converter.py +37 -3
  51. datacontract/export/sql_type_converter.py +269 -8
  52. datacontract/export/sqlalchemy_converter.py +170 -0
  53. datacontract/export/terraform_converter.py +7 -2
  54. datacontract/imports/avro_importer.py +246 -26
  55. datacontract/imports/bigquery_importer.py +221 -0
  56. datacontract/imports/csv_importer.py +143 -0
  57. datacontract/imports/dbml_importer.py +112 -0
  58. datacontract/imports/dbt_importer.py +240 -0
  59. datacontract/imports/excel_importer.py +1111 -0
  60. datacontract/imports/glue_importer.py +288 -0
  61. datacontract/imports/iceberg_importer.py +172 -0
  62. datacontract/imports/importer.py +51 -0
  63. datacontract/imports/importer_factory.py +128 -0
  64. datacontract/imports/json_importer.py +325 -0
  65. datacontract/imports/jsonschema_importer.py +146 -0
  66. datacontract/imports/odcs_importer.py +60 -0
  67. datacontract/imports/odcs_v3_importer.py +516 -0
  68. datacontract/imports/parquet_importer.py +81 -0
  69. datacontract/imports/protobuf_importer.py +264 -0
  70. datacontract/imports/spark_importer.py +262 -0
  71. datacontract/imports/sql_importer.py +274 -35
  72. datacontract/imports/unity_importer.py +219 -0
  73. datacontract/init/init_template.py +20 -0
  74. datacontract/integration/datamesh_manager.py +86 -0
  75. datacontract/lint/resolve.py +271 -49
  76. datacontract/lint/resources.py +21 -0
  77. datacontract/lint/schema.py +53 -17
  78. datacontract/lint/urls.py +32 -12
  79. datacontract/model/data_contract_specification/__init__.py +1 -0
  80. datacontract/model/exceptions.py +4 -1
  81. datacontract/model/odcs.py +24 -0
  82. datacontract/model/run.py +49 -29
  83. datacontract/output/__init__.py +0 -0
  84. datacontract/output/junit_test_results.py +135 -0
  85. datacontract/output/output_format.py +10 -0
  86. datacontract/output/test_results_writer.py +79 -0
  87. datacontract/py.typed +0 -0
  88. datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
  89. datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
  90. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  91. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  92. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  93. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  94. datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
  95. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  96. datacontract/templates/datacontract.html +139 -294
  97. datacontract/templates/datacontract_odcs.html +685 -0
  98. datacontract/templates/index.html +236 -0
  99. datacontract/templates/partials/datacontract_information.html +86 -0
  100. datacontract/templates/partials/datacontract_servicelevels.html +253 -0
  101. datacontract/templates/partials/datacontract_terms.html +51 -0
  102. datacontract/templates/partials/definition.html +25 -0
  103. datacontract/templates/partials/example.html +27 -0
  104. datacontract/templates/partials/model_field.html +144 -0
  105. datacontract/templates/partials/quality.html +49 -0
  106. datacontract/templates/partials/server.html +211 -0
  107. datacontract/templates/style/output.css +491 -72
  108. datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
  109. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  110. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  111. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  112. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
  113. datacontract/engines/soda/connections/dask.py +0 -28
  114. datacontract/engines/soda/connections/duckdb.py +0 -76
  115. datacontract/export/csv_type_converter.py +0 -36
  116. datacontract/export/html_export.py +0 -66
  117. datacontract/export/odcs_converter.py +0 -102
  118. datacontract/init/download_datacontract_file.py +0 -17
  119. datacontract/integration/publish_datamesh_manager.py +0 -33
  120. datacontract/integration/publish_opentelemetry.py +0 -107
  121. datacontract/lint/lint.py +0 -141
  122. datacontract/lint/linters/description_linter.py +0 -34
  123. datacontract/lint/linters/example_model_linter.py +0 -91
  124. datacontract/lint/linters/field_pattern_linter.py +0 -34
  125. datacontract/lint/linters/field_reference_linter.py +0 -38
  126. datacontract/lint/linters/notice_period_linter.py +0 -55
  127. datacontract/lint/linters/quality_schema_linter.py +0 -52
  128. datacontract/lint/linters/valid_constraints_linter.py +0 -99
  129. datacontract/model/data_contract_specification.py +0 -141
  130. datacontract/web.py +0 -14
  131. datacontract_cli-0.10.0.dist-info/METADATA +0 -951
  132. datacontract_cli-0.10.0.dist-info/RECORD +0 -66
  133. /datacontract/{model → breaking}/breaking_change.py +0 -0
  134. /datacontract/{lint/linters → export}/__init__.py +0 -0
  135. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  136. {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1041 @@
1
+ import re
2
+ import uuid
3
+ from dataclasses import dataclass
4
+ from typing import List
5
+ from venv import logger
6
+
7
+ import yaml
8
+
9
+ from datacontract.export.sql_type_converter import convert_to_sql_type
10
+ from datacontract.model.data_contract_specification import DataContractSpecification, Quality, Server
11
+ from datacontract.model.run import Check
12
+
13
+
14
@dataclass
class QuotingConfig:
    """Flags controlling how identifiers are quoted in generated SodaCL checks."""

    # Wrap the field name in double quotes inside metric expressions,
    # e.g. missing_count("my_field").
    quote_field_name: bool = False
    # Wrap the model name in double quotes in the "checks for ..." header.
    quote_model_name: bool = False
    # Wrap the model name in backticks in the "checks for ..." header
    # (checks_for skips this for field_is_present / field_type checks).
    quote_model_name_with_backticks: bool = False
19
+
20
+
21
def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
    """Collect every check derived from a data contract.

    Gathers the per-model checks, then the service-level checks, then the
    contract-level quality check, and finally drops any ``None`` entries.
    """
    collected: List[Check] = []
    for name, model in data_contract_spec.models.items():
        collected += to_model_checks(name, model, server)
    collected += to_servicelevel_checks(data_contract_spec)
    collected.append(to_quality_check(data_contract_spec))
    # Individual builders may yield None; those are filtered out here.
    return list(filter(lambda candidate: candidate is not None, collected))
29
+
30
+
31
def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
    """Build all schema and quality checks for a single model.

    Args:
        model_key: Key of the model inside the data contract.
        model_value: Model definition; its ``fields``, ``quality`` and
            ``config`` attributes are read.
        server: Server the checks will run against, or ``None``. Its ``type``
            selects the table name override and the quoting rules; its
            ``format`` decides whether column type checks are generated.

    Returns:
        The list of checks for every field of the model followed by the
        model-level quality checks.
    """
    checks: List[Check] = []
    server_type = server.type if server and server.type else None
    model_name = to_model_name(model_key, model_value, server_type)
    fields = model_value.fields

    check_types = is_check_types(server)

    quote_with_double_quotes = server_type in ["postgres", "sqlserver"]
    quoting_config = QuotingConfig(
        quote_field_name=quote_with_double_quotes,
        quote_model_name=quote_with_double_quotes,
        quote_model_name_with_backticks=server_type == "bigquery",
    )

    for field_name, field in fields.items():
        checks.append(check_field_is_present(model_name, field_name, quoting_config))
        if check_types and field.type is not None:
            sql_type: str = convert_to_sql_type(field, server_type)
            checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
        if field.required:
            checks.append(check_field_required(model_name, field_name, quoting_config))
        if field.unique:
            checks.append(check_field_unique(model_name, field_name, quoting_config))
        if field.minLength is not None:
            checks.append(check_field_min_length(model_name, field_name, field.minLength, quoting_config))
        if field.maxLength is not None:
            checks.append(check_field_max_length(model_name, field_name, field.maxLength, quoting_config))
        if field.minimum is not None:
            checks.append(check_field_minimum(model_name, field_name, field.minimum, quoting_config))
        if field.maximum is not None:
            checks.append(check_field_maximum(model_name, field_name, field.maximum, quoting_config))
        if field.exclusiveMinimum is not None:
            # Exclusive bound = inclusive bound check plus a "not equal to the bound" check.
            checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum, quoting_config))
            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum, quoting_config))
        if field.exclusiveMaximum is not None:
            checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum, quoting_config))
            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum, quoting_config))
        if field.pattern is not None:
            checks.append(check_field_regex(model_name, field_name, field.pattern, quoting_config))
        if field.enum is not None and len(field.enum) > 0:
            checks.append(check_field_enum(model_name, field_name, field.enum, quoting_config))
        if field.quality is not None and len(field.quality) > 0:
            quality_list = check_quality_list(model_name, field_name, field.quality, quoting_config)
            if (quality_list is not None) and len(quality_list) > 0:
                checks.extend(quality_list)
        # TODO references: str = None
        # TODO format

    if model_value.quality is not None and len(model_value.quality) > 0:
        # Pass the quoting config so model-level checks quote the model name
        # the same way as the field-level checks above.
        quality_list = check_quality_list(model_name, None, model_value.quality, quoting_config)
        if (quality_list is not None) and len(quality_list) > 0:
            checks.extend(quality_list)

    return checks
87
+
88
+
89
def checks_for(model_name: str, quoting_config: QuotingConfig, check_type: str) -> str:
    """Return the ``checks for <model>`` header used as the top-level SodaCL key.

    Double quotes take precedence over backticks. Backticks are not applied
    for the ``field_is_present`` and ``field_type`` check types.
    """
    if quoting_config.quote_model_name:
        dataset = f'"{model_name}"'
    elif not quoting_config.quote_model_name_with_backticks or check_type in ("field_is_present", "field_type"):
        dataset = f"{model_name}"
    else:
        dataset = f"`{model_name}`"
    return f"checks for {dataset}"
95
+
96
+
97
def is_check_types(server: Server) -> bool:
    """Tell whether column type checks should be generated for ``server``.

    Returns ``True`` when no server is given; otherwise ``False`` only for
    the ``json``, ``csv`` and ``avro`` formats.
    """
    if server is None:
        return True
    formats_without_type_checks = ("json", "csv", "avro")
    return server.format not in formats_without_type_checks
101
+
102
+
103
def to_model_name(model_key, model_value, server_type):
    """Resolve the table name to use for a model on the given server type.

    For databricks, snowflake, sqlserver and postgres/postgresql a
    server-specific entry in ``model_value.config`` overrides the model key.
    In every other case the model key itself is returned.
    """
    override_keys = {
        "databricks": "databricksTable",
        "snowflake": "snowflakeTable",
        "sqlserver": "sqlserverTable",
        "postgres": "postgresTable",
        "postgresql": "postgresTable",
    }
    config_key = override_keys.get(server_type)
    if config_key is None:
        return model_key

    model_config = model_value.config
    if model_config is not None and config_key in model_config:
        return model_config[config_key]
    return model_key
117
+
118
+
119
def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check:
    """Create a SodaCL schema check that fails when column ``field_name`` is missing."""
    check_type = "field_is_present"
    check_key = f"{model_name}__{field_name}__{check_type}"
    schema_rule = {
        "name": check_key,
        "fail": {
            "when required column missing": [field_name],
        },
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field '{field_name}' is present",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
146
+
147
+
148
def check_field_type(
    model_name: str, field_name: str, expected_type: str, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL schema check that fails when ``field_name`` does not have ``expected_type``."""
    check_type = "field_type"
    check_key = f"{model_name}__{field_name}__{check_type}"
    schema_rule = {
        "name": check_key,
        "fail": {
            "when wrong column type": {field_name: expected_type},
        },
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{"schema": schema_rule}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has type {expected_type}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
179
+
180
+
181
def check_field_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
    """Create a SodaCL check requiring ``missing_count`` of ``field_name`` to be 0."""
    # The quoted form is only used inside the metric expression; key and name keep the raw field name.
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_required"
    check_key = f"{model_name}__{field_name}__{check_type}"
    metric = f"missing_count({soda_field}) = 0"
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has no missing values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
210
+
211
+
212
def check_field_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
    """Create a SodaCL check requiring ``duplicate_count`` of ``field_name`` to be 0."""
    # The quoted form is only used inside the metric expression; key and name keep the raw field name.
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_unique"
    check_key = f"{model_name}__{field_name}__{check_type}"
    metric = f"duplicate_count({soda_field}) = 0"
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that unique field {field_name} has no duplicate values",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
241
+
242
+
243
def check_field_min_length(
    model_name: str, field_name: str, min_length: int, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``valid min length`` set to ``min_length``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_min_length"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid min length": min_length,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has a min length of {min_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
275
+
276
+
277
def check_field_max_length(
    model_name: str, field_name: str, max_length: int, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``valid max length`` set to ``max_length``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_max_length"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid max length": max_length,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has a max length of {max_length}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
309
+
310
+
311
def check_field_minimum(
    model_name: str, field_name: str, minimum: int, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``valid min`` set to ``minimum``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_minimum"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid min": minimum,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has a minimum of {minimum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
343
+
344
+
345
def check_field_maximum(
    model_name: str, field_name: str, maximum: int, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``valid max`` set to ``maximum``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_maximum"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid max": maximum,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} has a maximum of {maximum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
377
+
378
+
379
def check_field_not_equal(
    model_name: str, field_name: str, value: int, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``value`` listed as the only invalid value."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_not_equal"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "invalid values": [value],
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} is not equal to {value}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
411
+
412
+
413
def check_field_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``enum`` as the list of valid values."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_enum"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid values": enum,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} only contains enum values {enum}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
443
+
444
+
445
def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()):
    """Create a SodaCL check requiring ``invalid_count`` to be 0 with ``pattern`` as the valid regex."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_regex"
    check_key = f"{model_name}__{field_name}__{check_type}"
    settings = {
        "name": check_key,
        "valid regex": pattern,
    }
    header = checks_for(model_name, quoting_config, check_type)
    sodacl = {header: [{f"invalid_count({soda_field}) = 0": settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that field {field_name} matches regex pattern {pattern}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
475
+
476
+
477
def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):
    """Create a model-level SodaCL check comparing ``row_count`` against ``threshold``.

    ``threshold`` is appended verbatim after the metric name.
    """
    check_type = "row_count"
    check_key = f"{model_name}__{check_type}"
    header = checks_for(model_name, quoting_config, check_type)
    metric = f"row_count {threshold}"
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="schema",
        type=check_type,
        name=f"Check that model {model_name} has row_count {threshold}",
        model=model_name,
        field=None,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
499
+
500
+
501
def check_model_duplicate_values(
    model_name: str, cols: list[str], threshold: str, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a model-level SodaCL check comparing ``duplicate_count`` over ``cols`` against ``threshold``.

    The column names are joined with ``", "`` and used unquoted.
    """
    check_type = "model_duplicate_values"
    check_key = f"{model_name}__{check_type}"
    col_joined = ", ".join(cols)
    header = checks_for(model_name, quoting_config, check_type)
    metric = f"duplicate_count({col_joined}) {threshold}"
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="quality",
        type=check_type,
        name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}",
        model=model_name,
        field=None,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
526
+
527
+
528
def check_field_duplicate_values(
    model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a field-level SodaCL check comparing ``duplicate_count`` of ``field_name`` against ``threshold``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_duplicate_values"
    check_key = f"{model_name}__{field_name}__{check_type}"
    header = checks_for(model_name, quoting_config, check_type)
    metric = f"duplicate_count({soda_field}) {threshold}"
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="quality",
        type=check_type,
        name=f"Check that field {field_name} has duplicate_count {threshold}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
559
+
560
+
561
def check_field_null_values(
    model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
):
    """Create a field-level SodaCL check comparing ``missing_count`` of ``field_name`` against ``threshold``."""
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_null_values"
    check_key = f"{model_name}__{field_name}__{check_type}"
    header = checks_for(model_name, quoting_config, check_type)
    metric = f"missing_count({soda_field}) {threshold}"
    sodacl = {header: [{metric: {"name": check_key}}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="quality",
        type=check_type,
        name=f"Check that field {field_name} has missing_count {threshold}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
592
+
593
+
594
def check_field_invalid_values(
    model_name: str,
    field_name: str,
    threshold: str,
    valid_values: list = None,
    quoting_config: QuotingConfig = QuotingConfig(),
):
    """Create a field-level SodaCL check comparing ``invalid_count`` of ``field_name`` against ``threshold``.

    When ``valid_values`` is given (even if empty) it is emitted as the
    ``valid values`` setting of the check.
    """
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_invalid_values"
    check_key = f"{model_name}__{field_name}__{check_type}"

    settings = {"name": check_key}
    if valid_values is not None:
        settings["valid values"] = valid_values

    header = checks_for(model_name, quoting_config, check_type)
    metric = f"invalid_count({soda_field}) {threshold}"
    sodacl = {header: [{metric: settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="quality",
        type=check_type,
        name=f"Check that field {field_name} has invalid_count {threshold}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
635
+
636
+
637
def check_field_missing_values(
    model_name: str,
    field_name: str,
    threshold: str,
    missing_values: list = None,
    quoting_config: QuotingConfig = QuotingConfig(),
):
    """Create a field-level SodaCL check comparing ``missing_count`` of ``field_name`` against ``threshold``.

    Non-``None`` entries of ``missing_values`` are emitted as the
    ``missing values`` setting; the setting is omitted when none remain.
    """
    soda_field = f'"{field_name}"' if quoting_config.quote_field_name else field_name

    check_type = "field_missing_values"
    check_key = f"{model_name}__{field_name}__{check_type}"

    settings = {"name": check_key}
    # Filter out null/None values as SodaCL handles these automatically
    explicit_missing = [] if missing_values is None else [v for v in missing_values if v is not None]
    if explicit_missing:
        settings["missing values"] = explicit_missing

    header = checks_for(model_name, quoting_config, check_type)
    metric = f"missing_count({soda_field}) {threshold}"
    sodacl = {header: [{metric: settings}]}
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="quality",
        type=check_type,
        name=f"Check that field {field_name} has missing_count {threshold}",
        model=model_name,
        field=field_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl),
    )
681
+
682
+
683
def check_quality_list(
    model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig()
) -> List[Check]:
    """Translate a list of quality definitions into SodaCL checks.

    Args:
        model_name: Name of the model (table) the checks run against.
        field_name: Field the quality entries belong to, or ``None`` when the
            entries are defined at model level.
        quality_list: Quality entries to translate. Entries of type ``sql``
            and entries with a ``metric`` are handled; other entries are
            ignored.
        quoting_config: Identifier quoting rules for the target server.

    Returns:
        The checks that could be created. Entries without a query or valid
        threshold, unsupported metrics, and metrics used at an unsupported
        level are logged as warnings and skipped.
    """
    checks: List[Check] = []

    # Index of the quality entry; used to keep SQL check keys unique.
    count = 0
    for quality in quality_list:
        if quality.type == "sql":
            if field_name is None:
                check_key = f"{model_name}__quality_sql_{count}"
                check_type = "model_quality_sql"
            else:
                check_key = f"{model_name}__{field_name}__quality_sql_{count}"
                check_type = "field_quality_sql"
            threshold = to_sodacl_threshold(quality)
            query = prepare_query(quality, model_name, field_name, quoting_config)
            if query is None:
                logger.warning(f"Quality check {check_key} has no query")
                continue
            if threshold is None:
                logger.warning(f"Quality check {check_key} has no valid threshold")
                continue

            if quoting_config.quote_model_name:
                model_name_for_soda = f'"{model_name}"'
            else:
                model_name_for_soda = model_name
            sodacl_check_dict = {
                f"checks for {model_name_for_soda}": [
                    {
                        f"{check_key} {threshold}": {
                            f"{check_key} query": query,
                            "name": check_key,
                        },
                    }
                ]
            }
            checks.append(
                Check(
                    id=str(uuid.uuid4()),
                    key=check_key,
                    category="quality",
                    type=check_type,
                    name=quality.description if quality.description is not None else "Quality Check",
                    model=model_name,
                    field=field_name,
                    engine="soda",
                    language="sodacl",
                    implementation=yaml.dump(sodacl_check_dict),
                )
            )
        elif quality.metric is not None:
            threshold = to_sodacl_threshold(quality)

            if threshold is None:
                logger.warning(f"Quality metric {quality.metric} has no valid threshold")
                continue

            if quality.metric == "rowCount":
                checks.append(check_row_count(model_name, threshold, quoting_config))
            elif quality.metric == "duplicateValues":
                if field_name is None:
                    # Model-level duplicate checks need an explicit, non-empty list of column names.
                    properties = quality.arguments.get("properties") if quality.arguments else None
                    has_valid_properties = (
                        isinstance(properties, (list, tuple))
                        and len(properties) > 0
                        and all(isinstance(prop, str) for prop in properties)
                    )
                    if has_valid_properties:
                        checks.append(
                            check_model_duplicate_values(model_name, properties, threshold, quoting_config)
                        )
                    else:
                        logger.warning(
                            "Quality check duplicateValues at model level requires a non-empty list of "
                            "property names in arguments.properties"
                        )
                else:
                    checks.append(check_field_duplicate_values(model_name, field_name, threshold, quoting_config))
            elif quality.metric == "nullValues":
                if field_name is not None:
                    checks.append(check_field_null_values(model_name, field_name, threshold, quoting_config))
                else:
                    logger.warning("Quality check nullValues is only supported at field level")
            elif quality.metric == "invalidValues":
                if field_name is not None:
                    valid_values = quality.arguments.get("validValues") if quality.arguments else None
                    checks.append(
                        check_field_invalid_values(model_name, field_name, threshold, valid_values, quoting_config)
                    )
                else:
                    logger.warning("Quality check invalidValues is only supported at field level")
            elif quality.metric == "missingValues":
                if field_name is not None:
                    missing_values = quality.arguments.get("missingValues") if quality.arguments else None
                    checks.append(
                        check_field_missing_values(model_name, field_name, threshold, missing_values, quoting_config)
                    )
                else:
                    logger.warning("Quality check missingValues is only supported at field level")
            else:
                logger.warning(f"Quality check {quality.metric} is not yet supported")

        count += 1

    return checks
780
+
781
+
782
def prepare_query(
    quality: Quality,
    model_name: str,
    field_name: str | None = None,
    quoting_config: QuotingConfig = QuotingConfig(),
) -> str | None:
    """Fill the name placeholders of a SQL quality query.

    ``{model}``, ``{schema}`` and ``{table}`` are replaced by the model name.
    When ``field_name`` is given, ``{field}``, ``{column}`` and ``{property}``
    are replaced by the field name. A single or double quote directly around
    a placeholder is consumed together with it.

    Args:
        quality: Quality definition whose ``query`` is prepared.
        model_name: Name substituted for the model placeholders.
        field_name: Name substituted for the field placeholders, or ``None``
            to leave them untouched.
        quoting_config: Selects double-quote or backtick quoting for the
            model name and double-quote quoting for the field name.

    Returns:
        The query with placeholders substituted, or ``None`` when the quality
        definition has no query (``None`` or an empty string).
    """
    query = quality.query
    if query is None or query == "":
        return None

    if quoting_config.quote_model_name:
        model_name_for_soda = f'"{model_name}"'
    elif quoting_config.quote_model_name_with_backticks:
        model_name_for_soda = f"`{model_name}`"
    else:
        model_name_for_soda = model_name

    # Insertion order is the substitution order.
    replacements = {
        "model": model_name_for_soda,
        "schema": model_name_for_soda,
        "table": model_name_for_soda,
    }
    if field_name is not None:
        field_name_for_soda = f'"{field_name}"' if quoting_config.quote_field_name else field_name
        replacements["field"] = field_name_for_soda
        replacements["column"] = field_name_for_soda
        replacements["property"] = field_name_for_soda

    for placeholder, replacement in replacements.items():
        pattern = r'["\']?\{' + placeholder + r'\}["\']?'
        # A callable replacement inserts the name literally. Passing the name as a
        # template string would make re.sub interpret backslashes and group
        # references (e.g. "\1") that happen to occur in a model or field name.
        query = re.sub(pattern, lambda _match, text=replacement: text, query)

    return query
814
+
815
+
816
def to_sodacl_threshold(quality: Quality) -> str | None:
    """Render the threshold of a quality definition as a SodaCL condition.

    The threshold attributes are inspected in a fixed order and the first one
    that is set wins. Returns ``None`` when no threshold is set or when a
    range threshold does not consist of exactly two values.
    """
    # Scalar thresholds in order of precedence, each with its SodaCL operator.
    scalar_thresholds = (
        ("mustBe", "="),
        ("mustNotBe", "!="),
        ("mustBeGreaterThan", ">"),
        ("mustBeGreaterOrEqualTo", ">="),
        ("mustBeGreaterThanOrEqualTo", ">="),
        ("mustBeLessThan", "<"),
        ("mustBeLessOrEqualTo", "<="),
        ("mustBeLessThanOrEqualTo", "<="),
    )
    for attribute, operator in scalar_thresholds:
        bound = getattr(quality, attribute)
        if bound is not None:
            return f"{operator} {bound}"

    inclusive_range = quality.mustBeBetween
    if inclusive_range is not None:
        if len(inclusive_range) == 2:
            return f"between {inclusive_range[0]} and {inclusive_range[1]}"
        logger.warning(
            f"Quality check has invalid mustBeBetween, must have exactly 2 integers in an array: {inclusive_range}"
        )
        return None

    exclusive_range = quality.mustNotBeBetween
    if exclusive_range is not None:
        if len(exclusive_range) == 2:
            return f"not between {exclusive_range[0]} and {exclusive_range[1]}"
        logger.warning(
            f"Quality check has invalid mustNotBeBetween, must have exactly 2 integers in an array: {exclusive_range}"
        )
        return None

    return None
848
+
849
+
850
def to_servicelevel_checks(data_contract_spec: DataContractSpecification) -> List[Check]:
    """Collect the checks derived from the contract's service levels.

    A freshness and a retention check are attempted when the respective
    service level is declared; attempts that yield ``None`` are dropped.
    Returns an empty list when the contract declares no service levels.
    """
    servicelevels = data_contract_spec.servicelevels
    if servicelevels is None:
        return []

    candidates: List[Check] = []
    if servicelevels.freshness is not None:
        freshness_check = to_servicelevel_freshness_check(data_contract_spec)
        candidates.append(freshness_check)
    if servicelevels.retention is not None:
        retention_check = to_servicelevel_retention_check(data_contract_spec)
        candidates.append(retention_check)

    # The builders return None when a check cannot be derived; keep only real checks.
    return [candidate for candidate in candidates if candidate is not None]
860
+
861
+
862
def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecification) -> Check | None:
    """Build the Soda freshness check for the contract's freshness service level.

    Returns ``None`` (logging the reason at info level, except for a missing
    timestamp field) when the timestamp field or threshold is missing, the
    threshold carries no day/hour/minute unit, the timestamp field is not of
    the form ``<model>.<field>``, or the model is not part of the contract.
    """
    freshness = data_contract_spec.servicelevels.freshness
    if freshness.timestampField is None:
        return None

    raw_threshold = freshness.threshold
    if raw_threshold is None:
        logger.info("servicelevel.freshness.threshold is not defined")
        return None

    # Accept only thresholds that mention a day, hour or minute unit in either case.
    if not any(unit in raw_threshold for unit in ("d", "D", "h", "H", "m", "M")):
        logger.info("servicelevel.freshness.threshold must be in days, hours, or minutes (e.g., PT1H, or 1h)")
        return None

    qualified_field = freshness.timestampField
    dot_count = qualified_field.count(".")
    if dot_count == 0:
        logger.info("servicelevel.freshness.timestampField is not fully qualified, skipping freshness check")
        return None
    if dot_count > 1:
        logger.info(
            "servicelevel.freshness.timestampField contains multiple dots, which is currently not supported, skipping freshness check"
        )
        return None

    model_name, field_name = qualified_field.split(".")
    # Strip the "P"/"T" designators and lower-case the rest, e.g. "PT1H" -> "1h".
    threshold = raw_threshold.replace("P", "").replace("T", "").lower()

    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping freshness check")
        return None

    check_type = "servicelevel_freshness"
    check_key = "servicelevel_freshness"

    freshness_condition = {f"freshness({field_name}) < {threshold}": {"name": check_key}}
    sodacl_check_dict = {checks_for(model_name, QuotingConfig(), check_type): [freshness_condition]}

    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="servicelevel",
        type=check_type,
        name="Freshness",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl_check_dict),
    )
922
+
923
+
924
def to_servicelevel_retention_check(data_contract_spec: DataContractSpecification) -> Check | None:
    """Build the Soda retention check for the contract's retention service level.

    The check asserts that the age in seconds of the oldest value of the
    timestamp field is below the retention period.

    Returns:
        The retention ``Check``, or ``None`` when no check can be derived:
        retention is not declared or is unlimited, the timestamp field or
        period is missing, the timestamp field is not of the form
        ``<model>.<field>``, the model is not part of the contract, or the
        period cannot be parsed. Apart from the first two cases the reason is
        logged at info level.
    """
    retention = data_contract_spec.servicelevels.retention
    if retention is None:
        return None
    if retention.unlimited is True:
        return None
    if retention.timestampField is None:
        logger.info("servicelevel.retention.timestampField is not defined")
        return None
    if retention.period is None:
        logger.info("servicelevel.retention.period is not defined")
        return None

    timestamp_field_fully_qualified = retention.timestampField
    if "." not in timestamp_field_fully_qualified:
        logger.info("servicelevel.retention.timestampField is not fully qualified, skipping retention check")
        return None
    if timestamp_field_fully_qualified.count(".") > 1:
        logger.info(
            "servicelevel.retention.timestampField contains multiple dots, which is currently not supported, skipping retention check"
        )
        return None

    model_name, field_name = timestamp_field_fully_qualified.split(".")
    if model_name not in data_contract_spec.models:
        logger.info(f"Model {model_name} not found in data_contract_spec.models, skipping retention check")
        return None

    period = retention.period
    period_in_seconds = period_to_seconds(period)
    if period_in_seconds is None:
        # Without this guard the condition below would read "... < None",
        # which is not a valid SodaCL threshold.
        logger.info(f"servicelevel.retention.period {period} could not be parsed, skipping retention check")
        return None

    check_type = "servicelevel_retention"
    check_key = "servicelevel_retention"
    # NOTE(review): the metric name carries a hard-coded "orders_" prefix regardless of
    # the model; kept unchanged here because it is part of the emitted SodaCL.
    sodacl_check_dict = {
        checks_for(model_name, QuotingConfig(), check_type): [
            {
                f"orders_servicelevel_retention < {period_in_seconds}": {
                    "orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",
                    "name": check_key,
                }
            },
        ]
    }
    return Check(
        id=str(uuid.uuid4()),
        key=check_key,
        category="servicelevel",
        type=check_type,
        name=f"Retention: Oldest entry has a max age of {period}",
        model=model_name,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(sodacl_check_dict),
    )
975
+
976
+
977
+ def period_to_seconds(period: str) -> int | None:
978
+ import re
979
+
980
+ # if period is None:
981
+ # return None
982
+ # if period is in form "30d" or "24h" or "60m"
983
+ if re.match(r"^\d+[dhm]$", period):
984
+ if period[-1] == "d":
985
+ return int(period[:-1]) * 86400
986
+ if period[-1] == "h":
987
+ return int(period[:-1]) * 3600
988
+ if period[-1] == "m":
989
+ return int(period[:-1]) * 60
990
+ # if it is in iso period format (do not use isodate, can also be years)
991
+ iso_period_regex = re.compile(
992
+ r"P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?"
993
+ r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?"
994
+ )
995
+ match = iso_period_regex.match(period)
996
+ if match:
997
+ years = int(match.group("years") or 0)
998
+ months = int(match.group("months") or 0)
999
+ days = int(match.group("days") or 0)
1000
+ hours = int(match.group("hours") or 0)
1001
+ minutes = int(match.group("minutes") or 0)
1002
+ seconds = int(match.group("seconds") or 0)
1003
+
1004
+ # Convert everything to seconds
1005
+ total_seconds = (
1006
+ years * 365 * 86400 # Approximate conversion of years to seconds
1007
+ + months * 30 * 86400 # Approximate conversion of months to seconds
1008
+ + days * 86400
1009
+ + hours * 3600
1010
+ + minutes * 60
1011
+ + seconds
1012
+ )
1013
+ return total_seconds
1014
+
1015
+ return None
1016
+
1017
+
1018
+ # These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
1019
def to_quality_check(data_contract_spec) -> Check | None:
    """Wrap the deprecated root-level SodaCL quality specification in a check.

    Args:
        data_contract_spec: Contract whose root-level ``quality`` section is
            inspected.

    Returns:
        A ``Check`` whose implementation is the YAML dump of the quality
        specification, or ``None`` when there is no root-level quality
        section, its type is missing or not ``sodacl`` (compared
        case-insensitively), or the specification is missing or empty.
    """
    quality = data_contract_spec.quality
    if quality is None or quality.type is None:
        return None
    if quality.type.lower() != "sodacl":
        return None

    quality_specification = quality.specification
    if isinstance(quality_specification, str):
        # The specification may be given as a YAML string instead of a parsed structure.
        quality_specification = yaml.safe_load(quality_specification)
    if quality_specification is None:
        # A missing or empty specification would otherwise be dumped as the YAML
        # document "null" and handed to the engine as a check implementation.
        return None

    return Check(
        id=str(uuid.uuid4()),
        key="quality__sodacl",
        category="quality",
        type="quality",
        name="Quality Check",
        model=None,
        engine="soda",
        language="sodacl",
        implementation=yaml.dump(quality_specification),
    )