datacontract-cli 0.10.34__py3-none-any.whl → 0.10.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/api.py +10 -3
- datacontract/cli.py +5 -3
- datacontract/data_contract.py +18 -51
- datacontract/engines/data_contract_checks.py +280 -19
- datacontract/engines/fastjsonschema/check_jsonschema.py +29 -19
- datacontract/export/dbt_converter.py +30 -4
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/excel_exporter.py +3 -3
- datacontract/export/exporter.py +1 -0
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/markdown_converter.py +35 -16
- datacontract/export/mermaid_exporter.py +24 -11
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/spark_converter.py +28 -3
- datacontract/export/sql_type_converter.py +6 -4
- datacontract/imports/odcs_v3_importer.py +100 -19
- datacontract/imports/unity_importer.py +16 -11
- datacontract/init/init_template.py +1 -1
- datacontract/lint/resolve.py +1 -1
- datacontract/lint/schema.py +1 -1
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract_odcs.html +60 -41
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/METADATA +68 -56
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/RECORD +32 -35
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/__init__.py +0 -0
- datacontract/lint/linters/description_linter.py +0 -33
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -47
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/licenses/LICENSE +0 -0
- {datacontract_cli-0.10.34.dist-info → datacontract_cli-0.10.36.dist-info}/top_level.txt +0 -0
datacontract/api.py
CHANGED

@@ -10,7 +10,7 @@ from fastapi.security.api_key import APIKeyHeader
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.model.run import Run
 
-DATA_CONTRACT_EXAMPLE_PAYLOAD = """dataContractSpecification: 1.2.
+DATA_CONTRACT_EXAMPLE_PAYLOAD = """dataContractSpecification: 1.2.1
 id: urn:datacontract:checkout:orders-latest
 info:
   title: Orders Latest
@@ -162,15 +162,22 @@ async def test(
     server: Annotated[
         str | None,
         Query(
-            examples=["production"],
             description="The server name to test. Optional, if there is only one server.",
+            examples=["production"],
+        ),
+    ] = None,
+    publish_url: Annotated[
+        str | None,
+        Query(
+            description="URL to publish test results. Optional, if you want to publish the test results to a Data Mesh Manager or Data Contract Manager. Example: https://api.datamesh-manager.com/api/test-results",
+            examples=["https://api.datamesh-manager.com/api/test-results"],
         ),
     ] = None,
 ) -> Run:
     check_api_key(api_key)
     logging.info("Testing data contract...")
     logging.info(body)
-    return DataContract(data_contract_str=body, server=server).test()
+    return DataContract(data_contract_str=body, server=server, publish_url=publish_url).test()
 
 
 @app.post(
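The last changed line above shows the new publish_url query parameter being passed straight through to the DataContract constructor. A minimal sketch of the same call from Python (the file path and URL below are illustrative placeholders, not taken from the package):

from datacontract.data_contract import DataContract

# Sketch only: optionally publish test results by passing publish_url,
# mirroring the changed return statement of the /test endpoint above.
with open("datacontract.yaml") as f:  # placeholder path
    run = DataContract(
        data_contract_str=f.read(),
        server="production",
        publish_url="https://api.datamesh-manager.com/api/test-results",
    ).test()
print(run)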
datacontract/cli.py
CHANGED

@@ -126,8 +126,10 @@ def test(
             "servers (default)."
         ),
     ] = "all",
-    publish_test_results: Annotated[
-
+    publish_test_results: Annotated[
+        bool, typer.Option(help="Deprecated. Use publish parameter. Publish the results after the test")
+    ] = False,
+    publish: Annotated[str, typer.Option(help="The url to publish the results after the test.")] = None,
     output: Annotated[
         Path,
         typer.Option(
@@ -329,7 +331,7 @@ def import_(
     """
     Create a data contract from the given source location. Saves to file specified by `output` option if present, otherwise prints to stdout.
     """
-    result = DataContract
+    result = DataContract.import_from_source(
         format=format,
         source=source,
         spec=spec,
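publish_test_results is now marked deprecated in favor of the new publish option. A hedged invocation sketch, assuming the Typer application object is exposed as datacontract.cli:app (as the console entry point suggests) and that the option surfaces as --publish; the file name and URL are placeholders:

from typer.testing import CliRunner

from datacontract.cli import app  # assumption: the Typer app behind the `datacontract` entry point

runner = CliRunner()
# Sketch only: run `datacontract test` and publish the results to a given URL.
result = runner.invoke(
    app,
    ["test", "datacontract.yaml", "--publish", "https://api.datamesh-manager.com/api/test-results"],
)
print(result.output)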
datacontract/data_contract.py
CHANGED

@@ -26,11 +26,6 @@ from datacontract.imports.importer_factory import importer_factory
 from datacontract.init.init_template import get_init_template
 from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
 from datacontract.lint import resolve
-from datacontract.lint.linters.description_linter import DescriptionLinter
-from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
-from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
-from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
-from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
 from datacontract.model.data_contract_specification import DataContractSpecification, Info
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Check, ResultEnum, Run
@@ -64,24 +59,14 @@ class DataContract:
         self._inline_definitions = inline_definitions
         self._inline_quality = inline_quality
         self._ssl_verification = ssl_verification
-        self.all_linters = {
-            FieldPatternLinter(),
-            FieldReferenceLinter(),
-            NoticePeriodLinter(),
-            ValidFieldConstraintsLinter(),
-            DescriptionLinter(),
-        }
 
     @classmethod
     def init(cls, template: typing.Optional[str], schema: typing.Optional[str] = None) -> DataContractSpecification:
         template_str = get_init_template(template)
         return resolve.resolve_data_contract(data_contract_str=template_str, schema_location=schema)
 
-    def lint(self
-        """Lint the data contract by
-
-        enabled_linters can be either "all" or "none", or a set of linter IDs. The "schema" linter is always enabled, even with enabled_linters="none".
-        """
+    def lint(self) -> Run:
+        """Lint the data contract by validating it against the JSON schema."""
         run = Run.create_run()
         try:
             run.log_info("Linting data contract")
@@ -101,27 +86,6 @@ class DataContract:
                         engine="datacontract",
                     )
                 )
-            if enabled_linters == "none":
-                linters_to_check = set()
-            elif enabled_linters == "all":
-                linters_to_check = self.all_linters
-            elif isinstance(enabled_linters, set):
-                linters_to_check = {linter for linter in self.all_linters if linter.id in enabled_linters}
-            else:
-                raise RuntimeError(f"Unknown argument enabled_linters={enabled_linters} for lint()")
-            for linter in linters_to_check:
-                try:
-                    run.checks.extend(linter.lint(data_contract))
-                except Exception as e:
-                    run.checks.append(
-                        Check(
-                            type="general",
-                            result=ResultEnum.error,
-                            name=f"Linter '{linter.name}'",
-                            reason=str(e),
-                            engine="datacontract",
-                        )
-                    )
             run.dataContractId = data_contract.id
             run.dataContractVersion = data_contract.info.version
         except DataContractException as e:
@@ -292,10 +256,9 @@ class DataContract:
             export_args=kwargs,
         )
 
-
-    # could be a class method, not using anything from the instance
+    @classmethod
     def import_from_source(
-
+        cls,
         format: str,
         source: typing.Optional[str] = None,
         template: typing.Optional[str] = None,
@@ -307,7 +270,7 @@ class DataContract:
         owner = kwargs.get("owner")
 
         if spec == Spec.odcs or format == ImportFormat.excel:
-            data_contract_specification_initial =
+            data_contract_specification_initial = cls.init(template=template, schema=schema)
 
             odcs_imported = importer_factory.create(format).import_source(
                 data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs
@@ -317,12 +280,12 @@ class DataContract:
             # convert automatically
             odcs_imported = to_odcs_v3(odcs_imported)
 
-
-
+            cls._overwrite_id_in_odcs(odcs_imported, id)
+            cls._overwrite_owner_in_odcs(odcs_imported, owner)
 
             return odcs_imported
         elif spec == Spec.datacontract_specification:
-            data_contract_specification_initial =
+            data_contract_specification_initial = cls.init(template=template, schema=schema)
 
             data_contract_specification_imported = importer_factory.create(format).import_source(
                 data_contract_specification=data_contract_specification_initial, source=source, import_args=kwargs
@@ -334,8 +297,8 @@ class DataContract:
                 data_contract_specification_initial, data_contract_specification_imported
             )
 
-
-
+            cls._overwrite_id_in_data_contract_specification(data_contract_specification_imported, id)
+            cls._overwrite_owner_in_data_contract_specification(data_contract_specification_imported, owner)
 
             return data_contract_specification_imported
         else:
@@ -347,16 +310,18 @@ class DataContract:
                 engine="datacontract",
             )
 
+    @staticmethod
     def _overwrite_id_in_data_contract_specification(
-
+        data_contract_specification: DataContractSpecification, id: str | None
     ):
         if not id:
             return
 
         data_contract_specification.id = id
 
+    @staticmethod
     def _overwrite_owner_in_data_contract_specification(
-
+        data_contract_specification: DataContractSpecification, owner: str | None
     ):
         if not owner:
             return
@@ -365,7 +330,8 @@ class DataContract:
             data_contract_specification.info = Info()
         data_contract_specification.info.owner = owner
 
-
+    @staticmethod
+    def _overwrite_owner_in_odcs(odcs: OpenDataContractStandard, owner: str | None):
         if not owner:
             return
@@ -377,7 +343,8 @@ class DataContract:
             return
         odcs.customProperties.append(CustomProperty(property="owner", value=owner))
 
-
+    @staticmethod
+    def _overwrite_id_in_odcs(odcs: OpenDataContractStandard, id: str | None):
         if not id:
             return
 
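Because import_from_source is now a classmethod (and the _overwrite_* helpers are static), it can be called without constructing a DataContract instance, exactly as the updated cli.py does. A minimal sketch with placeholder arguments (the format and source values are illustrative only):

from datacontract.data_contract import DataContract

# Sketch only: import an existing contract document into a specification object.
data_contract_specification = DataContract.import_from_source(
    format="odcs",       # placeholder import format
    source="odcs.yaml",  # placeholder source file
)
print(data_contract_specification)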
datacontract/engines/data_contract_checks.py
CHANGED

@@ -15,6 +15,7 @@ from datacontract.model.run import Check
 class QuotingConfig:
     quote_field_name: bool = False
     quote_model_name: bool = False
+    quote_model_name_with_backticks: bool = False
 
 
 def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
@@ -35,15 +36,18 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
 
     check_types = is_check_types(server)
 
-
-
-
+    type1 = server.type if server and server.type else None
+    config = QuotingConfig(
+        quote_field_name=type1 in ["postgres", "sqlserver"],
+        quote_model_name=type1 in ["postgres", "sqlserver"],
+        quote_model_name_with_backticks=type1 == "bigquery",
     )
+    quoting_config = config
 
     for field_name, field in fields.items():
         checks.append(check_field_is_present(model_name, field_name, quoting_config))
         if check_types and field.type is not None:
-            sql_type = convert_to_sql_type(field, server_type)
+            sql_type: str = convert_to_sql_type(field, server_type)
             checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
         if field.required:
             checks.append(check_field_required(model_name, field_name, quoting_config))
@@ -82,9 +86,11 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
     return checks
 
 
-def checks_for(model_name,
-    if quote_model_name:
+def checks_for(model_name: str, quoting_config: QuotingConfig, check_type: str) -> str:
+    if quoting_config.quote_model_name:
         return f'checks for "{model_name}"'
+    elif quoting_config.quote_model_name_with_backticks and check_type not in ["field_is_present", "field_type"]:
+        return f"checks for `{model_name}`"
     return f"checks for {model_name}"
 
 
@@ -114,7 +120,7 @@ def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig
     check_type = "field_is_present"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                "schema": {
                    "name": check_key,
@@ -145,7 +151,7 @@ def check_field_type(
     check_type = "field_type"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                "schema": {
                    "name": check_key,
@@ -181,7 +187,7 @@ def check_field_required(model_name: str, field_name: str, quoting_config: Quoti
     check_type = "field_required"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"missing_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -212,7 +218,7 @@ def check_field_unique(model_name: str, field_name: str, quoting_config: Quoting
     check_type = "field_unique"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"duplicate_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -245,7 +251,7 @@ def check_field_min_length(
     check_type = "field_min_length"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -279,7 +285,7 @@ def check_field_max_length(
     check_type = "field_max_length"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -313,7 +319,7 @@ def check_field_minimum(
     check_type = "field_minimum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -347,7 +353,7 @@ def check_field_maximum(
     check_type = "field_maximum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -381,7 +387,7 @@ def check_field_not_equal(
     check_type = "field_not_equal"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -413,7 +419,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quoting_confi
     check_type = "field_enum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -445,7 +451,7 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_co
     check_type = "field_regex"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name, quoting_config
+        checks_for(model_name, quoting_config, check_type): [
            {
                f"invalid_count({field_name_for_soda}) = 0": {
                    "name": check_key,
@@ -468,6 +474,212 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_co
     )
 
 
+def check_row_count(model_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()):
+    check_type = "row_count"
+    check_key = f"{model_name}__{check_type}"
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"row_count {threshold}": {"name": check_key},
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="schema",
+        type=check_type,
+        name=f"Check that model {model_name} has row_count {threshold}",
+        model=model_name,
+        field=None,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
+def check_model_duplicate_values(
+    model_name: str, cols: list[str], threshold: str, quoting_config: QuotingConfig = QuotingConfig()
+):
+    check_type = "model_duplicate_values"
+    check_key = f"{model_name}__{check_type}"
+    col_joined = ", ".join(cols)
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"duplicate_count({col_joined}) {threshold}": {"name": check_key},
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="quality",
+        type=check_type,
+        name=f"Check that model {model_name} has duplicate_count {threshold} for columns {col_joined}",
+        model=model_name,
+        field=None,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
+def check_field_duplicate_values(
+    model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
+        field_name_for_soda = f'"{field_name}"'
+    else:
+        field_name_for_soda = field_name
+
+    check_type = "field_duplicate_values"
+    check_key = f"{model_name}__{field_name}__{check_type}"
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"duplicate_count({field_name_for_soda}) {threshold}": {
+                    "name": check_key,
+                },
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="quality",
+        type=check_type,
+        name=f"Check that field {field_name} has duplicate_count {threshold}",
+        model=model_name,
+        field=field_name,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
+def check_field_null_values(
+    model_name: str, field_name: str, threshold: str, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
+        field_name_for_soda = f'"{field_name}"'
+    else:
+        field_name_for_soda = field_name
+
+    check_type = "field_null_values"
+    check_key = f"{model_name}__{field_name}__{check_type}"
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"missing_count({field_name_for_soda}) {threshold}": {
+                    "name": check_key,
+                },
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="quality",
+        type=check_type,
+        name=f"Check that field {field_name} has missing_count {threshold}",
+        model=model_name,
+        field=field_name,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
+def check_field_invalid_values(
+    model_name: str,
+    field_name: str,
+    threshold: str,
+    valid_values: list = None,
+    quoting_config: QuotingConfig = QuotingConfig(),
+):
+    if quoting_config.quote_field_name:
+        field_name_for_soda = f'"{field_name}"'
+    else:
+        field_name_for_soda = field_name
+
+    check_type = "field_invalid_values"
+    check_key = f"{model_name}__{field_name}__{check_type}"
+
+    sodacl_check_config = {
+        "name": check_key,
+    }
+
+    if valid_values is not None:
+        sodacl_check_config["valid values"] = valid_values
+
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"invalid_count({field_name_for_soda}) {threshold}": sodacl_check_config,
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="quality",
+        type=check_type,
+        name=f"Check that field {field_name} has invalid_count {threshold}",
+        model=model_name,
+        field=field_name,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
+def check_field_missing_values(
+    model_name: str,
+    field_name: str,
+    threshold: str,
+    missing_values: list = None,
+    quoting_config: QuotingConfig = QuotingConfig(),
+):
+    if quoting_config.quote_field_name:
+        field_name_for_soda = f'"{field_name}"'
+    else:
+        field_name_for_soda = field_name
+
+    check_type = "field_missing_values"
+    check_key = f"{model_name}__{field_name}__{check_type}"
+
+    sodacl_check_config = {
+        "name": check_key,
+    }
+
+    if missing_values is not None:
+        # Filter out null/None values as SodaCL handles these automatically
+        filtered_missing_values = [v for v in missing_values if v is not None]
+        if filtered_missing_values:
+            sodacl_check_config["missing values"] = filtered_missing_values
+
+    sodacl_check_dict = {
+        checks_for(model_name, quoting_config, check_type): [
+            {
+                f"missing_count({field_name_for_soda}) {threshold}": sodacl_check_config,
+            }
+        ],
+    }
+    return Check(
+        id=str(uuid.uuid4()),
+        key=check_key,
+        category="quality",
+        type=check_type,
+        name=f"Check that field {field_name} has missing_count {threshold}",
+        model=model_name,
+        field=field_name,
+        engine="soda",
+        language="sodacl",
+        implementation=yaml.dump(sodacl_check_dict),
+    )
+
+
 def check_quality_list(
     model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig()
 ) -> List[Check]:
@@ -519,6 +731,49 @@ def check_quality_list(
                     implementation=yaml.dump(sodacl_check_dict),
                 )
             )
+        elif quality.metric is not None:
+            threshold = to_sodacl_threshold(quality)
+
+            if threshold is None:
+                logger.warning(f"Quality metric {quality.metric} has no valid threshold")
+                continue
+
+            if quality.metric == "rowCount":
+                checks.append(check_row_count(model_name, threshold, quoting_config))
+            elif quality.metric == "duplicateValues":
+                if field_name is None:
+                    # TODO check that quality.arguments.get("properties") is a list of strings and contains at lease one property
+                    checks.append(
+                        check_model_duplicate_values(
+                            model_name, quality.arguments.get("properties"), threshold, quoting_config
+                        )
+                    )
+                else:
+                    checks.append(check_field_duplicate_values(model_name, field_name, threshold, quoting_config))
+            elif quality.metric == "nullValues":
+                if field_name is not None:
+                    checks.append(check_field_null_values(model_name, field_name, threshold, quoting_config))
+                else:
+                    logger.warning("Quality check nullValues is only supported at field level")
+            elif quality.metric == "invalidValues":
+                if field_name is not None:
+                    valid_values = quality.arguments.get("validValues") if quality.arguments else None
+                    checks.append(
+                        check_field_invalid_values(model_name, field_name, threshold, valid_values, quoting_config)
+                    )
+                else:
+                    logger.warning("Quality check invalidValues is only supported at field level")
+            elif quality.metric == "missingValues":
+                if field_name is not None:
+                    missing_values = quality.arguments.get("missingValues") if quality.arguments else None
+                    checks.append(
+                        check_field_missing_values(model_name, field_name, threshold, missing_values, quoting_config)
+                    )
+                else:
+                    logger.warning("Quality check missingValues is only supported at field level")
+            else:
+                logger.warning(f"Quality check {quality.metric} is not yet supported")
+
         count += 1
 
     return checks
@@ -541,6 +796,8 @@ def prepare_query(
 
     if quoting_config.quote_model_name:
         model_name_for_soda = f'"{model_name}"'
+    elif quoting_config.quote_model_name_with_backticks:
+        model_name_for_soda = f"`{model_name}`"
     else:
         model_name_for_soda = model_name
 
@@ -563,10 +820,14 @@ def to_sodacl_threshold(quality: Quality) -> str | None:
         return f"!= {quality.mustNotBe}"
     if quality.mustBeGreaterThan is not None:
         return f"> {quality.mustBeGreaterThan}"
+    if quality.mustBeGreaterOrEqualTo is not None:
+        return f">= {quality.mustBeGreaterOrEqualTo}"
    if quality.mustBeGreaterThanOrEqualTo is not None:
        return f">= {quality.mustBeGreaterThanOrEqualTo}"
    if quality.mustBeLessThan is not None:
        return f"< {quality.mustBeLessThan}"
+    if quality.mustBeLessOrEqualTo is not None:
+        return f"<= {quality.mustBeLessOrEqualTo}"
    if quality.mustBeLessThanOrEqualTo is not None:
        return f"<= {quality.mustBeLessThanOrEqualTo}"
    if quality.mustBeBetween is not None:
@@ -639,7 +900,7 @@ def to_servicelevel_freshness_check(data_contract_spec: DataContractSpecificatio
     check_key = "servicelevel_freshness"
 
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, QuotingConfig(), check_type): [
            {
                f"freshness({field_name}) < {threshold}": {
                    "name": check_key,
@@ -691,7 +952,7 @@ def to_servicelevel_retention_check(data_contract_spec) -> Check | None:
     check_type = "servicelevel_retention"
     check_key = "servicelevel_retention"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, QuotingConfig(), check_type): [
            {
                f"orders_servicelevel_retention < {period_in_seconds}": {
                    "orders_servicelevel_retention expression": f"TIMESTAMPDIFF(SECOND, MIN({field_name}), CURRENT_TIMESTAMP)",