datacontract-cli 0.10.33__py3-none-any.whl → 0.10.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/api.py +9 -2
- datacontract/cli.py +4 -2
- datacontract/engines/data_contract_checks.py +102 -59
- datacontract/engines/data_contract_test.py +37 -0
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/soda/check_soda_execute.py +6 -0
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +3 -0
- datacontract/export/avro_converter.py +12 -2
- datacontract/export/dqx_converter.py +121 -0
- datacontract/export/exporter.py +1 -0
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/markdown_converter.py +115 -5
- datacontract/export/mermaid_exporter.py +24 -11
- datacontract/export/spark_converter.py +28 -3
- datacontract/export/sql_type_converter.py +4 -0
- datacontract/imports/avro_importer.py +33 -7
- datacontract/imports/odcs_v3_importer.py +30 -1
- datacontract/imports/spark_importer.py +12 -1
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/METADATA +126 -42
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/RECORD +25 -23
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/licenses/LICENSE +1 -1
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.33.dist-info → datacontract_cli-0.10.35.dist-info}/top_level.txt +0 -0
datacontract/api.py
CHANGED
|
@@ -162,15 +162,22 @@ async def test(
|
|
|
162
162
|
server: Annotated[
|
|
163
163
|
str | None,
|
|
164
164
|
Query(
|
|
165
|
-
examples=["production"],
|
|
166
165
|
description="The server name to test. Optional, if there is only one server.",
|
|
166
|
+
examples=["production"],
|
|
167
|
+
),
|
|
168
|
+
] = None,
|
|
169
|
+
publish_url: Annotated[
|
|
170
|
+
str | None,
|
|
171
|
+
Query(
|
|
172
|
+
description="URL to publish test results. Optional, if you want to publish the test results to a Data Mesh Manager or Data Contract Manager. Example: https://api.datamesh-manager.com/api/test-results",
|
|
173
|
+
examples=["https://api.datamesh-manager.com/api/test-results"],
|
|
167
174
|
),
|
|
168
175
|
] = None,
|
|
169
176
|
) -> Run:
|
|
170
177
|
check_api_key(api_key)
|
|
171
178
|
logging.info("Testing data contract...")
|
|
172
179
|
logging.info(body)
|
|
173
|
-
return DataContract(data_contract_str=body, server=server).test()
|
|
180
|
+
return DataContract(data_contract_str=body, server=server, publish_url=publish_url).test()
|
|
174
181
|
|
|
175
182
|
|
|
176
183
|
@app.post(
|
datacontract/cli.py
CHANGED
|
@@ -126,8 +126,10 @@ def test(
|
|
|
126
126
|
"servers (default)."
|
|
127
127
|
),
|
|
128
128
|
] = "all",
|
|
129
|
-
publish_test_results: Annotated[
|
|
130
|
-
|
|
129
|
+
publish_test_results: Annotated[
|
|
130
|
+
bool, typer.Option(help="Deprecated. Use publish parameter. Publish the results after the test")
|
|
131
|
+
] = False,
|
|
132
|
+
publish: Annotated[str, typer.Option(help="The url to publish the results after the test.")] = None,
|
|
131
133
|
output: Annotated[
|
|
132
134
|
Path,
|
|
133
135
|
typer.Option(
|
datacontract/engines/data_contract_checks.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import re
|
|
1
2
|
import uuid
|
|
3
|
+
from dataclasses import dataclass
|
|
2
4
|
from typing import List
|
|
3
5
|
from venv import logger
|
|
4
6
|
|
|
@@ -9,6 +11,12 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
|
|
|
9
11
|
from datacontract.model.run import Check
|
|
10
12
|
|
|
11
13
|
|
|
14
|
+
@dataclass
|
|
15
|
+
class QuotingConfig:
|
|
16
|
+
quote_field_name: bool = False
|
|
17
|
+
quote_model_name: bool = False
|
|
18
|
+
|
|
19
|
+
|
|
12
20
|
def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
|
|
13
21
|
checks: List[Check] = []
|
|
14
22
|
for model_key, model_value in data_contract_spec.models.items():
|
|
@@ -26,37 +34,41 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
|
|
|
26
34
|
fields = model_value.fields
|
|
27
35
|
|
|
28
36
|
check_types = is_check_types(server)
|
|
29
|
-
|
|
37
|
+
|
|
38
|
+
quoting_config = QuotingConfig(
|
|
39
|
+
quote_field_name=server_type in ["postgres", "sqlserver"],
|
|
40
|
+
quote_model_name=server_type in ["postgres", "sqlserver"],
|
|
41
|
+
)
|
|
30
42
|
|
|
31
43
|
for field_name, field in fields.items():
|
|
32
|
-
checks.append(check_field_is_present(model_name, field_name,
|
|
44
|
+
checks.append(check_field_is_present(model_name, field_name, quoting_config))
|
|
33
45
|
if check_types and field.type is not None:
|
|
34
46
|
sql_type = convert_to_sql_type(field, server_type)
|
|
35
|
-
checks.append(check_field_type(model_name, field_name, sql_type,
|
|
47
|
+
checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
|
|
36
48
|
if field.required:
|
|
37
|
-
checks.append(check_field_required(model_name, field_name,
|
|
49
|
+
checks.append(check_field_required(model_name, field_name, quoting_config))
|
|
38
50
|
if field.unique:
|
|
39
|
-
checks.append(check_field_unique(model_name, field_name,
|
|
51
|
+
checks.append(check_field_unique(model_name, field_name, quoting_config))
|
|
40
52
|
if field.minLength is not None:
|
|
41
|
-
checks.append(check_field_min_length(model_name, field_name, field.minLength,
|
|
53
|
+
checks.append(check_field_min_length(model_name, field_name, field.minLength, quoting_config))
|
|
42
54
|
if field.maxLength is not None:
|
|
43
|
-
checks.append(check_field_max_length(model_name, field_name, field.maxLength,
|
|
55
|
+
checks.append(check_field_max_length(model_name, field_name, field.maxLength, quoting_config))
|
|
44
56
|
if field.minimum is not None:
|
|
45
|
-
checks.append(check_field_minimum(model_name, field_name, field.minimum,
|
|
57
|
+
checks.append(check_field_minimum(model_name, field_name, field.minimum, quoting_config))
|
|
46
58
|
if field.maximum is not None:
|
|
47
|
-
checks.append(check_field_maximum(model_name, field_name, field.maximum,
|
|
59
|
+
checks.append(check_field_maximum(model_name, field_name, field.maximum, quoting_config))
|
|
48
60
|
if field.exclusiveMinimum is not None:
|
|
49
|
-
checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum,
|
|
50
|
-
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum,
|
|
61
|
+
checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum, quoting_config))
|
|
62
|
+
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum, quoting_config))
|
|
51
63
|
if field.exclusiveMaximum is not None:
|
|
52
|
-
checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum,
|
|
53
|
-
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum,
|
|
64
|
+
checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum, quoting_config))
|
|
65
|
+
checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum, quoting_config))
|
|
54
66
|
if field.pattern is not None:
|
|
55
|
-
checks.append(check_field_regex(model_name, field_name, field.pattern,
|
|
67
|
+
checks.append(check_field_regex(model_name, field_name, field.pattern, quoting_config))
|
|
56
68
|
if field.enum is not None and len(field.enum) > 0:
|
|
57
|
-
checks.append(check_field_enum(model_name, field_name, field.enum,
|
|
69
|
+
checks.append(check_field_enum(model_name, field_name, field.enum, quoting_config))
|
|
58
70
|
if field.quality is not None and len(field.quality) > 0:
|
|
59
|
-
quality_list = check_quality_list(model_name, field_name, field.quality)
|
|
71
|
+
quality_list = check_quality_list(model_name, field_name, field.quality, quoting_config)
|
|
60
72
|
if (quality_list is not None) and len(quality_list) > 0:
|
|
61
73
|
checks.extend(quality_list)
|
|
62
74
|
# TODO references: str = None
|
|
@@ -70,8 +82,8 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
|
|
|
70
82
|
return checks
|
|
71
83
|
|
|
72
84
|
|
|
73
|
-
def checks_for(model_name,
|
|
74
|
-
if
|
|
85
|
+
def checks_for(model_name, quote_model_name: bool):
|
|
86
|
+
if quote_model_name:
|
|
75
87
|
return f'checks for "{model_name}"'
|
|
76
88
|
return f"checks for {model_name}"
|
|
77
89
|
|
|
@@ -98,11 +110,11 @@ def to_model_name(model_key, model_value, server_type):
|
|
|
98
110
|
return model_key
|
|
99
111
|
|
|
100
112
|
|
|
101
|
-
def check_field_is_present(model_name, field_name,
|
|
113
|
+
def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check:
|
|
102
114
|
check_type = "field_is_present"
|
|
103
115
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
104
116
|
sodacl_check_dict = {
|
|
105
|
-
checks_for(model_name,
|
|
117
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
106
118
|
{
|
|
107
119
|
"schema": {
|
|
108
120
|
"name": check_key,
|
|
@@ -127,11 +139,13 @@ def check_field_is_present(model_name, field_name, quote_field_name: bool) -> Ch
|
|
|
127
139
|
)
|
|
128
140
|
|
|
129
141
|
|
|
130
|
-
def check_field_type(
|
|
142
|
+
def check_field_type(
|
|
143
|
+
model_name: str, field_name: str, expected_type: str, quoting_config: QuotingConfig = QuotingConfig()
|
|
144
|
+
):
|
|
131
145
|
check_type = "field_type"
|
|
132
146
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
133
147
|
sodacl_check_dict = {
|
|
134
|
-
checks_for(model_name,
|
|
148
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
135
149
|
{
|
|
136
150
|
"schema": {
|
|
137
151
|
"name": check_key,
|
|
@@ -158,8 +172,8 @@ def check_field_type(model_name: str, field_name: str, expected_type: str, quote
|
|
|
158
172
|
)
|
|
159
173
|
|
|
160
174
|
|
|
161
|
-
def check_field_required(model_name: str, field_name: str,
|
|
162
|
-
if quote_field_name:
|
|
175
|
+
def check_field_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
176
|
+
if quoting_config.quote_field_name:
|
|
163
177
|
field_name_for_soda = f'"{field_name}"'
|
|
164
178
|
else:
|
|
165
179
|
field_name_for_soda = field_name
|
|
@@ -167,7 +181,7 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
|
|
|
167
181
|
check_type = "field_required"
|
|
168
182
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
169
183
|
sodacl_check_dict = {
|
|
170
|
-
checks_for(model_name,
|
|
184
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
171
185
|
{
|
|
172
186
|
f"missing_count({field_name_for_soda}) = 0": {
|
|
173
187
|
"name": check_key,
|
|
@@ -189,8 +203,8 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
|
|
|
189
203
|
)
|
|
190
204
|
|
|
191
205
|
|
|
192
|
-
def check_field_unique(model_name: str, field_name: str,
|
|
193
|
-
if quote_field_name:
|
|
206
|
+
def check_field_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
207
|
+
if quoting_config.quote_field_name:
|
|
194
208
|
field_name_for_soda = f'"{field_name}"'
|
|
195
209
|
else:
|
|
196
210
|
field_name_for_soda = field_name
|
|
@@ -198,7 +212,7 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
|
|
|
198
212
|
check_type = "field_unique"
|
|
199
213
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
200
214
|
sodacl_check_dict = {
|
|
201
|
-
checks_for(model_name,
|
|
215
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
202
216
|
{
|
|
203
217
|
f"duplicate_count({field_name_for_soda}) = 0": {
|
|
204
218
|
"name": check_key,
|
|
@@ -220,8 +234,10 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
|
|
|
220
234
|
)
|
|
221
235
|
|
|
222
236
|
|
|
223
|
-
def check_field_min_length(
|
|
224
|
-
|
|
237
|
+
def check_field_min_length(
|
|
238
|
+
model_name: str, field_name: str, min_length: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
239
|
+
):
|
|
240
|
+
if quoting_config.quote_field_name:
|
|
225
241
|
field_name_for_soda = f'"{field_name}"'
|
|
226
242
|
else:
|
|
227
243
|
field_name_for_soda = field_name
|
|
@@ -229,7 +245,7 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
|
|
|
229
245
|
check_type = "field_min_length"
|
|
230
246
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
231
247
|
sodacl_check_dict = {
|
|
232
|
-
checks_for(model_name,
|
|
248
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
233
249
|
{
|
|
234
250
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
235
251
|
"name": check_key,
|
|
@@ -252,8 +268,10 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
|
|
|
252
268
|
)
|
|
253
269
|
|
|
254
270
|
|
|
255
|
-
def check_field_max_length(
|
|
256
|
-
|
|
271
|
+
def check_field_max_length(
|
|
272
|
+
model_name: str, field_name: str, max_length: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
273
|
+
):
|
|
274
|
+
if quoting_config.quote_field_name:
|
|
257
275
|
field_name_for_soda = f'"{field_name}"'
|
|
258
276
|
else:
|
|
259
277
|
field_name_for_soda = field_name
|
|
@@ -261,7 +279,7 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
|
|
|
261
279
|
check_type = "field_max_length"
|
|
262
280
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
263
281
|
sodacl_check_dict = {
|
|
264
|
-
checks_for(model_name,
|
|
282
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
265
283
|
{
|
|
266
284
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
267
285
|
"name": check_key,
|
|
@@ -284,8 +302,10 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
|
|
|
284
302
|
)
|
|
285
303
|
|
|
286
304
|
|
|
287
|
-
def check_field_minimum(
|
|
288
|
-
|
|
305
|
+
def check_field_minimum(
|
|
306
|
+
model_name: str, field_name: str, minimum: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
307
|
+
):
|
|
308
|
+
if quoting_config.quote_field_name:
|
|
289
309
|
field_name_for_soda = f'"{field_name}"'
|
|
290
310
|
else:
|
|
291
311
|
field_name_for_soda = field_name
|
|
@@ -293,7 +313,7 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
|
|
|
293
313
|
check_type = "field_minimum"
|
|
294
314
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
295
315
|
sodacl_check_dict = {
|
|
296
|
-
checks_for(model_name,
|
|
316
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
297
317
|
{
|
|
298
318
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
299
319
|
"name": check_key,
|
|
@@ -316,8 +336,10 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
|
|
|
316
336
|
)
|
|
317
337
|
|
|
318
338
|
|
|
319
|
-
def check_field_maximum(
|
|
320
|
-
|
|
339
|
+
def check_field_maximum(
|
|
340
|
+
model_name: str, field_name: str, maximum: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
341
|
+
):
|
|
342
|
+
if quoting_config.quote_field_name:
|
|
321
343
|
field_name_for_soda = f'"{field_name}"'
|
|
322
344
|
else:
|
|
323
345
|
field_name_for_soda = field_name
|
|
@@ -325,7 +347,7 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
|
|
|
325
347
|
check_type = "field_maximum"
|
|
326
348
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
327
349
|
sodacl_check_dict = {
|
|
328
|
-
checks_for(model_name,
|
|
350
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
329
351
|
{
|
|
330
352
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
331
353
|
"name": check_key,
|
|
@@ -348,8 +370,10 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
|
|
|
348
370
|
)
|
|
349
371
|
|
|
350
372
|
|
|
351
|
-
def check_field_not_equal(
|
|
352
|
-
|
|
373
|
+
def check_field_not_equal(
|
|
374
|
+
model_name: str, field_name: str, value: int, quoting_config: QuotingConfig = QuotingConfig()
|
|
375
|
+
):
|
|
376
|
+
if quoting_config.quote_field_name:
|
|
353
377
|
field_name_for_soda = f'"{field_name}"'
|
|
354
378
|
else:
|
|
355
379
|
field_name_for_soda = field_name
|
|
@@ -357,7 +381,7 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
|
|
|
357
381
|
check_type = "field_not_equal"
|
|
358
382
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
359
383
|
sodacl_check_dict = {
|
|
360
|
-
checks_for(model_name,
|
|
384
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
361
385
|
{
|
|
362
386
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
363
387
|
"name": check_key,
|
|
@@ -380,8 +404,8 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
|
|
|
380
404
|
)
|
|
381
405
|
|
|
382
406
|
|
|
383
|
-
def check_field_enum(model_name: str, field_name: str, enum: list,
|
|
384
|
-
if quote_field_name:
|
|
407
|
+
def check_field_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()):
|
|
408
|
+
if quoting_config.quote_field_name:
|
|
385
409
|
field_name_for_soda = f'"{field_name}"'
|
|
386
410
|
else:
|
|
387
411
|
field_name_for_soda = field_name
|
|
@@ -389,7 +413,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
|
|
|
389
413
|
check_type = "field_enum"
|
|
390
414
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
391
415
|
sodacl_check_dict = {
|
|
392
|
-
checks_for(model_name,
|
|
416
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
393
417
|
{
|
|
394
418
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
395
419
|
"name": check_key,
|
|
@@ -412,8 +436,8 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
|
|
|
412
436
|
)
|
|
413
437
|
|
|
414
438
|
|
|
415
|
-
def check_field_regex(model_name: str, field_name: str, pattern: str,
|
|
416
|
-
if quote_field_name:
|
|
439
|
+
def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()):
|
|
440
|
+
if quoting_config.quote_field_name:
|
|
417
441
|
field_name_for_soda = f'"{field_name}"'
|
|
418
442
|
else:
|
|
419
443
|
field_name_for_soda = field_name
|
|
@@ -421,7 +445,7 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
|
|
|
421
445
|
check_type = "field_regex"
|
|
422
446
|
check_key = f"{model_name}__{field_name}__{check_type}"
|
|
423
447
|
sodacl_check_dict = {
|
|
424
|
-
checks_for(model_name,
|
|
448
|
+
checks_for(model_name, quoting_config.quote_model_name): [
|
|
425
449
|
{
|
|
426
450
|
f"invalid_count({field_name_for_soda}) = 0": {
|
|
427
451
|
"name": check_key,
|
|
@@ -444,7 +468,9 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
|
|
|
444
468
|
)
|
|
445
469
|
|
|
446
470
|
|
|
447
|
-
def check_quality_list(
|
|
471
|
+
def check_quality_list(
|
|
472
|
+
model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig()
|
|
473
|
+
) -> List[Check]:
|
|
448
474
|
checks: List[Check] = []
|
|
449
475
|
|
|
450
476
|
count = 0
|
|
@@ -457,15 +483,20 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
|
|
|
457
483
|
check_key = f"{model_name}__{field_name}__quality_sql_{count}"
|
|
458
484
|
check_type = "model_quality_sql"
|
|
459
485
|
threshold = to_sodacl_threshold(quality)
|
|
460
|
-
query = prepare_query(quality, model_name, field_name)
|
|
486
|
+
query = prepare_query(quality, model_name, field_name, quoting_config)
|
|
461
487
|
if query is None:
|
|
462
488
|
logger.warning(f"Quality check {check_key} has no query")
|
|
463
489
|
continue
|
|
464
490
|
if threshold is None:
|
|
465
491
|
logger.warning(f"Quality check {check_key} has no valid threshold")
|
|
466
492
|
continue
|
|
493
|
+
|
|
494
|
+
if quoting_config.quote_model_name:
|
|
495
|
+
model_name_for_soda = f'"{model_name}"'
|
|
496
|
+
else:
|
|
497
|
+
model_name_for_soda = model_name
|
|
467
498
|
sodacl_check_dict = {
|
|
468
|
-
f"checks for {
|
|
499
|
+
f"checks for {model_name_for_soda}": [
|
|
469
500
|
{
|
|
470
501
|
f"{check_key} {threshold}": {
|
|
471
502
|
f"{check_key} query": query,
|
|
@@ -493,7 +524,9 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
|
|
|
493
524
|
return checks
|
|
494
525
|
|
|
495
526
|
|
|
496
|
-
def prepare_query(
|
|
527
|
+
def prepare_query(
|
|
528
|
+
quality: Quality, model_name: str, field_name: str = None, quoting_config: QuotingConfig = QuotingConfig()
|
|
529
|
+
) -> str | None:
|
|
497
530
|
if quality.query is None:
|
|
498
531
|
return None
|
|
499
532
|
if quality.query == "":
|
|
@@ -501,14 +534,24 @@ def prepare_query(quality: Quality, model_name: str, field_name: str = None) ->
|
|
|
501
534
|
|
|
502
535
|
query = quality.query
|
|
503
536
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
537
|
+
if quoting_config.quote_field_name:
|
|
538
|
+
field_name_for_soda = f'"{field_name}"'
|
|
539
|
+
else:
|
|
540
|
+
field_name_for_soda = field_name
|
|
541
|
+
|
|
542
|
+
if quoting_config.quote_model_name:
|
|
543
|
+
model_name_for_soda = f'"{model_name}"'
|
|
544
|
+
else:
|
|
545
|
+
model_name_for_soda = model_name
|
|
546
|
+
|
|
547
|
+
query = re.sub(r'["\']?\{model}["\']?', model_name_for_soda, query)
|
|
548
|
+
query = re.sub(r'["\']?{schema}["\']?', model_name_for_soda, query)
|
|
549
|
+
query = re.sub(r'["\']?{table}["\']?', model_name_for_soda, query)
|
|
507
550
|
|
|
508
551
|
if field_name is not None:
|
|
509
|
-
query =
|
|
510
|
-
query =
|
|
511
|
-
query =
|
|
552
|
+
query = re.sub(r'["\']?{field}["\']?', field_name_for_soda, query)
|
|
553
|
+
query = re.sub(r'["\']?{column}["\']?', field_name_for_soda, query)
|
|
554
|
+
query = re.sub(r'["\']?{property}["\']?', field_name_for_soda, query)
|
|
512
555
|
|
|
513
556
|
return query
|
|
514
557
|
|
datacontract/engines/data_contract_test.py
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
1
4
|
import typing
|
|
2
5
|
|
|
6
|
+
import requests
|
|
3
7
|
from duckdb.duckdb import DuckDBPyConnection
|
|
4
8
|
|
|
5
9
|
from datacontract.engines.data_contract_checks import create_checks
|
|
@@ -46,6 +50,9 @@ def execute_data_contract_test(
|
|
|
46
50
|
run.outputPortId = server.outputPortId
|
|
47
51
|
run.server = server_name
|
|
48
52
|
|
|
53
|
+
if server.type == "api":
|
|
54
|
+
server = process_api_response(run, server)
|
|
55
|
+
|
|
49
56
|
run.checks.extend(create_checks(data_contract_specification, server))
|
|
50
57
|
|
|
51
58
|
# TODO check server is supported type for nicer error messages
|
|
@@ -74,3 +81,33 @@ def get_server(data_contract_specification: DataContractSpecification, server_na
|
|
|
74
81
|
server_name = list(data_contract_specification.servers.keys())[0]
|
|
75
82
|
server = data_contract_specification.servers.get(server_name)
|
|
76
83
|
return server
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def process_api_response(run, server):
|
|
87
|
+
tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
|
|
88
|
+
atexit.register(tmp_dir.cleanup)
|
|
89
|
+
headers = {}
|
|
90
|
+
if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
|
|
91
|
+
headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
|
|
92
|
+
try:
|
|
93
|
+
response = requests.get(server.location, headers=headers)
|
|
94
|
+
response.raise_for_status()
|
|
95
|
+
except requests.exceptions.RequestException as e:
|
|
96
|
+
raise DataContractException(
|
|
97
|
+
type="connection",
|
|
98
|
+
name="API server connection error",
|
|
99
|
+
result=ResultEnum.error,
|
|
100
|
+
reason=f"Failed to fetch API response from {server.location}: {e}",
|
|
101
|
+
engine="datacontract",
|
|
102
|
+
)
|
|
103
|
+
with open(f"{tmp_dir.name}/api_response.json", "w") as f:
|
|
104
|
+
f.write(response.text)
|
|
105
|
+
run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
|
|
106
|
+
server = Server(
|
|
107
|
+
type="local",
|
|
108
|
+
format="json",
|
|
109
|
+
path=f"{tmp_dir.name}/api_response.json",
|
|
110
|
+
dataProductId=server.dataProductId,
|
|
111
|
+
outputPortId=server.outputPortId,
|
|
112
|
+
)
|
|
113
|
+
return server
|
datacontract/engines/fastjsonschema/check_jsonschema.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
import glob
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
5
|
import threading
|
|
5
|
-
from typing import List, Optional
|
|
6
|
+
from typing import Any, Callable, Generator, List, Optional
|
|
6
7
|
|
|
7
8
|
import fastjsonschema
|
|
8
9
|
from fastjsonschema import JsonSchemaValueException
|
|
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):
|
|
|
85
86
|
|
|
86
87
|
|
|
87
88
|
def validate_json_stream(
|
|
88
|
-
schema: dict, model_name: str, validate:
|
|
89
|
+
schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
|
|
89
90
|
) -> List[DataContractException]:
|
|
90
91
|
logging.info(f"Validating JSON stream for model: '{model_name}'.")
|
|
91
92
|
exceptions: List[DataContractException] = []
|
|
@@ -99,7 +100,7 @@ def validate_json_stream(
|
|
|
99
100
|
DataContractException(
|
|
100
101
|
type="schema",
|
|
101
102
|
name="Check that JSON has valid schema",
|
|
102
|
-
result=
|
|
103
|
+
result=ResultEnum.failed,
|
|
103
104
|
reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
|
|
104
105
|
model=model_name,
|
|
105
106
|
engine="jsonschema",
|
|
@@ -159,27 +160,44 @@ def process_json_file(run, schema, model_name, validate, file, delimiter):
|
|
|
159
160
|
|
|
160
161
|
def process_local_file(run, server, schema, model_name, validate):
|
|
161
162
|
path = server.path
|
|
163
|
+
if not path:
|
|
164
|
+
raise DataContractException(
|
|
165
|
+
type="schema",
|
|
166
|
+
name="Check that JSON has valid schema",
|
|
167
|
+
result=ResultEnum.warning,
|
|
168
|
+
reason="For server with type 'local', a 'path' must be defined.",
|
|
169
|
+
engine="datacontract",
|
|
170
|
+
)
|
|
162
171
|
if "{model}" in path:
|
|
163
172
|
path = path.format(model=model_name)
|
|
164
173
|
|
|
174
|
+
all_files = []
|
|
165
175
|
if os.path.isdir(path):
|
|
166
|
-
|
|
176
|
+
# Fetch all JSONs in the directory
|
|
177
|
+
for root, _, files in os.walk(path):
|
|
178
|
+
for file in files:
|
|
179
|
+
if file.endswith(".json"):
|
|
180
|
+
all_files.append(os.path.join(root, file))
|
|
167
181
|
else:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
182
|
+
# Use glob to fetch all JSONs
|
|
183
|
+
for file_path in glob.glob(path, recursive=True):
|
|
184
|
+
if os.path.isfile(file_path):
|
|
185
|
+
if file_path.endswith(".json"):
|
|
186
|
+
all_files.append(file_path)
|
|
171
187
|
|
|
188
|
+
if not all_files:
|
|
189
|
+
raise DataContractException(
|
|
190
|
+
type="schema",
|
|
191
|
+
name="Check that JSON has valid schema",
|
|
192
|
+
result=ResultEnum.warning,
|
|
193
|
+
reason=f"No files found in '{path}'.",
|
|
194
|
+
engine="datacontract",
|
|
195
|
+
)
|
|
172
196
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
file_path = os.path.join(path, filename)
|
|
178
|
-
with open(file_path, "r") as file:
|
|
179
|
-
if not process_json_file(run, model_name, validate, file, server.delimiter):
|
|
180
|
-
success = False
|
|
181
|
-
break
|
|
182
|
-
return success
|
|
197
|
+
for file in all_files:
|
|
198
|
+
logging.info(f"Processing file: {file}")
|
|
199
|
+
with open(file, "r") as f:
|
|
200
|
+
process_json_file(run, schema, model_name, validate, f, server.delimiter)
|
|
183
201
|
|
|
184
202
|
|
|
185
203
|
def process_s3_file(run, server, schema, model_name, validate):
|
|
@@ -201,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
|
|
|
201
219
|
raise DataContractException(
|
|
202
220
|
type="schema",
|
|
203
221
|
name="Check that JSON has valid schema",
|
|
204
|
-
result=
|
|
222
|
+
result=ResultEnum.warning,
|
|
205
223
|
reason=f"Cannot find any file in {s3_location}",
|
|
206
224
|
engine="datacontract",
|
|
207
225
|
)
|
|
@@ -222,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
|
|
|
222
240
|
Check(
|
|
223
241
|
type="schema",
|
|
224
242
|
name="Check that JSON has valid schema",
|
|
225
|
-
result=
|
|
243
|
+
result=ResultEnum.warning,
|
|
226
244
|
reason="Server format is not 'json'. Skip validating jsonschema.",
|
|
227
245
|
engine="jsonschema",
|
|
228
246
|
)
|
datacontract/engines/soda/check_soda_execute.py
CHANGED
|
@@ -2,6 +2,8 @@ import logging
|
|
|
2
2
|
import typing
|
|
3
3
|
import uuid
|
|
4
4
|
|
|
5
|
+
from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
|
|
6
|
+
|
|
5
7
|
if typing.TYPE_CHECKING:
|
|
6
8
|
from pyspark.sql import SparkSession
|
|
7
9
|
|
|
@@ -106,6 +108,10 @@ def check_soda_execute(
|
|
|
106
108
|
soda_configuration_str = to_trino_soda_configuration(server)
|
|
107
109
|
scan.add_configuration_yaml_str(soda_configuration_str)
|
|
108
110
|
scan.set_data_source_name(server.type)
|
|
111
|
+
elif server.type == "athena":
|
|
112
|
+
soda_configuration_str = to_athena_soda_configuration(server)
|
|
113
|
+
scan.add_configuration_yaml_str(soda_configuration_str)
|
|
114
|
+
scan.set_data_source_name(server.type)
|
|
109
115
|
|
|
110
116
|
else:
|
|
111
117
|
run.checks.append(
|