datacontract-cli 0.10.32__py3-none-any.whl → 0.10.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/cli.py +20 -5
- datacontract/data_contract.py +8 -2
- datacontract/engines/data_contract_checks.py +102 -59
- datacontract/engines/data_contract_test.py +37 -0
- datacontract/engines/fastjsonschema/check_jsonschema.py +8 -0
- datacontract/engines/soda/check_soda_execute.py +6 -0
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +3 -0
- datacontract/export/avro_converter.py +12 -2
- datacontract/export/excel_exporter.py +922 -0
- datacontract/export/exporter.py +1 -0
- datacontract/export/exporter_factory.py +4 -0
- datacontract/export/markdown_converter.py +115 -5
- datacontract/export/sql_type_converter.py +4 -0
- datacontract/imports/avro_importer.py +33 -7
- datacontract/imports/excel_importer.py +13 -5
- datacontract/imports/odcs_v3_importer.py +1 -0
- datacontract/imports/spark_importer.py +12 -1
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/METADATA +111 -12
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/RECORD +24 -22
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/licenses/LICENSE +1 -1
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.32.dist-info → datacontract_cli-0.10.34.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
@@ -210,12 +210,21 @@ def export(
     # TODO: this should be a subcommand
     template: Annotated[
         Optional[Path],
-        typer.Option(
+        typer.Option(
+            help="The file path or URL of a template. For Excel format: path/URL to custom Excel template. For custom format: path to Jinja template."
+        ),
     ] = None,
 ):
     """
     Convert data contract to a specific format. Saves to file specified by `output` option if present, otherwise prints to stdout.
     """
+    # Validate that Excel format requires an output file path
+    if format == ExportFormat.excel and output is None:
+        console.print("❌ Error: Excel export requires an output file path.")
+        console.print("💡 Hint: Use --output to specify where to save the Excel file, e.g.:")
+        console.print("   datacontract export --format excel --output datacontract.xlsx")
+        raise typer.Exit(code=1)
+
     # TODO exception handling
     result = DataContract(data_contract_file=location, schema_location=schema, server=server).export(
         export_format=format,

@@ -230,8 +239,13 @@ def export(
     if output is None:
         console.print(result, markup=False, soft_wrap=True)
     else:
-
-
+        if isinstance(result, bytes):
+            # If the result is bytes, we assume it's a binary file (e.g., Excel, PDF)
+            with output.open(mode="wb") as f:
+                f.write(result)
+        else:
+            with output.open(mode="w", encoding="utf-8") as f:
+                f.write(result)
         console.print(f"Written result to {output}")

@@ -482,13 +496,14 @@ def _get_uvicorn_arguments(port: int, host: str, context: typer.Context) -> dict
     }

     # Create a list of the extra arguments, remove the leading -- from the cli arguments
-    trimmed_keys = list(map(lambda x
+    trimmed_keys = list(map(lambda x: str(x).replace("--", ""), context.args[::2]))
     # Merge the two dicts and return them as one dict
     return default_args | dict(zip(trimmed_keys, context.args[1::2]))

+
 @app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
 def api(
-    ctx: Annotated[typer.Context, typer.Option(help="Extra arguments to pass to uvicorn.run().")],
+    ctx: Annotated[typer.Context, typer.Option(help="Extra arguments to pass to uvicorn.run().")],
     port: Annotated[int, typer.Option(help="Bind socket to this port.")] = 4242,
     host: Annotated[
         str, typer.Option(help="Bind socket to this host. Hint: For running in docker, set it to 0.0.0.0")
datacontract/data_contract.py
CHANGED
@@ -250,8 +250,14 @@ class DataContract:
             inline_quality=self._inline_quality,
         )

-    def export(
-
+    def export(
+        self, export_format: ExportFormat, model: str = "all", sql_server_type: str = "auto", **kwargs
+    ) -> str | bytes:
+        if (
+            export_format == ExportFormat.html
+            or export_format == ExportFormat.mermaid
+            or export_format == ExportFormat.excel
+        ):
             data_contract = resolve.resolve_data_contract_v2(
                 self._data_contract_file,
                 self._data_contract_str,
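Note the return type widened to str | bytes, and the chained or comparisons select the formats that resolve against the v2 specification. The same condition can be written as a membership test; a sketch with a minimal stand-in enum (only the values used here, not the package's full ExportFormat):

from enum import Enum


class ExportFormat(str, Enum):
    # stand-in for the package's ExportFormat enum, only for this sketch
    html = "html"
    mermaid = "mermaid"
    excel = "excel"
    avro = "avro"


def needs_v2_resolution(fmt: ExportFormat) -> bool:
    # equivalent to the chained `or` comparisons above
    return fmt in (ExportFormat.html, ExportFormat.mermaid, ExportFormat.excel)


assert needs_v2_resolution(ExportFormat.excel)
assert not needs_v2_resolution(ExportFormat.avro)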
datacontract/engines/data_contract_checks.py
CHANGED

@@ -1,4 +1,6 @@
+import re
 import uuid
+from dataclasses import dataclass
 from typing import List
 from venv import logger

@@ -9,6 +11,12 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check


+@dataclass
+class QuotingConfig:
+    quote_field_name: bool = False
+    quote_model_name: bool = False
+
+
 def create_checks(data_contract_spec: DataContractSpecification, server: Server) -> List[Check]:
     checks: List[Check] = []
     for model_key, model_value in data_contract_spec.models.items():

@@ -26,37 +34,41 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
     fields = model_value.fields

     check_types = is_check_types(server)
-
+
+    quoting_config = QuotingConfig(
+        quote_field_name=server_type in ["postgres", "sqlserver"],
+        quote_model_name=server_type in ["postgres", "sqlserver"],
+    )

     for field_name, field in fields.items():
-        checks.append(check_field_is_present(model_name, field_name,
+        checks.append(check_field_is_present(model_name, field_name, quoting_config))
         if check_types and field.type is not None:
             sql_type = convert_to_sql_type(field, server_type)
-            checks.append(check_field_type(model_name, field_name, sql_type,
+            checks.append(check_field_type(model_name, field_name, sql_type, quoting_config))
         if field.required:
-            checks.append(check_field_required(model_name, field_name,
+            checks.append(check_field_required(model_name, field_name, quoting_config))
         if field.unique:
-            checks.append(check_field_unique(model_name, field_name,
+            checks.append(check_field_unique(model_name, field_name, quoting_config))
         if field.minLength is not None:
-            checks.append(check_field_min_length(model_name, field_name, field.minLength,
+            checks.append(check_field_min_length(model_name, field_name, field.minLength, quoting_config))
         if field.maxLength is not None:
-            checks.append(check_field_max_length(model_name, field_name, field.maxLength,
+            checks.append(check_field_max_length(model_name, field_name, field.maxLength, quoting_config))
         if field.minimum is not None:
-            checks.append(check_field_minimum(model_name, field_name, field.minimum,
+            checks.append(check_field_minimum(model_name, field_name, field.minimum, quoting_config))
         if field.maximum is not None:
-            checks.append(check_field_maximum(model_name, field_name, field.maximum,
+            checks.append(check_field_maximum(model_name, field_name, field.maximum, quoting_config))
         if field.exclusiveMinimum is not None:
-            checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum,
-            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum,
+            checks.append(check_field_minimum(model_name, field_name, field.exclusiveMinimum, quoting_config))
+            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMinimum, quoting_config))
         if field.exclusiveMaximum is not None:
-            checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum,
-            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum,
+            checks.append(check_field_maximum(model_name, field_name, field.exclusiveMaximum, quoting_config))
+            checks.append(check_field_not_equal(model_name, field_name, field.exclusiveMaximum, quoting_config))
         if field.pattern is not None:
-            checks.append(check_field_regex(model_name, field_name, field.pattern,
+            checks.append(check_field_regex(model_name, field_name, field.pattern, quoting_config))
         if field.enum is not None and len(field.enum) > 0:
-            checks.append(check_field_enum(model_name, field_name, field.enum,
+            checks.append(check_field_enum(model_name, field_name, field.enum, quoting_config))
         if field.quality is not None and len(field.quality) > 0:
-            quality_list = check_quality_list(model_name, field_name, field.quality)
+            quality_list = check_quality_list(model_name, field_name, field.quality, quoting_config)
             if (quality_list is not None) and len(quality_list) > 0:
                 checks.extend(quality_list)
         # TODO references: str = None

@@ -70,8 +82,8 @@ def to_model_checks(model_key, model_value, server: Server) -> List[Check]:
     return checks


-def checks_for(model_name,
-    if
+def checks_for(model_name, quote_model_name: bool):
+    if quote_model_name:
         return f'checks for "{model_name}"'
     return f"checks for {model_name}"

@@ -98,11 +110,11 @@ def to_model_name(model_key, model_value, server_type):
     return model_key


-def check_field_is_present(model_name, field_name,
+def check_field_is_present(model_name, field_name, quoting_config: QuotingConfig = QuotingConfig()) -> Check:
     check_type = "field_is_present"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 "schema": {
                     "name": check_key,

@@ -127,11 +139,13 @@ def check_field_is_present(model_name, field_name, quote_field_name: bool) -> Ch
     )


-def check_field_type(
+def check_field_type(
+    model_name: str, field_name: str, expected_type: str, quoting_config: QuotingConfig = QuotingConfig()
+):
     check_type = "field_type"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 "schema": {
                     "name": check_key,

@@ -158,8 +172,8 @@ def check_field_type(model_name: str, field_name: str, expected_type: str, quote
     )


-def check_field_required(model_name: str, field_name: str,
-    if quote_field_name:
+def check_field_required(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -167,7 +181,7 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
     check_type = "field_required"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"missing_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -189,8 +203,8 @@ def check_field_required(model_name: str, field_name: str, quote_field_name: boo
     )


-def check_field_unique(model_name: str, field_name: str,
-    if quote_field_name:
+def check_field_unique(model_name: str, field_name: str, quoting_config: QuotingConfig = QuotingConfig()):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -198,7 +212,7 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
     check_type = "field_unique"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"duplicate_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -220,8 +234,10 @@ def check_field_unique(model_name: str, field_name: str, quote_field_name: bool
     )


-def check_field_min_length(
-
+def check_field_min_length(
+    model_name: str, field_name: str, min_length: int, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -229,7 +245,7 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
     check_type = "field_min_length"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -252,8 +268,10 @@ def check_field_min_length(model_name: str, field_name: str, min_length: int, qu
     )


-def check_field_max_length(
-
+def check_field_max_length(
+    model_name: str, field_name: str, max_length: int, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -261,7 +279,7 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
     check_type = "field_max_length"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -284,8 +302,10 @@ def check_field_max_length(model_name: str, field_name: str, max_length: int, qu
     )


-def check_field_minimum(
-
+def check_field_minimum(
+    model_name: str, field_name: str, minimum: int, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -293,7 +313,7 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
     check_type = "field_minimum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -316,8 +336,10 @@ def check_field_minimum(model_name: str, field_name: str, minimum: int, quote_fi
     )


-def check_field_maximum(
-
+def check_field_maximum(
+    model_name: str, field_name: str, maximum: int, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -325,7 +347,7 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
     check_type = "field_maximum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -348,8 +370,10 @@ def check_field_maximum(model_name: str, field_name: str, maximum: int, quote_fi
     )


-def check_field_not_equal(
-
+def check_field_not_equal(
+    model_name: str, field_name: str, value: int, quoting_config: QuotingConfig = QuotingConfig()
+):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -357,7 +381,7 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
     check_type = "field_not_equal"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -380,8 +404,8 @@ def check_field_not_equal(model_name: str, field_name: str, value: int, quote_fi
     )


-def check_field_enum(model_name: str, field_name: str, enum: list,
-    if quote_field_name:
+def check_field_enum(model_name: str, field_name: str, enum: list, quoting_config: QuotingConfig = QuotingConfig()):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -389,7 +413,7 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
     check_type = "field_enum"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -412,8 +436,8 @@ def check_field_enum(model_name: str, field_name: str, enum: list, quote_field_n
     )


-def check_field_regex(model_name: str, field_name: str, pattern: str,
-    if quote_field_name:
+def check_field_regex(model_name: str, field_name: str, pattern: str, quoting_config: QuotingConfig = QuotingConfig()):
+    if quoting_config.quote_field_name:
         field_name_for_soda = f'"{field_name}"'
     else:
         field_name_for_soda = field_name

@@ -421,7 +445,7 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
     check_type = "field_regex"
     check_key = f"{model_name}__{field_name}__{check_type}"
     sodacl_check_dict = {
-        checks_for(model_name,
+        checks_for(model_name, quoting_config.quote_model_name): [
             {
                 f"invalid_count({field_name_for_soda}) = 0": {
                     "name": check_key,

@@ -444,7 +468,9 @@ def check_field_regex(model_name: str, field_name: str, pattern: str, quote_fiel
     )


-def check_quality_list(
+def check_quality_list(
+    model_name, field_name, quality_list: List[Quality], quoting_config: QuotingConfig = QuotingConfig()
+) -> List[Check]:
     checks: List[Check] = []

     count = 0

@@ -457,15 +483,20 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
             check_key = f"{model_name}__{field_name}__quality_sql_{count}"
             check_type = "model_quality_sql"
             threshold = to_sodacl_threshold(quality)
-            query = prepare_query(quality, model_name, field_name)
+            query = prepare_query(quality, model_name, field_name, quoting_config)
             if query is None:
                 logger.warning(f"Quality check {check_key} has no query")
                 continue
             if threshold is None:
                 logger.warning(f"Quality check {check_key} has no valid threshold")
                 continue
+
+            if quoting_config.quote_model_name:
+                model_name_for_soda = f'"{model_name}"'
+            else:
+                model_name_for_soda = model_name
             sodacl_check_dict = {
-                f"checks for {
+                f"checks for {model_name_for_soda}": [
                 {
                     f"{check_key} {threshold}": {
                         f"{check_key} query": query,

@@ -493,7 +524,9 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]) -> L
     return checks


-def prepare_query(
+def prepare_query(
+    quality: Quality, model_name: str, field_name: str = None, quoting_config: QuotingConfig = QuotingConfig()
+) -> str | None:
     if quality.query is None:
         return None
     if quality.query == "":

@@ -501,14 +534,24 @@ def prepare_query(quality: Quality, model_name: str, field_name: str = None) ->

     query = quality.query

-
-
-
+    if quoting_config.quote_field_name:
+        field_name_for_soda = f'"{field_name}"'
+    else:
+        field_name_for_soda = field_name
+
+    if quoting_config.quote_model_name:
+        model_name_for_soda = f'"{model_name}"'
+    else:
+        model_name_for_soda = model_name
+
+    query = re.sub(r'["\']?\{model}["\']?', model_name_for_soda, query)
+    query = re.sub(r'["\']?{schema}["\']?', model_name_for_soda, query)
+    query = re.sub(r'["\']?{table}["\']?', model_name_for_soda, query)

     if field_name is not None:
-        query =
-        query =
-        query =
+        query = re.sub(r'["\']?{field}["\']?', field_name_for_soda, query)
+        query = re.sub(r'["\']?{column}["\']?', field_name_for_soda, query)
+        query = re.sub(r'["\']?{property}["\']?', field_name_for_soda, query)

     return query
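Taken together, QuotingConfig quotes model and field identifiers for engines that need it (postgres, sqlserver), and prepare_query strips any quotes already around the {model}/{field} placeholders before substituting. A self-contained sketch of that substitution, reusing the dataclass and two of the re.sub patterns from the hunks above (the sample query is invented):

import re
from dataclasses import dataclass


@dataclass
class QuotingConfig:
    quote_field_name: bool = False
    quote_model_name: bool = False


def substitute(query: str, model_name: str, field_name: str, cfg: QuotingConfig) -> str:
    model = f'"{model_name}"' if cfg.quote_model_name else model_name
    field = f'"{field_name}"' if cfg.quote_field_name else field_name
    # optional surrounding quotes are consumed along with the placeholder
    query = re.sub(r'["\']?\{model}["\']?', model, query)
    query = re.sub(r'["\']?{field}["\']?', field, query)
    return query


query = "SELECT COUNT(*) FROM '{model}' WHERE {field} IS NULL"
print(substitute(query, "orders", "order id", QuotingConfig(True, True)))
# SELECT COUNT(*) FROM "orders" WHERE "order id" IS NULL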
datacontract/engines/data_contract_test.py
CHANGED

@@ -1,5 +1,9 @@
+import atexit
+import os
+import tempfile
 import typing

+import requests
 from duckdb.duckdb import DuckDBPyConnection

 from datacontract.engines.data_contract_checks import create_checks

@@ -46,6 +50,9 @@ def execute_data_contract_test(
     run.outputPortId = server.outputPortId
     run.server = server_name

+    if server.type == "api":
+        server = process_api_response(run, server)
+
     run.checks.extend(create_checks(data_contract_specification, server))

     # TODO check server is supported type for nicer error messages

@@ -74,3 +81,33 @@ def get_server(data_contract_specification: DataContractSpecification, server_na
         server_name = list(data_contract_specification.servers.keys())[0]
     server = data_contract_specification.servers.get(server_name)
     return server
+
+
+def process_api_response(run, server):
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
+    atexit.register(tmp_dir.cleanup)
+    headers = {}
+    if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
+        headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
+    try:
+        response = requests.get(server.location, headers=headers)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise DataContractException(
+            type="connection",
+            name="API server connection error",
+            result=ResultEnum.error,
+            reason=f"Failed to fetch API response from {server.location}: {e}",
+            engine="datacontract",
+        )
+    with open(f"{tmp_dir.name}/api_response.json", "w") as f:
+        f.write(response.text)
+    run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
+    server = Server(
+        type="local",
+        format="json",
+        path=f"{tmp_dir.name}/api_response.json",
+        dataProductId=server.dataProductId,
+        outputPortId=server.outputPortId,
+    )
+    return server
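The new api server type is handled by downloading the response once and rewriting the server to a local JSON file before checks run. A reduced sketch of the fetch step (the env var and temp-dir prefix come from the hunk; the explicit timeout is an addition for the sketch, not in the diff):

import atexit
import os
import tempfile

import requests


def fetch_api_payload(url: str) -> str:
    """Download an API response to a temp file and return the file path."""
    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
    atexit.register(tmp_dir.cleanup)  # keep the file alive until the process exits
    headers = {}
    token = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
    if token is not None:
        headers["Authorization"] = token
    response = requests.get(url, headers=headers, timeout=30)  # timeout is an assumption
    response.raise_for_status()
    path = os.path.join(tmp_dir.name, "api_response.json")
    with open(path, "w") as f:
        f.write(response.text)
    return path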
datacontract/engines/fastjsonschema/check_jsonschema.py
CHANGED

@@ -159,6 +159,14 @@ def process_json_file(run, schema, model_name, validate, file, delimiter):

 def process_local_file(run, server, schema, model_name, validate):
     path = server.path
+    if not path:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason="For server with type 'local', a 'path' must be defined.",
+            engine="datacontract",
+        )
     if "{model}" in path:
         path = path.format(model=model_name)
datacontract/engines/soda/check_soda_execute.py
CHANGED

@@ -2,6 +2,8 @@ import logging
 import typing
 import uuid

+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+
 if typing.TYPE_CHECKING:
     from pyspark.sql import SparkSession

@@ -106,6 +108,10 @@ def check_soda_execute(
         soda_configuration_str = to_trino_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)

     else:
         run.checks.append(
datacontract/engines/soda/connections/athena.py
ADDED

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["aws_session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
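For illustration, a sketch of the Soda configuration YAML this function would emit for a hypothetical server (all values invented; SimpleNamespace stands in for the real Server model):

from types import SimpleNamespace

import yaml

server = SimpleNamespace(type="athena", schema_="sales", stagingDir="s3://my-bucket/athena-results/")
data_source = {
    "type": "athena",
    "access_key_id": "AKIA...",   # placeholder for DATACONTRACT_S3_ACCESS_KEY_ID
    "secret_access_key": "...",   # placeholder for DATACONTRACT_S3_SECRET_ACCESS_KEY
    "schema": server.schema_,
    "staging_dir": server.stagingDir,
}
print(yaml.dump({f"data_source {server.type}": data_source}))
# data_source athena:
#   access_key_id: AKIA...
#   schema: sales
#   secret_access_key: '...'
#   staging_dir: s3://my-bucket/athena-results/
#   type: athena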
datacontract/engines/soda/connections/duckdb_connection.py
CHANGED

@@ -71,6 +71,9 @@ def get_duckdb_connection(
     elif server.format == "delta":
         con.sql("update extensions;")  # Make sure we have the latest delta extension
         con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
     return con
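The added lines log the view's schema right after it is created. The same introspection can be run standalone against an in-memory DuckDB (table name and columns invented; fetchdf needs pandas installed):

import duckdb

con = duckdb.connect()  # in-memory database
con.sql("CREATE TABLE orders (order_id INTEGER, amount DECIMAL(10, 2))")

# Same introspection call the delta branch now logs
table_info = con.sql("PRAGMA table_info('orders');").fetchdf()
if table_info is not None and not table_info.empty:
    print(f"DuckDB Table Info: {table_info.to_string(index=False)}")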
datacontract/export/avro_converter.py
CHANGED

@@ -44,12 +44,18 @@ def to_avro_field(field, field_name):
     avro_type = to_avro_type(field, field_name)
     avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]

-
-
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
             "type": "enum",
             "name": field.title,
             "symbols": field.enum,
         }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]

     if field.config:
         if "avroDefault" in field.config:

@@ -77,6 +83,10 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     if "avroType" in field.config:
         return field.config["avroType"]

+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]: