datacontract-cli 0.10.14__py3-none-any.whl → 0.10.16__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registries and is provided for informational purposes only.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/breaking/breaking.py +229 -11
- datacontract/breaking/breaking_rules.py +24 -0
- datacontract/catalog/catalog.py +1 -1
- datacontract/cli.py +100 -33
- datacontract/data_contract.py +26 -4
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
- datacontract/engines/soda/check_soda_execute.py +7 -5
- datacontract/engines/soda/connections/duckdb.py +1 -0
- datacontract/engines/soda/connections/kafka.py +12 -12
- datacontract/export/avro_idl_converter.py +1 -2
- datacontract/export/bigquery_converter.py +4 -3
- datacontract/export/data_caterer_converter.py +1 -1
- datacontract/export/dbml_converter.py +2 -4
- datacontract/export/dbt_converter.py +45 -39
- datacontract/export/exporter.py +2 -1
- datacontract/export/exporter_factory.py +7 -2
- datacontract/export/go_converter.py +3 -2
- datacontract/export/great_expectations_converter.py +202 -40
- datacontract/export/html_export.py +1 -1
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +3 -2
- datacontract/export/odcs_v2_exporter.py +1 -1
- datacontract/export/odcs_v3_exporter.py +44 -30
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +1 -1
- datacontract/export/rdf_converter.py +4 -5
- datacontract/export/sodacl_converter.py +9 -4
- datacontract/export/spark_converter.py +7 -6
- datacontract/export/sql_converter.py +1 -2
- datacontract/export/sqlalchemy_converter.py +1 -2
- datacontract/export/terraform_converter.py +1 -1
- datacontract/imports/avro_importer.py +1 -1
- datacontract/imports/bigquery_importer.py +1 -1
- datacontract/imports/dbml_importer.py +2 -2
- datacontract/imports/dbt_importer.py +80 -15
- datacontract/imports/glue_importer.py +5 -3
- datacontract/imports/iceberg_importer.py +17 -7
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +7 -1
- datacontract/imports/jsonschema_importer.py +3 -2
- datacontract/imports/odcs_v2_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +7 -2
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/spark_importer.py +2 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/imports/unity_importer.py +3 -3
- datacontract/integration/opentelemetry.py +0 -1
- datacontract/lint/lint.py +2 -1
- datacontract/lint/linters/description_linter.py +1 -0
- datacontract/lint/linters/example_model_linter.py +1 -0
- datacontract/lint/linters/field_pattern_linter.py +1 -0
- datacontract/lint/linters/field_reference_linter.py +1 -0
- datacontract/lint/linters/notice_period_linter.py +1 -0
- datacontract/lint/linters/quality_schema_linter.py +1 -0
- datacontract/lint/linters/valid_constraints_linter.py +1 -0
- datacontract/lint/resolve.py +7 -3
- datacontract/lint/schema.py +1 -1
- datacontract/model/data_contract_specification.py +13 -6
- datacontract/model/run.py +21 -12
- datacontract/templates/index.html +6 -6
- datacontract/web.py +2 -3
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/METADATA +163 -60
- datacontract_cli-0.10.16.dist-info/RECORD +106 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/WHEEL +1 -1
- datacontract_cli-0.10.14.dist-info/RECORD +0 -103
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/top_level.txt +0 -0

datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -1,31 +1,114 @@
 import json
 import logging
 import os
+import threading
+from typing import List, Optional

 import fastjsonschema
+from fastjsonschema import JsonSchemaValueException

 from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
 from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
-from datacontract.model.run import
+from datacontract.model.run import Check, Run

+# Thread-safe cache for primaryKey fields.
+_primary_key_cache = {}
+_cache_lock = threading.Lock()

-
+
+def get_primary_key_field(schema: dict, model_name: str) -> Optional[str]:
+    # Check cache first.
+    with _cache_lock:
+        cached_value = _primary_key_cache.get(model_name)
+        if cached_value is not None:
+            return cached_value
+
+    # Find primaryKey field.
+    fields = schema.get("properties", {})
+    for field_name, attributes in fields.items():
+        if attributes.get("primaryKey", False):
+            # Cache the result before returning.
+            with _cache_lock:
+                _primary_key_cache[model_name] = field_name
+            return field_name
+
+    # Return None if no primary key was found.
+    return None
+
+
+def get_primary_key_value(schema: dict, model_name: str, json_object: dict) -> Optional[str]:
+    # Get the `primaryKey` field.
+    primary_key_field = get_primary_key_field(schema, model_name)
+    if not primary_key_field:
+        return None
+
+    # Return the value of the `primaryKey` field in the JSON object.
+    return json_object.get(primary_key_field)
+
+
+def process_exceptions(run, exceptions: List[DataContractException]):
+    if not exceptions:
+        return
+
+    # Define the maximum number of errors to process (can be adjusted by defining an ENV variable).
     try:
-
-
+        error_limit = int(os.getenv("DATACONTRACT_MAX_ERRORS", 500))
+    except ValueError:
+        # Fallback to default if environment variable is invalid.
+        error_limit = 500
+
+    # Calculate the effective limit to avoid index out of range
+    limit = min(len(exceptions), error_limit)
+
+    # Add all exceptions up to the limit - 1 to `run.checks`.
+    DEFAULT_ERROR_MESSAGE = "An error occurred during validation phase. See the logs for more details."
+    run.checks.extend(
+        [
+            Check(
+                type=exception.type,
+                name=exception.name,
+                result=exception.result,
+                reason=exception.reason,
+                model=exception.model,
+                engine=exception.engine,
+                message=exception.message or DEFAULT_ERROR_MESSAGE,
+            )
+            for exception in exceptions[: limit - 1]
+        ]
+    )
+
+    # Raise the last exception within the limit.
+    last_exception = exceptions[limit - 1]
+    raise last_exception
+
+
+def validate_json_stream(
+    schema: dict, model_name: str, validate: callable, json_stream: list[dict]
+) -> List[DataContractException]:
+    logging.info(f"Validating JSON stream for model: '{model_name}'.")
+    exceptions: List[DataContractException] = []
+    for json_obj in json_stream:
+        try:
             validate(json_obj)
-
-
-
-
-
-
-
-
-
-
+        except JsonSchemaValueException as e:
+            logging.warning(f"Validation failed for JSON object with type: '{model_name}'.")
+            primary_key_value = get_primary_key_value(schema, model_name, json_obj)
+            exceptions.append(
+                DataContractException(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    result="failed",
+                    reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
+                    model=model_name,
+                    engine="jsonschema",
+                    message=e.message,
+                )
+            )
+    if not exceptions:
+        logging.info(f"All JSON objects in the stream passed validation for model: '{model_name}'.")
+    return exceptions


 def read_json_lines(file):
@@ -59,17 +142,22 @@ def read_json_file_content(file_content: str):
     yield json.loads(file_content)


-def process_json_file(run, model_name, validate, file, delimiter):
+def process_json_file(run, schema, model_name, validate, file, delimiter):
     if delimiter == "new_line":
         json_stream = read_json_lines(file)
     elif delimiter == "array":
         json_stream = read_json_array(file)
     else:
         json_stream = read_json_file(file)
-    validate_json_stream(model_name, validate, json_stream)

+    # Validate the JSON stream and collect exceptions.
+    exceptions = validate_json_stream(schema, model_name, validate, json_stream)

-
+    # Handle all errors from schema validation.
+    process_exceptions(run, exceptions)
+
+
+def process_local_file(run, server, schema, model_name, validate):
     path = server.path
     if "{model}" in path:
         path = path.format(model=model_name)
@@ -79,7 +167,7 @@ def process_local_file(run, server, model_name, validate):
     else:
         logging.info(f"Processing file {path}")
         with open(path, "r") as file:
-            process_json_file(run, model_name, validate, file, server.delimiter)
+            process_json_file(run, schema, model_name, validate, file, server.delimiter)


 def process_directory(run, path, server, model_name, validate):
@@ -94,7 +182,7 @@ def process_directory(run, path, server, model_name, validate):
     return success


-def process_s3_file(server, model_name, validate):
+def process_s3_file(run, server, schema, model_name, validate):
     s3_endpoint_url = server.endpointUrl
     s3_location = server.location
     if "{model}" in s3_location:
@@ -118,7 +206,11 @@ def process_s3_file(server, model_name, validate):
             engine="datacontract",
         )

-
+    # Validate the JSON stream and collect exceptions.
+    exceptions = validate_json_stream(schema, model_name, validate, json_stream)
+
+    # Handle all errors from schema validation.
+    process_exceptions(run, exceptions)


 def check_jsonschema(run: Run, data_contract: DataContractSpecification, server: Server):
@@ -155,9 +247,9 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:

     # Process files based on server type
     if server.type == "local":
-        process_local_file(run, server, model_name, validate)
+        process_local_file(run, server, schema, model_name, validate)
     elif server.type == "s3":
-        process_s3_file(server, model_name, validate)
+        process_s3_file(run, server, schema, model_name, validate)
     elif server.type == "gcs":
         run.checks.append(
             Check(
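
For context on the new error handling above: process_exceptions caps how many schema violations are recorded as checks on the run, reading the cap from the DATACONTRACT_MAX_ERRORS environment variable (default 500) and re-raising the last exception within that cap. A minimal sketch of just that cap arithmetic, using plain strings in place of the real Run/Check/DataContractException objects (cap_errors is a made-up name for illustration):

# Sketch of the error-cap behaviour with simplified stand-ins for the real classes.
import os

def cap_errors(errors: list[str]) -> tuple[list[str], str]:
    # Read the cap from DATACONTRACT_MAX_ERRORS, falling back to 500 on bad input.
    try:
        error_limit = int(os.getenv("DATACONTRACT_MAX_ERRORS", 500))
    except ValueError:
        error_limit = 500
    limit = min(len(errors), error_limit)
    # Everything up to limit - 1 is recorded; the last error within the limit is raised.
    recorded, raised = errors[: limit - 1], errors[limit - 1]
    return recorded, raised

recorded, raised = cap_errors([f"row {i} failed" for i in range(1000)])
print(len(recorded), raised)  # with the default cap: 499 recorded, "row 499 failed" raised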

datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -12,7 +12,7 @@ from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_co
 from datacontract.engines.soda.connections.trino import to_trino_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.model.data_contract_specification import DataContractSpecification, Server
-from datacontract.model.run import
+from datacontract.model.run import Check, Log, ResultEnum, Run


 def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
@@ -33,7 +33,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             Check(
                 type="general",
                 name="Check that format is supported",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Format {server.format} not yet supported by datacontract CLI",
                 engine="datacontract",
             )
@@ -93,7 +93,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             Check(
                 type="general",
                 name="Check that server type is supported",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not yet supported by datacontract CLI",
                 engine="datacontract-cli",
             )
@@ -176,9 +176,11 @@ def update_reason(check, c):
         if block["title"] == "Diagnostics":
             # Extract and print the 'text' value
             diagnostics_text = block["text"]
-            print(diagnostics_text)
+            # print(diagnostics_text)
             diagnostics_text_split = diagnostics_text.split(":icon-fail: ")
             if len(diagnostics_text_split) > 1:
                 check.reason = diagnostics_text_split[1].strip()
-                print(check.reason)
+                # print(check.reason)
             break  # Exit the loop once the desired block is found
+    if "fail" in c["diagnostics"]:
+        check.reason = f"Got: {c['diagnostics']['value']} Expected: {c['diagnostics']['fail']}"

datacontract/engines/soda/connections/kafka.py CHANGED
@@ -2,7 +2,7 @@ import logging
 import os

 from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException


@@ -69,8 +69,8 @@ def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Se

 def process_avro_format(df, model_name, model):
     try:
-        from pyspark.sql.functions import col, expr
         from pyspark.sql.avro.functions import from_avro
+        from pyspark.sql.functions import col, expr
     except ImportError as e:
         raise DataContractException(
             type="schema",
@@ -167,21 +167,21 @@ def to_struct_type(fields):
 def to_struct_field(field_name: str, field: Field):
     try:
         from pyspark.sql.types import (
-
-
-
+            ArrayType,
+            BinaryType,
+            BooleanType,
+            DataType,
+            DateType,
             DecimalType,
             DoubleType,
             IntegerType,
             LongType,
-            BooleanType,
-            TimestampType,
-            TimestampNTZType,
-            DateType,
-            BinaryType,
-            ArrayType,
             NullType,
-
+            StringType,
+            StructField,
+            StructType,
+            TimestampNTZType,
+            TimestampType,
         )
     except ImportError as e:
         raise DataContractException(

datacontract/export/avro_idl_converter.py CHANGED
@@ -3,12 +3,11 @@ from dataclasses import dataclass
 from enum import Enum
 from io import StringIO

+from datacontract.export.exporter import Exporter
 from datacontract.lint.resolve import inline_definitions_into_data_contract
 from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException

-from datacontract.export.exporter import Exporter
-

 class AvroPrimitiveType(Enum):
     int = "int"

datacontract/export/bigquery_converter.py CHANGED
@@ -2,10 +2,9 @@ import json
 import logging
 from typing import Dict, List

-from datacontract.model.data_contract_specification import Model, Field, Server
-from datacontract.model.exceptions import DataContractException
-
 from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.model.data_contract_specification import Field, Model, Server
+from datacontract.model.exceptions import DataContractException


 class BigQueryExporter(Exporter):
@@ -109,6 +108,8 @@ def map_type_to_bigquery(field: Field) -> str:
         return "NUMERIC"
     elif field_type.lower() == "double":
         return "BIGNUMERIC"
+    elif field_type.lower() in ["object", "record"] and not field.fields:
+        return "JSON"
     elif field_type.lower() in ["object", "record", "array"]:
         return "RECORD"
     elif field_type.lower() == "struct":

datacontract/export/data_caterer_converter.py CHANGED
@@ -3,7 +3,7 @@ from typing import Dict
 import yaml

 from datacontract.export.exporter import Exporter
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server


 class DataCatererExporter(Exporter):

datacontract/export/dbml_converter.py CHANGED
@@ -3,13 +3,11 @@ from importlib.metadata import version
 from typing import Tuple

 import pytz
-from datacontract.model.exceptions import DataContractException

 import datacontract.model.data_contract_specification as spec
-from datacontract.export.sql_type_converter import convert_to_sql_type
-
-
 from datacontract.export.exporter import Exporter
+from datacontract.export.sql_type_converter import convert_to_sql_type
+from datacontract.model.exceptions import DataContractException


 class DbmlExporter(Exporter):

datacontract/export/dbt_converter.py CHANGED
@@ -1,11 +1,10 @@
-from typing import Dict
+from typing import Dict, Optional

 import yaml

-from datacontract.export.sql_type_converter import convert_to_sql_type
-from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
-
 from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.export.sql_type_converter import convert_to_sql_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model


 class DbtExporter(Exporter):
@@ -53,14 +52,14 @@ def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name
         # TODO escape SQL reserved key words, probably dependent on server type
         columns.append(field_name)
     return f"""
-    select
+    select
     {", ".join(columns)}
     from {{{{ source('{id}', '{model_name}') }}}}
     """


 def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
-    source = {"name": data_contract_spec.id
+    source = {"name": data_contract_spec.id}
     dbt = {
         "version": 2,
         "sources": [source],
@@ -70,24 +69,31 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
     if data_contract_spec.info.description is not None:
         source["description"] = data_contract_spec.info.description
     found_server = data_contract_spec.servers.get(server)
+    adapter_type = None
     if found_server is not None:
-
-
+        adapter_type = found_server.type
+        if adapter_type == "bigquery":
+            source["database"] = found_server.project
+            source["schema"] = found_server.dataset
+        else:
+            source["database"] = found_server.database
+            source["schema"] = found_server.schema_

+    source["tables"] = []
     for model_key, model_value in data_contract_spec.models.items():
-        dbt_model = _to_dbt_source_table(model_key, model_value)
+        dbt_model = _to_dbt_source_table(model_key, model_value, adapter_type)
         source["tables"].append(dbt_model)
     return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)


-def _to_dbt_source_table(model_key, model_value: Model) -> dict:
+def _to_dbt_source_table(model_key, model_value: Model, adapter_type: Optional[str]) -> dict:
     dbt_model = {
         "name": model_key,
     }

     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, False,
+    columns = _to_columns(model_value.fields, False, adapter_type)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model
@@ -108,7 +114,7 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
-    columns = _to_columns(model_value.fields, _supports_constraints(model_type),
+    columns = _to_columns(model_value.fields, _supports_constraints(model_type), None)
     if columns:
         dbt_model["columns"] = columns
     return dbt_model
@@ -131,48 +137,47 @@ def _supports_constraints(model_type):
     return model_type == "table" or model_type == "incremental"


-def _to_columns(fields: Dict[str, Field], supports_constraints: bool,
+def _to_columns(fields: Dict[str, Field], supports_constraints: bool, adapter_type: Optional[str]) -> list:
     columns = []
     for field_name, field in fields.items():
-        column = _to_column(field, supports_constraints,
-        column["name"] = field_name
+        column = _to_column(field_name, field, supports_constraints, adapter_type)
         columns.append(column)
     return columns


-def _to_column(field: Field, supports_constraints: bool,
-    column = {}
-
+def _to_column(field_name: str, field: Field, supports_constraints: bool, adapter_type: Optional[str]) -> dict:
+    column = {"name": field_name}
+    adapter_type = adapter_type or "snowflake"
+    dbt_type = convert_to_sql_type(field, adapter_type)
+
+    column["data_tests"] = []
     if dbt_type is not None:
-
-
-
-
-
-        )
+        column["data_type"] = dbt_type
+    else:
+        column["data_tests"].append(
+            {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
+        )
     if field.description is not None:
         column["description"] = field.description
     if field.required:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
-            column
+            column["data_tests"].append("not_null")
     if field.unique:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
-            column
+            column["data_tests"].append("unique")
     if field.enum is not None and len(field.enum) > 0:
-        column
+        column["data_tests"].append({"accepted_values": {"values": field.enum}})
     if field.minLength is not None or field.maxLength is not None:
         length_test = {}
         if field.minLength is not None:
             length_test["min_value"] = field.minLength
         if field.maxLength is not None:
             length_test["max_value"] = field.maxLength
-        column.
-            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test})
     if field.pii is not None:
         column.setdefault("meta", {})["pii"] = field.pii
     if field.classification is not None:
@@ -181,9 +186,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         column.setdefault("tags", []).extend(field.tags)
     if field.pattern is not None:
         # Beware, the data contract pattern is a regex, not a like pattern
-        column.
-            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
-        )
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}})
     if (
         field.minimum is not None
         or field.maximum is not None
@@ -195,7 +198,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
             range_test["min_value"] = field.minimum
         if field.maximum is not None:
             range_test["max_value"] = field.maximum
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     elif (
         field.exclusiveMinimum is not None
         or field.exclusiveMaximum is not None
@@ -208,18 +211,18 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         if field.exclusiveMaximum is not None:
             range_test["max_value"] = field.exclusiveMaximum
         range_test["strictly"] = True
-        column
+        column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     else:
         if field.minimum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
             )
         if field.maximum is not None:
-            column
+            column["data_tests"].append(
                 {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
             )
         if field.exclusiveMinimum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "min_value": field.exclusiveMinimum,
@@ -228,7 +231,7 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                 }
             )
         if field.exclusiveMaximum is not None:
-            column
+            column["data_tests"].append(
                 {
                     "dbt_expectations.expect_column_values_to_be_between": {
                         "max_value": field.exclusiveMaximum,
@@ -237,5 +240,8 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
                 }
            )

+    if not column["data_tests"]:
+        column.pop("data_tests")
+
     # TODO: all constraints
     return column

datacontract/export/exporter.py CHANGED
@@ -1,6 +1,6 @@
+import typing
 from abc import ABC, abstractmethod
 from enum import Enum
-import typing

 from datacontract.model.data_contract_specification import DataContractSpecification

@@ -40,6 +40,7 @@ class ExportFormat(str, Enum):
     sqlalchemy = "sqlalchemy"
     data_caterer = "data-caterer"
     dcs = "dcs"
+    iceberg = "iceberg"

     @classmethod
     def get_supported_formats(cls):

datacontract/export/exporter_factory.py CHANGED
@@ -1,6 +1,7 @@
 import importlib
 import sys
-
+
+from datacontract.export.exporter import Exporter, ExportFormat


 class ExporterFactory:
@@ -117,7 +118,7 @@ exporter_factory.register_lazy_exporter(
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.great_expectations,
     module_path="datacontract.export.great_expectations_converter",
-    class_name="
+    class_name="GreatExpectationsExporter",
 )

 exporter_factory.register_lazy_exporter(
@@ -167,3 +168,7 @@ exporter_factory.register_lazy_exporter(
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.dcs, module_path="datacontract.export.dcs_exporter", class_name="DcsExporter"
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter"
+)