datacontract-cli 0.10.23__py3-none-any.whl → 0.10.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/__init__.py +13 -0
- datacontract/api.py +3 -3
- datacontract/catalog/catalog.py +2 -2
- datacontract/cli.py +1 -1
- datacontract/data_contract.py +5 -3
- datacontract/engines/data_contract_test.py +13 -4
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +16 -3
- datacontract/engines/soda/connections/duckdb_connection.py +61 -5
- datacontract/engines/soda/connections/kafka.py +3 -2
- datacontract/export/avro_converter.py +8 -1
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/great_expectations_converter.py +49 -2
- datacontract/export/odcs_v3_exporter.py +162 -136
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/spark_converter.py +1 -1
- datacontract/imports/avro_importer.py +30 -5
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +850 -0
- datacontract/imports/importer.py +5 -2
- datacontract/imports/importer_factory.py +10 -0
- datacontract/imports/odcs_v3_importer.py +226 -127
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/lint/linters/description_linter.py +1 -3
- datacontract/lint/linters/field_reference_linter.py +1 -2
- datacontract/lint/linters/notice_period_linter.py +2 -2
- datacontract/lint/linters/valid_constraints_linter.py +3 -3
- datacontract/lint/resolve.py +23 -8
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/run.py +3 -0
- datacontract/output/__init__.py +0 -0
- datacontract/templates/datacontract.html +2 -1
- datacontract/templates/index.html +2 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/METADATA +305 -195
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/RECORD +40 -38
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/WHEEL +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/model/data_contract_specification.py +0 -327
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info/licenses}/LICENSE +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.25.dist-info}/top_level.txt +0 -0
datacontract/__init__.py
CHANGED
@@ -0,0 +1,13 @@
+# Configuration so that yaml.safe_dump dumps strings with line breaks with yaml literal
+import yaml
+
+yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
+
+
+def repr_str(dumper, data):
+    if "\n" in data:
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+    return dumper.org_represent_str(data)
+
+
+yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
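For context, a minimal sketch (not part of the diff) of what this new representer changes: once the package is imported, yaml.safe_dump emits multi-line strings as YAML literal blocks instead of quoted scalars.

import yaml

import datacontract  # importing the package registers the SafeDumper representer shown above

doc = {"description": "First line.\nSecond line."}
print(yaml.safe_dump(doc))
# description: |-
#   First line.
#   Second line.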
datacontract/api.py
CHANGED
@@ -162,7 +162,7 @@ async def test(
     server: Annotated[
         str | None,
         Query(
-
+            examples=["production"],
             description="The server name to test. Optional, if there is only one server.",
         ),
     ] = None,
@@ -191,7 +191,7 @@ async def lint(
     schema: Annotated[
         str | None,
         Query(
-
+            examples=["https://datacontract.com/datacontract.schema.json"],
             description="The schema to use for validation. This must be a URL.",
         ),
     ] = None,
@@ -220,7 +220,7 @@ def export(
     server: Annotated[
         str | None,
         Query(
-
+            examples=["production"],
             description="The server name to export. Optional, if there is only one server.",
         ),
     ] = None,
datacontract/catalog/catalog.py
CHANGED
@@ -19,7 +19,7 @@ def create_data_contract_html(contracts, file: Path, path: Path, schema: str):
     file_without_suffix = file.with_suffix(".html")
     html_filepath = path / file_without_suffix
     html_filepath.parent.mkdir(parents=True, exist_ok=True)
-    with open(html_filepath, "w") as f:
+    with open(html_filepath, "w", encoding="utf-8") as f:
        f.write(html)
     contracts.append(
         DataContractView(
@@ -42,7 +42,7 @@ class DataContractView:
 
 def create_index_html(contracts, path):
     index_filepath = path / "index.html"
-    with open(index_filepath, "w") as f:
+    with open(index_filepath, "w", encoding="utf-8") as f:
         # Load templates from templates folder
         package_loader = PackageLoader("datacontract", "templates")
         env = Environment(
datacontract/cli.py
CHANGED
@@ -244,7 +244,7 @@ def import_(
     ] = None,
     source: Annotated[
         Optional[str],
-        typer.Option(help="The path to the file
+        typer.Option(help="The path to the file that should be imported."),
     ] = None,
     dialect: Annotated[
         Optional[str],
datacontract/data_contract.py
CHANGED
@@ -4,6 +4,8 @@ import typing
 if typing.TYPE_CHECKING:
     from pyspark.sql import SparkSession
 
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.breaking.breaking import (
     info_breaking_changes,
     models_breaking_changes,
@@ -22,7 +24,6 @@ from datacontract.lint.linters.description_linter import DescriptionLinter
 from datacontract.lint.linters.field_pattern_linter import FieldPatternLinter
 from datacontract.lint.linters.field_reference_linter import FieldReferenceLinter
 from datacontract.lint.linters.notice_period_linter import NoticePeriodLinter
-from datacontract.lint.linters.quality_schema_linter import QualityUsesSchemaLinter
 from datacontract.lint.linters.valid_constraints_linter import ValidFieldConstraintsLinter
 from datacontract.model.data_contract_specification import DataContractSpecification
 from datacontract.model.exceptions import DataContractException
@@ -39,6 +40,7 @@ class DataContract:
         server: str = None,
         publish_url: str = None,
         spark: "SparkSession" = None,
+        duckdb_connection: DuckDBPyConnection = None,
         inline_definitions: bool = True,
         inline_quality: bool = True,
         ssl_verification: bool = True,
@@ -50,11 +52,11 @@ class DataContract:
         self._server = server
         self._publish_url = publish_url
         self._spark = spark
+        self._duckdb_connection = duckdb_connection
         self._inline_definitions = inline_definitions
         self._inline_quality = inline_quality
         self._ssl_verification = ssl_verification
         self.all_linters = {
-            QualityUsesSchemaLinter(),
             FieldPatternLinter(),
             FieldReferenceLinter(),
             NoticePeriodLinter(),
@@ -146,7 +148,7 @@ class DataContract:
                 inline_quality=self._inline_quality,
             )
 
-            execute_data_contract_test(data_contract, run, self._server, self._spark)
+            execute_data_contract_test(data_contract, run, self._server, self._spark, self._duckdb_connection)
 
         except DataContractException as e:
             run.checks.append(
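For illustration, a minimal usage sketch of the new duckdb_connection parameter; the contract file name and the connection setup are assumptions, not part of the diff.

import duckdb

from datacontract.data_contract import DataContract

# Hypothetical: hand the engine a caller-managed DuckDB connection (for example one
# with extensions or S3 secrets already configured) instead of the implicit
# in-memory connection it would otherwise create.
con = duckdb.connect(database=":memory:")

run = DataContract(
    data_contract_file="datacontract.yaml",  # assumed local contract file
    duckdb_connection=con,
).test()
print(run.result)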
datacontract/engines/data_contract_test.py
CHANGED
@@ -1,5 +1,7 @@
 import typing
 
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.data_contract_checks import create_checks
 
 if typing.TYPE_CHECKING:
@@ -10,7 +12,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.model.data_contract_specification import DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import ResultEnum, Run
 
@@ -20,6 +22,7 @@ def execute_data_contract_test(
     run: Run,
     server_name: str = None,
     spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
         raise DataContractException(
@@ -29,6 +32,12 @@ def execute_data_contract_test(
             reason="Models block is missing. Skip executing tests.",
             engine="datacontract",
         )
+    if (
+        server_name is None
+        and data_contract_specification.servers is not None
+        and len(data_contract_specification.servers) > 0
+    ):
+        server_name = list(data_contract_specification.servers.keys())[0]
     server = get_server(data_contract_specification, server_name)
     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
     run.dataContractId = data_contract_specification.id
@@ -43,10 +52,10 @@ def execute_data_contract_test(
     # TODO check server credentials are complete for nicer error messages
     if server.format == "json" and server.type != "kafka":
         check_jsonschema(run, data_contract_specification, server)
-    check_soda_execute(run, data_contract_specification, server, spark)
+    check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)
 
 
-def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
     """Get the server configuration from the data contract specification.
 
     Args:
@@ -59,7 +68,7 @@ def get_server(data_contract_specification: DataContractSpecification, server_na
 
     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)
 
-    if server_name:
+    if server_name is not None:
         server = data_contract_specification.servers.get(server_name)
     else:
         server_name = list(data_contract_specification.servers.keys())[0]
datacontract/engines/fastjsonschema/s3/s3_read_files.py
CHANGED
@@ -2,6 +2,7 @@ import logging
 import os
 
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
 
 
 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="s3 extra missing",
-            reason="Install the extra
+            reason="Install the extra s3 to use s3",
             engine="datacontract",
             original_exception=e,
         )
datacontract/engines/soda/check_soda_execute.py
CHANGED
@@ -1,6 +1,12 @@
 import logging
+import typing
 import uuid
 
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
@@ -14,7 +20,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
-def check_soda_execute(
+def check_soda_execute(
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
+):
     from soda.common.config_helper import ConfigHelper
 
     ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +42,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-            con = get_duckdb_connection(data_contract, server, run)
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
@@ -62,7 +74,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
             run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
datacontract/engines/soda/connections/duckdb_connection.py
CHANGED
@@ -1,14 +1,24 @@
 import os
-from typing import Any
+from typing import Any, Dict
 
 import duckdb
 
-from datacontract.export.
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
 from datacontract.model.run import Run
 
 
-def get_duckdb_connection(
-
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
     path: str = ""
     if server.type == "local":
         path = server.path
@@ -33,9 +43,16 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 json_format = "newline_delimited"
             elif server.delimiter == "array":
                 json_format = "array"
-
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
                 """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
         elif server.format == "parquet":
             con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -67,6 +84,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None:
     return columns
 
 
+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
        """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
 def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
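A rough illustration of the nested views that add_nested_views creates; the model and field names here are hypothetical. For a model orders with an array-of-objects field line_items, a view orders__line_items is created by unnesting the array, so nested fields can be checked like ordinary columns.

import duckdb

# Hypothetical model "orders" with an array-of-objects field "line_items".
con = duckdb.connect(database=":memory:")
con.sql("""CREATE VIEW "orders" AS
           SELECT 1 AS order_id, [{'sku': 'A-1', 'qty': 2}] AS line_items""")

# Mirrors the statement add_nested_views would issue for this field.
con.sql("""CREATE VIEW IF NOT EXISTS "orders__line_items" AS
           SELECT unnest(line_items, max_depth := 2) AS line_items FROM "orders" """)
print(con.sql('SELECT * FROM "orders__line_items"').fetchall())  # e.g. [('A-1', 2)]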
datacontract/engines/soda/connections/kafka.py
CHANGED
@@ -6,6 +6,7 @@ import tempfile
 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
 
 
 def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="pyspark is missing",
             reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
@@ -33,7 +34,7 @@ def create_spark_session():
         .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.
+            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,org.apache.spark:spark-avro_2.12:3.5.5",
         )
         .getOrCreate()
     )
datacontract/export/avro_converter.py
CHANGED
@@ -91,7 +91,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.precision is not None:
             typeVal["precision"] = field.precision
         return typeVal
-    elif field.type in ["float"
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -107,6 +109,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
         if field.config is not None and "namespace" in field.config:
             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
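A small hedged sketch of the new map handling; the field definition is hypothetical and assumes the Field model accepts type and config keyword arguments, which is how the converter reads them.

from datacontract.export.avro_converter import to_avro_type
from datacontract.model.data_contract_specification import Field

# Hypothetical map field: the Avro "values" type is taken from field.config,
# falling back to "bytes" when no values type is configured (per the diff above).
tags = Field(type="map", config={"values": "string"})
print(to_avro_type(tags, "tags"))               # {'type': 'map', 'values': 'string'}
print(to_avro_type(Field(type="map"), "tags"))  # 'bytes'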
datacontract/export/bigquery_converter.py
CHANGED
@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
     elif field_type.lower() == "date":
         return "DATE"
     elif field_type.lower() == "timestamp_ntz":
-        return "
+        return "DATETIME"
     elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
     elif field_type.lower() == "double":
datacontract/export/duckdb_type_converter.py
CHANGED
@@ -0,0 +1,57 @@
+from typing import Dict
+
+from datacontract.model.data_contract_specification import Field
+
+
+# https://duckdb.org/docs/data/csv/overview.html
+# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+def convert_to_duckdb_csv_type(field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["string", "varchar", "text"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["timestamp", "timestamp_tz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["timestamp_ntz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["date"]:
+        return "DATE"
+    if datacontract_type.lower() in ["time"]:
+        return "TIME"
+    if datacontract_type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "VARCHAR"
+    if datacontract_type.lower() in ["float", "double"]:
+        return "DOUBLE"
+    if datacontract_type.lower() in ["integer", "int", "long", "bigint"]:
+        return "BIGINT"
+    if datacontract_type.lower() in ["boolean"]:
+        return "BOOLEAN"
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["bytes"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["null"]:
+        return "SQLNULL"
+    return "VARCHAR"
+
+
+def convert_to_duckdb_json_type(field: Field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return convert_to_duckdb_json_type(field.items) + "[]"  # type: ignore
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        return convert_to_duckdb_object(field.fields)
+    return convert_to_duckdb_csv_type(field)
+
+
+def convert_to_duckdb_object(fields: Dict[str, Field]):
+    columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()]
+    return f"STRUCT({', '.join(columns)})"
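A hedged usage sketch of the new JSON type converter; the field definition is hypothetical and assumes Field accepts items and fields keyword arguments, as the converter reads those attributes.

from datacontract.export.duckdb_type_converter import convert_to_duckdb_json_type
from datacontract.model.data_contract_specification import Field

# Hypothetical field: an array of objects with two nested scalar fields.
line_items = Field(
    type="array",
    items=Field(type="object", fields={"sku": Field(type="string"), "qty": Field(type="int")}),
)
print(convert_to_duckdb_json_type(line_items))
# expected (per the converter above): STRUCT("sku" VARCHAR, "qty" BIGINT)[]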
datacontract/export/great_expectations_converter.py
CHANGED
@@ -19,6 +19,7 @@ from datacontract.export.spark_converter import to_spark_data_type
 from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
+    DeprecatedQuality,
     Field,
     Quality,
 )
@@ -91,8 +92,14 @@ def to_great_expectations(
             model_key=model_key, contract_version=data_contract_spec.info.version
         )
         model_value = data_contract_spec.models.get(model_key)
-
+
+        # Support for Deprecated Quality
+        quality_checks = get_deprecated_quality_checks(data_contract_spec.quality)
+
+        expectations.extend(get_quality_checks(model_value.quality))
+
         expectations.extend(model_to_expectations(model_value.fields, engine, sql_server_type))
+
         expectations.extend(checks_to_expectations(quality_checks, model_key))
         model_expectation_suite = to_suite(expectations, expectation_suite_name)
 
@@ -135,6 +142,7 @@ def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_serv
     add_column_order_exp(fields, expectations)
     for field_name, field in fields.items():
         add_field_expectations(field_name, field, expectations, engine, sql_server_type)
+        expectations.extend(get_quality_checks(field.quality, field_name))
     return expectations
 
 
@@ -173,6 +181,8 @@ def add_field_expectations(
         expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength))
     if field.minimum is not None or field.maximum is not None:
         expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))
+    if field.enum is not None and len(field.enum) != 0:
+        expectations.append(to_column_enum_exp(field_name, field.enum))
 
     return expectations
 
@@ -266,7 +276,24 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
     }
 
 
-def
+def to_column_enum_exp(field_name, enum_list: List[str]) -> Dict[str, Any]:
+    """Creates a expect_column_values_to_be_in_set expectation.
+
+    Args:
+        field_name (str): The name of the field.
+        enum_list (Set[str]): enum list of value.
+
+    Returns:
+        Dict[str, Any]: Column value in set expectation.
+    """
+    return {
+        "expectation_type": "expect_column_values_to_be_in_set",
+        "kwargs": {"column": field_name, "value_set": enum_list},
+        "meta": {},
+    }
+
+
+def get_deprecated_quality_checks(quality: DeprecatedQuality) -> Dict[str, Any]:
     """Retrieves quality checks defined in a data contract.
 
     Args:
@@ -288,6 +315,26 @@ def get_quality_checks(quality: Quality) -> Dict[str, Any]:
     return quality_specification
 
 
+def get_quality_checks(qualities: List[Quality], field_name: str | None = None) -> List[Dict[str, Any]]:
+    """Retrieves quality checks defined in a data contract.
+
+    Args:
+        qualities (List[Quality]): List of quality object from the model specification.
+        field_name (str | None): field name if the quality list is attached to a specific field
+
+    Returns:
+        Dict[str, Any]: Dictionary of quality checks.
+    """
+    quality_specification = []
+    for quality in qualities:
+        if quality is not None and quality.engine is not None and quality.engine.lower() == "great-expectations":
+            ge_expectation = quality.implementation
+            if field_name is not None:
+                ge_expectation["column"] = field_name
+            quality_specification.append(ge_expectation)
+    return quality_specification
+
+
 def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
     """Converts quality checks to a list of expectations.
 
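To illustrate the new field-level quality support, a small hedged sketch; the Quality attributes used here (engine, implementation) are the ones the converter reads above, but the exact constructor call is an assumption.

from datacontract.export.great_expectations_converter import get_quality_checks
from datacontract.model.data_contract_specification import Quality

# Hypothetical field-level quality entry using the great-expectations engine.
quality = Quality(
    engine="great-expectations",
    implementation={
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {"mostly": 0.95},
    },
)
# Per the converter above, the implementation dict is returned with the field
# name attached as a top-level "column" entry.
print(get_quality_checks([quality], field_name="order_id"))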