datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +12 -5
- datacontract/catalog/catalog.py +5 -3
- datacontract/cli.py +119 -13
- datacontract/data_contract.py +145 -67
- datacontract/engines/data_contract_checks.py +366 -60
- datacontract/engines/data_contract_test.py +50 -4
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +27 -3
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +65 -6
- datacontract/engines/soda/connections/kafka.py +4 -2
- datacontract/engines/soda/connections/oracle.py +50 -0
- datacontract/export/avro_converter.py +20 -3
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/dbt_converter.py +36 -7
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +3 -0
- datacontract/export/exporter_factory.py +17 -1
- datacontract/export/great_expectations_converter.py +55 -5
- datacontract/export/{html_export.py → html_exporter.py} +31 -20
- datacontract/export/markdown_converter.py +134 -5
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +193 -149
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/sodacl_converter.py +9 -1
- datacontract/export/spark_converter.py +31 -4
- datacontract/export/sql_converter.py +6 -2
- datacontract/export/sql_type_converter.py +124 -8
- datacontract/imports/avro_importer.py +63 -12
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +1112 -0
- datacontract/imports/importer.py +16 -3
- datacontract/imports/importer_factory.py +17 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/odcs_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +367 -151
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +117 -13
- datacontract/imports/sql_importer.py +32 -16
- datacontract/imports/unity_importer.py +84 -38
- datacontract/init/init_template.py +1 -1
- datacontract/integration/entropy_data.py +126 -0
- datacontract/lint/resolve.py +112 -23
- datacontract/lint/schema.py +24 -15
- datacontract/lint/urls.py +17 -3
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/odcs.py +13 -0
- datacontract/model/run.py +3 -0
- datacontract/output/junit_test_results.py +3 -3
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
- datacontract/templates/datacontract.html +54 -3
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +5 -2
- datacontract/templates/partials/server.html +2 -0
- datacontract/templates/style/output.css +319 -145
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
- datacontract_cli-0.10.40.dist-info/RECORD +121 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/integration/datamesh_manager.py +0 -72
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/description_linter.py +0 -35
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -48
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- datacontract/model/data_contract_specification.py +0 -327
- datacontract_cli-0.10.23.dist-info/RECORD +0 -113
- /datacontract/{lint/linters → output}/__init__.py +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
datacontract/engines/data_contract_test.py

@@ -1,5 +1,11 @@
+import atexit
+import os
+import tempfile
 import typing

+import requests
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.data_contract_checks import create_checks

 if typing.TYPE_CHECKING:
@@ -10,7 +16,7 @@ from datacontract.engines.datacontract.check_that_datacontract_contains_valid_se
 )
 from datacontract.engines.fastjsonschema.check_jsonschema import check_jsonschema
 from datacontract.engines.soda.check_soda_execute import check_soda_execute
-from datacontract.model.data_contract_specification import DataContractSpecification
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import ResultEnum, Run

@@ -20,6 +26,7 @@ def execute_data_contract_test(
     run: Run,
     server_name: str = None,
     spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
     if data_contract_specification.models is None or len(data_contract_specification.models) == 0:
         raise DataContractException(
@@ -29,6 +36,12 @@ def execute_data_contract_test(
             reason="Models block is missing. Skip executing tests.",
             engine="datacontract",
         )
+    if (
+        server_name is None
+        and data_contract_specification.servers is not None
+        and len(data_contract_specification.servers) > 0
+    ):
+        server_name = list(data_contract_specification.servers.keys())[0]
     server = get_server(data_contract_specification, server_name)
     run.log_info(f"Running tests for data contract {data_contract_specification.id} with server {server_name}")
     run.dataContractId = data_contract_specification.id
@@ -37,16 +50,19 @@ def execute_data_contract_test(
     run.outputPortId = server.outputPortId
     run.server = server_name

+    if server.type == "api":
+        server = process_api_response(run, server)
+
     run.checks.extend(create_checks(data_contract_specification, server))

     # TODO check server is supported type for nicer error messages
     # TODO check server credentials are complete for nicer error messages
     if server.format == "json" and server.type != "kafka":
         check_jsonschema(run, data_contract_specification, server)
-    check_soda_execute(run, data_contract_specification, server, spark)
+    check_soda_execute(run, data_contract_specification, server, spark, duckdb_connection)


-def get_server(data_contract_specification: DataContractSpecification, server_name: str = None):
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
     """Get the server configuration from the data contract specification.

     Args:
@@ -59,9 +75,39 @@ def get_server(data_contract_specification: DataContractSpecification, server_na

     check_that_datacontract_contains_valid_server_configuration(data_contract_specification, server_name)

-    if server_name:
+    if server_name is not None:
         server = data_contract_specification.servers.get(server_name)
     else:
         server_name = list(data_contract_specification.servers.keys())[0]
         server = data_contract_specification.servers.get(server_name)
     return server
+
+
+def process_api_response(run, server):
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract_cli_api_")
+    atexit.register(tmp_dir.cleanup)
+    headers = {}
+    if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
+        headers["Authorization"] = os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION")
+    try:
+        response = requests.get(server.location, headers=headers)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        raise DataContractException(
+            type="connection",
+            name="API server connection error",
+            result=ResultEnum.error,
+            reason=f"Failed to fetch API response from {server.location}: {e}",
+            engine="datacontract",
+        )
+    with open(f"{tmp_dir.name}/api_response.json", "w") as f:
+        f.write(response.text)
+    run.log_info(f"Saved API response to {tmp_dir.name}/api_response.json")
+    server = Server(
+        type="local",
+        format="json",
+        path=f"{tmp_dir.name}/api_response.json",
+        dataProductId=server.dataProductId,
+        outputPortId=server.outputPortId,
+    )
+    return server
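The `api` server type added above boils down to a fetch-and-rewrite step. A minimal standalone sketch of that flow (not the package's own code; the URL is a placeholder and the temp-file handling is simplified):

```python
import os
import tempfile

import requests

# Sketch of the "api" server flow: fetch the endpoint, optionally sending an
# Authorization header taken from DATACONTRACT_API_HEADER_AUTHORIZATION, and
# persist the payload so the rest of the run can treat it as a local JSON server.
url = "https://example.com/orders.json"  # placeholder endpoint
headers = {}
if os.getenv("DATACONTRACT_API_HEADER_AUTHORIZATION") is not None:
    headers["Authorization"] = os.environ["DATACONTRACT_API_HEADER_AUTHORIZATION"]

response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

tmp_dir = tempfile.mkdtemp(prefix="datacontract_cli_api_")
local_path = os.path.join(tmp_dir, "api_response.json")
with open(local_path, "w") as f:
    f.write(response.text)

# The engine then swaps the server entry for one equivalent to
# Server(type="local", format="json", path=local_path) and continues as usual.
print(f"API response written to {local_path}")
```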
datacontract/engines/fastjsonschema/check_jsonschema.py

@@ -1,8 +1,9 @@
+import glob
 import json
 import logging
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Callable, Generator, List, Optional

 import fastjsonschema
 from fastjsonschema import JsonSchemaValueException
@@ -85,7 +86,7 @@ def process_exceptions(run, exceptions: List[DataContractException]):


 def validate_json_stream(
-    schema: dict, model_name: str, validate:
+    schema: dict, model_name: str, validate: Callable, json_stream: Generator[Any, Any, None]
 ) -> List[DataContractException]:
     logging.info(f"Validating JSON stream for model: '{model_name}'.")
     exceptions: List[DataContractException] = []
@@ -99,7 +100,7 @@ def validate_json_stream(
             DataContractException(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.failed,
                 reason=f"{f'#{primary_key_value}: ' if primary_key_value is not None else ''}{e.message}",
                 model=model_name,
                 engine="jsonschema",
@@ -159,27 +160,44 @@ def process_json_file(run, schema, model_name, validate, file, delimiter):

 def process_local_file(run, server, schema, model_name, validate):
     path = server.path
+    if not path:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason="For server with type 'local', a 'path' must be defined.",
+            engine="datacontract",
+        )
     if "{model}" in path:
         path = path.format(model=model_name)

+    all_files = []
     if os.path.isdir(path):
-
+        # Fetch all JSONs in the directory
+        for root, _, files in os.walk(path):
+            for file in files:
+                if file.endswith(".json"):
+                    all_files.append(os.path.join(root, file))
     else:
-
-
-
+        # Use glob to fetch all JSONs
+        for file_path in glob.glob(path, recursive=True):
+            if os.path.isfile(file_path):
+                if file_path.endswith(".json"):
+                    all_files.append(file_path)

+    if not all_files:
+        raise DataContractException(
+            type="schema",
+            name="Check that JSON has valid schema",
+            result=ResultEnum.warning,
+            reason=f"No files found in '{path}'.",
+            engine="datacontract",
+        )

-
-
-
-
-        file_path = os.path.join(path, filename)
-        with open(file_path, "r") as file:
-            if not process_json_file(run, model_name, validate, file, server.delimiter):
-                success = False
-                break
-    return success
+    for file in all_files:
+        logging.info(f"Processing file: {file}")
+        with open(file, "r") as f:
+            process_json_file(run, schema, model_name, validate, f, server.delimiter)


 def process_s3_file(run, server, schema, model_name, validate):
@@ -201,7 +219,7 @@ def process_s3_file(run, server, schema, model_name, validate):
             raise DataContractException(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Cannot find any file in {s3_location}",
                 engine="datacontract",
             )
@@ -222,7 +240,7 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
             Check(
                 type="schema",
                 name="Check that JSON has valid schema",
-                result=
+                result=ResultEnum.warning,
                 reason="Server format is not 'json'. Skip validating jsonschema.",
                 engine="jsonschema",
             )
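The reworked `process_local_file` follows two selection rules: a directory is walked recursively for `*.json` files, and any other path is treated as a (possibly recursive) glob pattern. A standalone sketch of just that collection step, with an assumed helper name and placeholder path:

```python
import glob
import os

# Sketch of the new file-collection rules (helper name and path are illustrative):
# - a directory is walked recursively and every *.json file is picked up
# - anything else is treated as a glob pattern
def collect_json_files(path: str) -> list[str]:
    all_files: list[str] = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            all_files += [os.path.join(root, f) for f in files if f.endswith(".json")]
    else:
        all_files += [p for p in glob.glob(path, recursive=True) if os.path.isfile(p) and p.endswith(".json")]
    return all_files

print(collect_json_files("data/**/*.json"))  # prints [] if nothing matches
```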
datacontract/engines/fastjsonschema/s3/s3_read_files.py

@@ -2,6 +2,7 @@ import logging
 import os

 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def yield_s3_files(s3_endpoint_url, s3_location):
@@ -19,9 +20,9 @@ def s3_fs(s3_endpoint_url):
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="s3 extra missing",
-            reason="Install the extra
+            reason="Install the extra s3 to use s3",
             engine="datacontract",
             original_exception=e,
         )
datacontract/engines/soda/check_soda_execute.py

@@ -1,6 +1,15 @@
 import logging
+import typing
 import uuid

+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+from datacontract.engines.soda.connections.oracle import initialize_client_and_create_soda_configuration
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from duckdb.duckdb import DuckDBPyConnection
+
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
 from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
@@ -14,7 +23,13 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Check, Log, ResultEnum, Run


-def check_soda_execute(
+def check_soda_execute(
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
+):
     from soda.common.config_helper import ConfigHelper

     ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
@@ -30,7 +45,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
-            con = get_duckdb_connection(data_contract, server, run)
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
@@ -62,7 +77,8 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
             run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
@@ -89,10 +105,18 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         soda_configuration_str = to_sqlserver_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "oracle":
+        soda_configuration_str = initialize_client_and_create_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
     elif server.type == "trino":
         soda_configuration_str = to_trino_soda_configuration(server)
         scan.add_configuration_yaml_str(soda_configuration_str)
         scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)

     else:
         run.checks.append(
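The new `duckdb_connection` parameter threads a caller-provided DuckDB connection through `execute_data_contract_test` to `check_soda_execute` and on to `get_duckdb_connection`. A minimal sketch of preparing such a connection (the table is illustrative; how the caller reaches the entry point is not shown here):

```python
import duckdb

# Sketch only: prepare a reusable in-memory DuckDB connection, then hand it down
# the call chain shown in the hunks above instead of letting the engine open its own:
#   execute_data_contract_test(..., duckdb_connection=con)
#     -> check_soda_execute(..., duckdb_connection=con)
#       -> get_duckdb_connection(data_contract, server, run, con)
con = duckdb.connect(database=":memory:")
con.sql("CREATE TABLE orders AS SELECT 1 AS order_id, 'shipped' AS status")
print(con.sql("SELECT * FROM orders").fetchall())
```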
datacontract/engines/soda/connections/athena.py (new file)

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
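For reference, this is roughly the Soda data-source block the new Athena connection emits, with placeholder values instead of real credentials (a sketch, not package code):

```python
import yaml

# Illustration of the data-source block assembled by to_athena_soda_configuration()
# from the DATACONTRACT_S3_* environment variables and the server entry.
# All values below are placeholders.
data_source = {
    "type": "athena",
    "access_key_id": "<DATACONTRACT_S3_ACCESS_KEY_ID>",
    "secret_access_key": "<DATACONTRACT_S3_SECRET_ACCESS_KEY>",
    "schema": "my_schema",
    "staging_dir": "s3://my-bucket/athena-results/",
    "region_name": "eu-central-1",  # optional, falls back to the server's region
}
print(yaml.dump({"data_source athena": data_source}))
```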
datacontract/engines/soda/connections/duckdb_connection.py

@@ -1,14 +1,24 @@
 import os
-from typing import Any
+from typing import Any, Dict

 import duckdb

-from datacontract.export.
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
 from datacontract.model.run import Run


-def get_duckdb_connection(
-
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
     path: str = ""
     if server.type == "local":
         path = server.path
@@ -33,9 +43,16 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 json_format = "newline_delimited"
             elif server.delimiter == "array":
                 json_format = "array"
-
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
                 """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
         elif server.format == "parquet":
             con.sql(f"""
             CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
@@ -54,6 +71,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
         elif server.format == "delta":
             con.sql("update extensions;")  # Make sure we have the latest delta extension
             con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
     return con


@@ -67,6 +87,45 @@ def to_csv_types(model) -> dict[Any, str | None] | None:
     return columns


+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
+            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+        """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
 def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
@@ -76,10 +135,10 @@ def setup_s3_connection(con, server):
     use_ssl = "true"
     url_style = "vhost"
     if server.endpointUrl is not None:
+        url_style = "path"
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
             use_ssl = "false"
-            url_style = "path"

     if s3_access_key_id is not None:
         if s3_session_token is not None:
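`add_nested_views` materializes one extra view per array or object field, named `<model>__<field>`, by unnesting that column. A standalone DuckDB sketch of the same pattern with illustrative model and field names:

```python
import duckdb

# Illustration of the nested-view pattern used by add_nested_views():
# an array-of-objects column gets a companion view "<model>__<field>"
# created by unnesting the column.
con = duckdb.connect(database=":memory:")
con.sql("""
    CREATE TABLE orders AS
    SELECT 1 AS order_id,
           [{'sku': 'A-1', 'qty': 2}, {'sku': 'B-9', 'qty': 1}] AS line_items
""")
con.sql("""
    CREATE VIEW "orders__line_items" AS
    SELECT unnest(line_items, max_depth := 2) AS line_items FROM "orders"
""")
print(con.sql('DESCRIBE "orders__line_items"').fetchall())
```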
datacontract/engines/soda/connections/kafka.py

@@ -6,6 +6,7 @@ import tempfile
 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum


 def create_spark_session():
@@ -16,7 +17,7 @@ def create_spark_session():
     except ImportError as e:
         raise DataContractException(
             type="schema",
-            result=
+            result=ResultEnum.failed,
             name="pyspark is missing",
             reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
@@ -26,6 +27,7 @@ def create_spark_session():
     tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
     atexit.register(tmp_dir.cleanup)

+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -33,7 +35,7 @@ def create_spark_session():
         .config("spark.ui.enabled", "false")
         .config(
             "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
         )
         .getOrCreate()
     )
datacontract/engines/soda/connections/oracle.py (new file)

@@ -0,0 +1,50 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def initialize_client_and_create_soda_configuration(server: Server) -> str:
+    import oracledb
+    soda_config = to_oracle_soda_configuration(server)
+
+    oracle_client_dir = os.getenv("DATACONTRACT_ORACLE_CLIENT_DIR")
+    if oracle_client_dir is not None:
+        # Soda Core currently does not support thick mode natively, see https://github.com/sodadata/soda-core/issues/2036
+        # but the oracledb client can be configured accordingly before Soda initializes as a work-around
+        oracledb.init_oracle_client(lib_dir=oracle_client_dir)
+
+    return soda_config
+
+
+def to_oracle_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: oracle
+    host: database-1.us-east-1.rds.amazonaws.com
+    port: '1521'
+    username: simple
+    password: simple_pass
+    connectstring: database-1.us-east-1.rds.amazonaws.com:1521/ORCL (database is equal to service name at oracle)
+    schema: SYSTEM
+    """
+
+    service_name = server.service_name or server.database
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "oracle",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""),
+            "connectstring": f"{server.host}:{server.port}/{service_name}",
+            "schema": server.schema_,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
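And the corresponding Soda configuration shape for the new Oracle connection, again with placeholder host and service values; credentials come from the documented environment variables (a sketch, not package code):

```python
import os

import yaml

# Illustration of the configuration produced by to_oracle_soda_configuration().
# Host, port, service name, and schema are placeholders; credentials are read from
# DATACONTRACT_ORACLE_USERNAME / DATACONTRACT_ORACLE_PASSWORD.
host, port, service_name, schema = "database-1.us-east-1.rds.amazonaws.com", 1521, "ORCL", "SYSTEM"
config = {
    "data_source oracle": {
        "type": "oracle",
        "host": host,
        "port": str(port),
        "username": os.getenv("DATACONTRACT_ORACLE_USERNAME", ""),
        "password": os.getenv("DATACONTRACT_ORACLE_PASSWORD", ""),
        "connectstring": f"{host}:{port}/{service_name}",
        "schema": schema,
    }
}
print(yaml.dump(config))
```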
datacontract/export/avro_converter.py

@@ -44,12 +44,18 @@ def to_avro_field(field, field_name):
     avro_type = to_avro_type(field, field_name)
     avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]

-
-
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
             "type": "enum",
             "name": field.title,
             "symbols": field.enum,
         }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]

     if field.config:
         if "avroDefault" in field.config:
@@ -77,6 +83,10 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if "avroType" in field.config:
             return field.config["avroType"]

+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -91,7 +101,9 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.precision is not None:
             typeVal["precision"] = field.precision
         return typeVal
-    elif field.type in ["float"
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -107,6 +119,11 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
         if field.config is not None and "namespace" in field.config:
             return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
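The enum handling above yields either a bare Avro enum (required field) or a `["null", enum]` union (optional field). A small sketch of the expected output shape, with an illustrative enum name and symbols:

```python
import json

# Expected Avro shapes for a data contract field that declares enum values and
# config.avroType = "enum" (enum name and symbols are illustrative).
enum_def = {"type": "enum", "name": "OrderStatus", "symbols": ["placed", "shipped", "delivered"]}

required_field = {"name": "status", "type": enum_def}            # required: bare enum
optional_field = {"name": "status", "type": ["null", enum_def]}  # optional: union with null

print(json.dumps(required_field, indent=2))
print(json.dumps(optional_field, indent=2))
```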
datacontract/export/bigquery_converter.py

@@ -103,7 +103,7 @@ def map_type_to_bigquery(field: Field) -> str:
     elif field_type.lower() == "date":
         return "DATE"
     elif field_type.lower() == "timestamp_ntz":
-        return "
+        return "DATETIME"
     elif field_type.lower() in ["number", "decimal", "numeric"]:
         return "NUMERIC"
     elif field_type.lower() == "double":