datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/engines/soda/check_soda_execute.py

@@ -1,28 +1,39 @@
 import logging
+import typing
+import uuid
 
-from
-
-
-from
-
-from
-
-from datacontract.engines.soda.connections.
-from datacontract.engines.soda.connections.
-
-from datacontract.engines.soda.connections.
-
-from datacontract.engines.soda.connections.snowflake import
-
+from datacontract.engines.soda.connections.athena import to_athena_soda_configuration
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+from duckdb.duckdb import DuckDBPyConnection
+
+from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
+from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
+from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
+from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
+from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
+from datacontract.engines.soda.connections.sqlserver import to_sqlserver_soda_configuration
+from datacontract.engines.soda.connections.trino import to_trino_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.model.data_contract_specification import
-
-from datacontract.model.run import Run, Check, Log
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
+from datacontract.model.run import Check, Log, ResultEnum, Run
 
 
 def check_soda_execute(
-    run: Run,
+    run: Run,
+    data_contract: DataContractSpecification,
+    server: Server,
+    spark: "SparkSession" = None,
+    duckdb_connection: DuckDBPyConnection = None,
 ):
+    from soda.common.config_helper import ConfigHelper
+
+    ConfigHelper.get_instance().upsert_value("send_anonymous_usage_stats", False)
+    from soda.scan import Scan
+
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return

@@ -30,9 +41,10 @@ def check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()
 
-    if server.type
-        if server.format in ["json", "parquet", "csv"]:
-
+    if server.type in ["s3", "gcs", "azure", "local"]:
+        if server.format in ["json", "parquet", "csv", "delta"]:
+            run.log_info(f"Configuring engine soda-core to connect to {server.type} {server.format} with duckdb")
+            con = get_duckdb_connection(data_contract, server, run, duckdb_connection)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:

@@ -40,7 +52,7 @@ def check_soda_execute(
                 Check(
                     type="general",
                     name="Check that format is supported",
-                    result=
+                    result=ResultEnum.warning,
                     reason=f"Format {server.format} not yet supported by datacontract CLI",
                     engine="datacontract",
                 )

@@ -61,27 +73,52 @@ def check_soda_execute(
         scan.set_data_source_name(server.type)
     elif server.type == "databricks":
         if spark is not None:
-
+            run.log_info("Connecting to databricks via spark")
             scan.add_spark_session(spark, data_source_name=server.type)
             scan.set_data_source_name(server.type)
-
+            database_name = ".".join(filter(None, [server.catalog, server.schema_]))
+            spark.sql(f"USE {database_name}")
         else:
+            run.log_info("Connecting to databricks directly")
             soda_configuration_str = to_databricks_soda_configuration(server)
             scan.add_configuration_yaml_str(soda_configuration_str)
             scan.set_data_source_name(server.type)
+    elif server.type == "dataframe":
+        if spark is None:
+            run.log_warn(
+                "Server type dataframe only works with the Python library and requires a Spark session, "
+                "please provide one with the DataContract class"
+            )
+            return
+        else:
+            logging.info("Use Spark to connect to data source")
+            scan.add_spark_session(spark, data_source_name="datacontract-cli")
+            scan.set_data_source_name("datacontract-cli")
     elif server.type == "kafka":
         if spark is None:
-            spark = create_spark_session(
-        read_kafka_topic(spark, data_contract, server
+            spark = create_spark_session()
+        read_kafka_topic(spark, data_contract, server)
         scan.add_spark_session(spark, data_source_name=server.type)
         scan.set_data_source_name(server.type)
+    elif server.type == "sqlserver":
+        soda_configuration_str = to_sqlserver_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "trino":
+        soda_configuration_str = to_trino_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
+    elif server.type == "athena":
+        soda_configuration_str = to_athena_soda_configuration(server)
+        scan.add_configuration_yaml_str(soda_configuration_str)
+        scan.set_data_source_name(server.type)
 
     else:
         run.checks.append(
             Check(
                 type="general",
                 name="Check that server type is supported",
-                result=
+                result=ResultEnum.warning,
                 reason=f"Server type {server.type} not yet supported by datacontract CLI",
                 engine="datacontract-cli",
             )

@@ -89,37 +126,34 @@ def check_soda_execute(
         run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
         return
 
-
-    # Don't check types for avro format, as they are checked with avro schema
-    # Don't check types for csv format, as they are hard to detect
-    server_type = server.type
-    check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
-    sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
+    sodacl_yaml_str = to_sodacl_yaml(run)
     # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
     scan.add_sodacl_yaml_str(sodacl_yaml_str)
 
     # Execute the scan
-    logging.info("Starting soda scan")
+    logging.info("Starting soda scan with checks:\n" + sodacl_yaml_str)
     scan.execute()
     logging.info("Finished soda scan")
 
     # pprint.PrettyPrinter(indent=2).pprint(scan.build_scan_results())
 
     scan_results = scan.get_scan_results()
-    for
-
-
-
-
-
-
-
-
-
-
-
-
+    for scan_result in scan_results.get("checks"):
+        name = scan_result.get("name")
+        check = get_check(run, scan_result)
+        if check is None:
+            check = Check(
+                id=str(uuid.uuid4()),
+                category="custom",
+                type="custom",
+                name=name,
+                engine="soda-core",
+            )
+            run.checks.append(check)
+        check.result = to_result(scan_result)
+        check.reason = ", ".join(scan_result.get("outcomeReasons"))
+        check.diagnostics = scan_result.get("diagnostics")
+        update_reason(check, scan_result)
 
     for log in scan_results.get("logs"):
         run.logs.append(

@@ -135,8 +169,8 @@ def check_soda_execute(
         run.checks.append(
             Check(
                 type="general",
-                name="
-                result=
+                name="Data Contract Tests",
+                result=ResultEnum.warning,
                 reason="Engine soda-core has errors. See the logs for details.",
                 engine="soda-core",
             )

@@ -144,14 +178,22 @@ def check_soda_execute(
         return
 
 
-def
+def get_check(run, scan_result) -> Check | None:
+    check_by_name = next((c for c in run.checks if c.key == scan_result.get("name")), None)
+    if check_by_name is not None:
+        return check_by_name
+
+    return None
+
+
+def to_result(c) -> ResultEnum:
     soda_outcome = c.get("outcome")
     if soda_outcome == "pass":
-        return
+        return ResultEnum.passed
     elif soda_outcome == "fail":
-        return
+        return ResultEnum.failed
     else:
-        return
+        return ResultEnum.unknown
 
 
 def update_reason(check, c):

@@ -164,9 +206,11 @@ def update_reason(check, c):
             if block["title"] == "Diagnostics":
                 # Extract and print the 'text' value
                 diagnostics_text = block["text"]
-                print(diagnostics_text)
+                # print(diagnostics_text)
                 diagnostics_text_split = diagnostics_text.split(":icon-fail: ")
                 if len(diagnostics_text_split) > 1:
                     check.reason = diagnostics_text_split[1].strip()
-                    print(check.reason)
+                    # print(check.reason)
                 break  # Exit the loop once the desired block is found
+    if "fail" in c["diagnostics"]:
+        check.reason = f"Value: {c['diagnostics']['value']} Fail: {c['diagnostics']['fail']}"
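The reworked check_soda_execute signature above takes an optional Spark session and an optional pre-built DuckDB connection, which is what the new "dataframe" branch relies on. A minimal sketch of driving this through the package's Python API; the DataContract constructor arguments shown here are assumptions based on the diff and the package README rather than verified against this exact version:

from pyspark.sql import SparkSession

from datacontract.data_contract import DataContract

# Hypothetical contract and input data; the temp view name must match a model in the contract.
spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "alice")], ["id", "name"]).createOrReplaceTempView("users")

data_contract = DataContract(data_contract_file="datacontract.yaml", spark=spark)
run = data_contract.test()  # eventually reaches check_soda_execute with the provided spark session
print(run.result)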
datacontract/engines/soda/connections/athena.py (new file)

@@ -0,0 +1,79 @@
+import os
+
+import yaml
+
+from datacontract.model.exceptions import DataContractException
+
+
+def to_athena_soda_configuration(server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+
+    # Validate required parameters
+    if not s3_access_key_id:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_access_key_id",
+            reason="AWS access key ID is required. Set the DATACONTRACT_S3_ACCESS_KEY_ID environment variable.",
+            engine="datacontract",
+        )
+
+    if not s3_secret_access_key:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_secret_access_key",
+            reason="AWS secret access key is required. Set the DATACONTRACT_S3_SECRET_ACCESS_KEY environment variable.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "schema_") or not server.schema_:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_schema",
+            reason="Schema is required for Athena connection. Specify the schema where your tables exist in the server configuration.",
+            engine="datacontract",
+        )
+
+    if not hasattr(server, "stagingDir") or not server.stagingDir:
+        raise DataContractException(
+            type="athena-connection",
+            name="missing_s3_staging_dir",
+            reason="S3 staging directory is required for Athena connection. This should be the Amazon S3 Query Result Location (e.g., 's3://my-bucket/athena-results/').",
+            engine="datacontract",
+        )
+
+    # Validate S3 staging directory format
+    if not server.stagingDir.startswith("s3://"):
+        raise DataContractException(
+            type="athena-connection",
+            name="invalid_s3_staging_dir",
+            reason=f"S3 staging directory must start with 's3://'. Got: {server.s3_staging_dir}. Example: 's3://my-bucket/athena-results/'",
+            engine="datacontract",
+        )
+
+    data_source = {
+        "type": "athena",
+        "access_key_id": s3_access_key_id,
+        "secret_access_key": s3_secret_access_key,
+        "schema": server.schema_,
+        "staging_dir": server.stagingDir,
+    }
+
+    if s3_region:
+        data_source["region_name"] = s3_region
+    elif server.region_name:
+        data_source["region_name"] = server.region_name
+
+    if server.catalog:
+        # Optional, Identify the name of the Data Source, also referred to as a Catalog. The default value is `awsdatacatalog`.
+        data_source["catalog"] = server.catalog
+
+    if s3_session_token:
+        data_source["aws_session_token"] = s3_session_token
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
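The new Athena connection reads AWS credentials from DATACONTRACT_S3_* environment variables and turns the server block into a soda-core data_source configuration. A minimal sketch of calling it in isolation; the SimpleNamespace stands in for the real Server object and the credential values are placeholders:

import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.athena import to_athena_soda_configuration

os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "placeholder-key-id"
os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "placeholder-secret"

server = SimpleNamespace(
    type="athena",
    schema_="my_schema",                          # schema that holds the tables
    stagingDir="s3://my-bucket/athena-results/",  # must start with s3://
    catalog=None,
    region_name=None,
)
print(to_athena_soda_configuration(server))  # YAML for soda-core's "data_source athena"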
datacontract/engines/soda/connections/bigquery.py

@@ -6,10 +6,17 @@ import yaml
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
     # with service account key, using an external json file
+
+    # check for our own environment variable first
+    account_info = os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH")
+    if account_info is None:
+        # but as a fallback look for the default google one
+        account_info = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "bigquery",
-            "account_info_json_path":
+            "account_info_json_path": account_info,
             "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
             "project_id": server.project,
             "dataset": server.dataset,
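The BigQuery configuration now resolves the service-account key path from DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH and falls back to GOOGLE_APPLICATION_CREDENTIALS. A minimal sketch, assuming one of the two variables points at a key file; the SimpleNamespace mimics the Server fields the function reads:

from types import SimpleNamespace

from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration

server = SimpleNamespace(type="bigquery", project="my-gcp-project", dataset="my_dataset")
print(to_bigquery_soda_configuration(server))  # account_info_json_path is filled from the environment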
datacontract/engines/soda/connections/databricks.py

@@ -4,15 +4,24 @@ import yaml
 
 
 def to_databricks_soda_configuration(server):
+    token = os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+    if token is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_TOKEN environment variable is not set")
+    http_path = os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH")
+    host = server.host
+    if host is None:
+        host = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME")
+    if host is None:
+        raise ValueError("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set")
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "spark",
             "method": "databricks",
-            "host":
+            "host": host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path":
-            "token":
+            "http_path": http_path,
+            "token": token,
         }
     }
 
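to_databricks_soda_configuration now fails fast when the token or hostname is missing instead of emitting an incomplete configuration. A minimal sketch with placeholder values for the DATACONTRACT_DATABRICKS_* variables and a stand-in server object:

import os
from types import SimpleNamespace

from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration

os.environ.setdefault("DATACONTRACT_DATABRICKS_TOKEN", "placeholder-token")
os.environ.setdefault("DATACONTRACT_DATABRICKS_HTTP_PATH", "/sql/1.0/warehouses/placeholder")
os.environ.setdefault("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME", "adb-placeholder.azuredatabricks.net")

server = SimpleNamespace(type="databricks", host=None, catalog="main", schema_="default")
print(to_databricks_soda_configuration(server))  # raises ValueError if the token or host cannot be resolved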
datacontract/engines/soda/connections/duckdb_connection.py (new file)

@@ -0,0 +1,241 @@
+import os
+from typing import Any, Dict
+
+import duckdb
+
+from datacontract.export.duckdb_type_converter import convert_to_duckdb_csv_type, convert_to_duckdb_json_type
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
+from datacontract.model.run import Run
+
+
+def get_duckdb_connection(
+    data_contract: DataContractSpecification,
+    server: Server,
+    run: Run,
+    duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+) -> duckdb.DuckDBPyConnection:
+    if duckdb_connection is None:
+        con = duckdb.connect(database=":memory:")
+    else:
+        con = duckdb_connection
+
+    path: str = ""
+    if server.type == "local":
+        path = server.path
+    if server.type == "s3":
+        path = server.location
+        setup_s3_connection(con, server)
+    if server.type == "gcs":
+        path = server.location
+        setup_gcs_connection(con, server)
+    if server.type == "azure":
+        path = server.location
+        setup_azure_connection(con, server)
+    for model_name, model in data_contract.models.items():
+        model_path = path
+        if "{model}" in model_path:
+            model_path = model_path.format(model=model_name)
+        run.log_info(f"Creating table {model_name} for {model_path}")
+
+        if server.format == "json":
+            json_format = "auto"
+            if server.delimiter == "new_line":
+                json_format = "newline_delimited"
+            elif server.delimiter == "array":
+                json_format = "array"
+            columns = to_json_types(model)
+            if columns is None:
+                con.sql(f"""
+                    CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', hive_partitioning=1);
+                    """)
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_json_auto('{model_path}', format='{json_format}', columns={columns}, hive_partitioning=1);"""
+                )
+            add_nested_views(con, model_name, model.fields)
+        elif server.format == "parquet":
+            con.sql(f"""
+                CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
+                """)
+        elif server.format == "csv":
+            columns = to_csv_types(model)
+            run.log_info("Using columns: " + str(columns))
+            if columns is None:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
+                )
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
+                )
+        elif server.format == "delta":
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
+        table_info = con.sql(f"PRAGMA table_info('{model_name}');").fetchdf()
+        if table_info is not None and not table_info.empty:
+            run.log_info(f"DuckDB Table Info: {table_info.to_string(index=False)}")
+    return con
+
+
+def to_csv_types(model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_csv_type(field)
+    return columns
+
+
+def to_json_types(model: Model) -> dict[Any, str | None] | None:
+    if model is None:
+        return None
+    columns = {}
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_json_type(field)
+    return columns
+
+
+def add_nested_views(con: duckdb.DuckDBPyConnection, model_name: str, fields: Dict[str, Field] | None):
+    model_name = model_name.strip('"')
+    if fields is None:
+        return
+    for field_name, field in fields.items():
+        if field.type is None or field.type.lower() not in ["array", "object"]:
+            continue
+        field_type = field.type.lower()
+        if field_type == "array" and field.items is None:
+            continue
+        elif field_type == "object" and field.fields is None:
+            continue
+
+        nested_model_name = f"{model_name}__{field_name}"
+        max_depth = 2 if field_type == "array" else 1
+
+        ## if parent field is not required, the nested objects may respolve
+        ## to a row of NULLs -- but if the objects themselves have required
+        ## fields, this will fail the check.
+        where = "" if field.required else f" WHERE {field_name} IS NOT NULL"
+        con.sql(f"""
+            CREATE VIEW IF NOT EXISTS "{nested_model_name}" AS
+            SELECT unnest({field_name}, max_depth := {max_depth}) as {field_name} FROM "{model_name}" {where}
+            """)
+        if field_type == "array":
+            add_nested_views(con, nested_model_name, field.items.fields)
+        elif field_type == "object":
+            add_nested_views(con, nested_model_name, field.fields)
+
+
+def setup_s3_connection(con, server):
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    s3_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
+    s3_endpoint = "s3.amazonaws.com"
+    use_ssl = "true"
+    url_style = "vhost"
+    if server.endpointUrl is not None:
+        url_style = "path"
+        s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
+        if server.endpointUrl.startswith("http://"):
+            use_ssl = "false"
+
+    if s3_access_key_id is not None:
+        if s3_session_token is not None:
+            con.sql(f"""
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                SESSION_TOKEN '{s3_session_token}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+            """)
+        else:
+            con.sql(f"""
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+            """)
+
+        # con.sql(f"""
+        # SET s3_region = '{s3_region}';
+        # SET s3_access_key_id = '{s3_access_key_id}';
+        # SET s3_secret_access_key = '{s3_secret_access_key}';
+        # """)
+    # else:
+    #     con.sql("""
+    #     RESET s3_region;
+    #     RESET s3_access_key_id;
+    #     RESET s3_secret_access_key;
+    #     """)
+    # con.sql("RESET s3_session_token")
+    # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))
+
+
+def setup_gcs_connection(con, server):
+    key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
+    secret = os.getenv("DATACONTRACT_GCS_SECRET")
+
+    if key_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
+    if secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")
+
+    con.sql(f"""
+    CREATE SECRET gcs_secret (
+        TYPE GCS,
+        KEY_ID '{key_id}',
+        SECRET '{secret}'
+    );
+    """)
+
+
+def setup_azure_connection(con, server):
+    tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
+    client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
+    client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+    storage_account = server.storageAccount
+
+    if tenant_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")
+    if client_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_ID is not set")
+    if client_secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_SECRET is not set")
+
+    con.install_extension("azure")
+    con.load_extension("azure")
+
+    if storage_account is not None:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}',
+            ACCOUNT_NAME '{storage_account}'
+        );
+        """)
+    else:
+        con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+        """)
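get_duckdb_connection registers one DuckDB view per model, resolving a {model} placeholder in the server path and wiring S3, GCS, or Azure secrets when needed. A minimal sketch for a local CSV server; the model, field, and path names are hypothetical, and Run.create_run() is assumed to be the factory used elsewhere in the package:

from datacontract.engines.soda.connections.duckdb_connection import get_duckdb_connection
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Model,
    Server,
)
from datacontract.model.run import Run

spec = DataContractSpecification(
    models={"orders": Model(fields={"order_id": Field(type="string"), "amount": Field(type="decimal")})}
)
server = Server(type="local", path="./data/orders.csv", format="csv")  # hypothetical local file
run = Run.create_run()  # assumption: factory for a Run that supports log_info

con = get_duckdb_connection(spec, server, run)
print(con.sql('SELECT count(*) FROM "orders"').fetchall())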