datacontract-cli 0.10.10__py3-none-any.whl → 0.10.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/cli.py +19 -3
- datacontract/data_contract.py +17 -17
- datacontract/engines/fastjsonschema/check_jsonschema.py +15 -1
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
- datacontract/engines/soda/check_soda_execute.py +2 -8
- datacontract/engines/soda/connections/duckdb.py +23 -20
- datacontract/engines/soda/connections/kafka.py +81 -23
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/export/avro_converter.py +12 -2
- datacontract/export/dbml_converter.py +42 -19
- datacontract/export/exporter.py +2 -1
- datacontract/export/exporter_factory.py +6 -0
- datacontract/export/jsonschema_converter.py +1 -4
- datacontract/export/spark_converter.py +4 -0
- datacontract/export/sql_type_converter.py +64 -29
- datacontract/export/sqlalchemy_converter.py +169 -0
- datacontract/imports/avro_importer.py +1 -0
- datacontract/imports/bigquery_importer.py +2 -2
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +67 -91
- datacontract/imports/glue_importer.py +64 -54
- datacontract/imports/importer.py +3 -2
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/jsonschema_importer.py +106 -120
- datacontract/imports/odcs_importer.py +1 -1
- datacontract/imports/spark_importer.py +29 -10
- datacontract/imports/sql_importer.py +5 -1
- datacontract/imports/unity_importer.py +1 -1
- datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
- datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
- datacontract/model/data_contract_specification.py +6 -2
- datacontract/templates/partials/model_field.html +10 -2
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/METADATA +283 -113
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/RECORD +38 -37
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/WHEEL +1 -1
- datacontract/publish/publish.py +0 -32
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.10.dist-info → datacontract_cli-0.10.12.dist-info}/top_level.txt +0 -0
datacontract/cli.py
CHANGED
@@ -17,7 +17,7 @@ from datacontract.catalog.catalog import create_index_html, create_data_contract
 from datacontract.data_contract import DataContract, ExportFormat
 from datacontract.imports.importer import ImportFormat
 from datacontract.init.download_datacontract_file import download_datacontract_file, FileExistsException
-from datacontract.
+from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

 DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"

@@ -232,6 +232,18 @@ def import_(
             help="List of models names to import from the dbt manifest file (repeat for multiple models names, leave empty for all models in the dataset)."
         ),
     ] = None,
+    dbml_schema: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of schema names to import from the DBML file (repeat for multiple schema names, leave empty for all tables in the file)."
+        ),
+    ] = None,
+    dbml_table: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of table names to import from the DBML file (repeat for multiple table names, leave empty for all tables in the file)."
+        ),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -245,6 +257,8 @@ def import_(
         bigquery_dataset=bigquery_dataset,
         unity_table_full_name=unity_table_full_name,
         dbt_model=dbt_model,
+        dbml_schema=dbml_schema,
+        dbml_table=dbml_table,
     )
     console.print(result.to_yaml())

@@ -261,8 +275,10 @@ def publish(
     """
     Publish the data contract to the Data Mesh Manager.
     """
-
-
+    publish_data_contract_to_datamesh_manager(
+        data_contract_specification=DataContract(
+            data_contract_file=location, schema_location=schema
+        ).get_data_contract_specification(),
     )

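The rewritten publish command now builds the specification itself and hands it to the integration module. A minimal sketch of the same call chain from Python (the file path is a placeholder, and Data Mesh Manager credentials must be configured separately):

from datacontract.data_contract import DataContract
from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

# Resolve the contract, then push it to Data Mesh Manager, as the CLI command now does.
spec = DataContract(data_contract_file="datacontract.yaml").get_data_contract_specification()
publish_data_contract_to_datamesh_manager(data_contract_specification=spec)
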
datacontract/data_contract.py
CHANGED
@@ -18,8 +18,8 @@ from datacontract.export.exporter import ExportFormat
 from datacontract.export.exporter_factory import exporter_factory
 from datacontract.imports.importer_factory import importer_factory

-from datacontract.integration.
-from datacontract.integration.
+from datacontract.integration.datamesh_manager import publish_test_results_to_datamesh_manager
+from datacontract.integration.opentelemetry import publish_test_results_to_opentelemetry
 from datacontract.lint import resolve
 from datacontract.lint.linters.description_linter import DescriptionLinter
 from datacontract.lint.linters.example_model_linter import ExampleModelLinter
@@ -46,8 +46,8 @@ class DataContract:
         publish_url: str = None,
         publish_to_opentelemetry: bool = False,
         spark: "SparkSession" = None,
-        inline_definitions: bool =
-        inline_quality: bool =
+        inline_definitions: bool = True,
+        inline_quality: bool = True,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str
@@ -87,8 +87,8 @@ class DataContract:
             self._data_contract_str,
             self._data_contract,
             self._schema_location,
-            inline_definitions=
-            inline_quality=
+            inline_definitions=self._inline_definitions,
+            inline_quality=self._inline_quality,
         )
         run.checks.append(
             Check(type="lint", result="passed", name="Data contract is syntactically valid", engine="datacontract")
@@ -140,7 +140,12 @@ class DataContract:
         try:
             run.log_info("Testing data contract")
             data_contract = resolve.resolve_data_contract(
-                self._data_contract_file,
+                self._data_contract_file,
+                self._data_contract_str,
+                self._data_contract,
+                self._schema_location,
+                inline_definitions=self._inline_definitions,
+                inline_quality=self._inline_quality,
             )

             if data_contract.models is None or len(data_contract.models) == 0:
@@ -213,15 +218,10 @@ class DataContract:
         run.finish()

         if self._publish_url is not None:
-
-
-            except Exception:
-                run.log_error("Failed to publish to datamesh manager")
+            publish_test_results_to_datamesh_manager(run, self._publish_url)
+
         if self._publish_to_opentelemetry:
-
-            publish_opentelemetry(run)
-            except Exception:
-                run.log_error("Failed to publish to opentelemetry")
+            publish_test_results_to_opentelemetry(run)

         return run

@@ -304,8 +304,8 @@ class DataContract:
             self._data_contract_str,
             self._data_contract,
             schema_location=self._schema_location,
-            inline_definitions=
-            inline_quality=
+            inline_definitions=self._inline_definitions,
+            inline_quality=self._inline_quality,
         )

         return exporter_factory.create(export_format).export(
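Note for callers of the Python API: inline_definitions and inline_quality now default to True and are forwarded consistently to lint, test, and export. A small sketch under that assumption (the file path is a placeholder; test() is the existing public entry point):

from datacontract.data_contract import DataContract

data_contract = DataContract(data_contract_file="datacontract.yaml")  # placeholder path
run = data_contract.test()  # definitions and quality blocks are now inlined by default
for check in run.checks:
    print(check.result, check.name)
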
datacontract/engines/fastjsonschema/check_jsonschema.py
CHANGED
@@ -148,13 +148,27 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
         schema = to_jsonschema(model_name, model)
         run.log_info(f"jsonschema: {schema}")

-        validate = fastjsonschema.compile(
+        validate = fastjsonschema.compile(
+            schema,
+            formats={"uuid": r"^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"},
+        )

         # Process files based on server type
         if server.type == "local":
             process_local_file(run, server, model_name, validate)
         elif server.type == "s3":
             process_s3_file(server, model_name, validate)
+        elif server.type == "gcs":
+            run.checks.append(
+                Check(
+                    type="schema",
+                    name="Check that JSON has valid schema",
+                    model=model_name,
+                    result="info",
+                    reason="JSON Schema check skipped for GCS, as GCS is currently not supported",
+                    engine="jsonschema",
+                )
+            )
         else:
             run.checks.append(
                 Check(
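The validator now registers a custom regex for the uuid format. A standalone sketch of how fastjsonschema's formats argument behaves with that pattern (the schema here is illustrative):

import fastjsonschema

validate = fastjsonschema.compile(
    {"type": "object", "properties": {"id": {"type": "string", "format": "uuid"}}},
    formats={"uuid": r"^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"},
)
validate({"id": "123e4567-e89b-12d3-a456-426614174000"})  # passes
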
datacontract/engines/fastjsonschema/s3/s3_read_files.py
CHANGED
@@ -28,9 +28,11 @@ def s3_fs(s3_endpoint_url):

     aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    aws_session_token = os.getenv("DATACONTRACT_S3_SESSION_TOKEN")
     return s3fs.S3FileSystem(
         key=aws_access_key_id,
         secret=aws_secret_access_key,
+        token=aws_session_token,
         anon=aws_access_key_id is None,
         client_kwargs={"endpoint_url": s3_endpoint_url},
     )
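With the session token forwarded, temporary STS credentials can now be used for the S3 JSON schema checks. A sketch of the environment a caller would set before running the test command (values are placeholders):

import os

os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "<access-key-id>"
os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "<secret-access-key>"
os.environ["DATACONTRACT_S3_SESSION_TOKEN"] = "<session-token>"  # new: forwarded to s3fs as token=
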
datacontract/engines/soda/check_soda_execute.py
CHANGED
@@ -1,8 +1,4 @@
 import logging
-import typing
-
-if typing.TYPE_CHECKING:
-    from pyspark.sql import SparkSession

 from soda.scan import Scan

@@ -19,9 +15,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 from datacontract.model.run import Run, Check, Log


-def check_soda_execute(
-    run: Run, data_contract: DataContractSpecification, server: Server, spark: "SparkSession", tmp_dir
-):
+def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -29,7 +23,7 @@ def check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()

-    if server.type in ["s3", "azure", "local"]:
+    if server.type in ["s3", "gcs", "azure", "local"]:
         if server.format in ["json", "parquet", "csv", "delta"]:
             con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
datacontract/engines/soda/connections/duckdb.py
CHANGED
@@ -1,7 +1,5 @@
 import os

-from deltalake import DeltaTable
-
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
 from datacontract.model.run import Run
@@ -15,6 +13,9 @@ def get_duckdb_connection(data_contract, server, run: Run):
     if server.type == "s3":
         path = server.location
         setup_s3_connection(con, server)
+    if server.type == "gcs":
+        path = server.location
+        setup_gcs_connection(con, server)
     if server.type == "azure":
         path = server.location
         setup_azure_connection(con, server)
@@ -49,24 +50,8 @@ def get_duckdb_connection(data_contract, server, run: Run):
                 f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
             )
         elif server.format == "delta":
-
-
-            # in https://github.com/datacontract/datacontract-cli/issues/258,
-            # azure storage should also work
-            # https://github.com/duckdb/duckdb_delta/issues/21
-            raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
-
-            storage_options = {
-                "AWS_ENDPOINT_URL": server.endpointUrl,
-                "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
-                "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
-                "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
-                "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
-            }
-
-            delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
-
-            con.register(model_name, delta_table_arrow)
+            con.sql("update extensions;")  # Make sure we have the latest delta extension
+            con.sql(f"""CREATE VIEW "{model_name}" AS SELECT * FROM delta_scan('{model_path}');""")
     return con


@@ -138,6 +123,24 @@ def setup_s3_connection(con, server):
     # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))


+def setup_gcs_connection(con, server):
+    key_id = os.getenv("DATACONTRACT_GCS_KEY_ID")
+    secret = os.getenv("DATACONTRACT_GCS_SECRET")
+
+    if key_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_KEY_ID is not set")
+    if secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_GCS_SECRET is not set")
+
+    con.sql(f"""
+    CREATE SECRET gcs_secret (
+        TYPE GCS,
+        KEY_ID '{key_id}',
+        SECRET '{secret}'
+    );
+    """)
+
+
 def setup_azure_connection(con, server):
     tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
     client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
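Delta reads now go through DuckDB's delta extension, and GCS access is configured through DuckDB's secrets manager from the two new DATACONTRACT_GCS_* variables. A hedged standalone sketch of the underlying DuckDB calls (the HMAC values and table path are placeholders, and the httpfs/delta extensions are assumed to be auto-loadable):

import duckdb

con = duckdb.connect()
# GCS credentials become a DuckDB secret, as in setup_gcs_connection above.
con.sql("""
    CREATE SECRET gcs_secret (
        TYPE GCS,
        KEY_ID 'my-hmac-key-id',
        SECRET 'my-hmac-secret'
    );
""")
print(con.sql("SELECT name, type FROM duckdb_secrets();"))
# Delta tables are read via delta_scan instead of the deltalake package, e.g.:
# con.sql("CREATE VIEW orders AS SELECT * FROM delta_scan('s3://my-bucket/orders');")  # placeholder path
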
datacontract/engines/soda/connections/kafka.py
CHANGED
@@ -1,33 +1,26 @@
 import logging
 import os
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, expr, from_json
-from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.types import (
-    StructType,
-    StructField,
-    StringType,
-    DecimalType,
-    DoubleType,
-    IntegerType,
-    LongType,
-    BooleanType,
-    TimestampType,
-    TimestampNTZType,
-    DateType,
-    BinaryType,
-    ArrayType,
-    NullType,
-    DataType,
-)

 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import DataContractSpecification, Server, Field
 from datacontract.model.exceptions import DataContractException


-def create_spark_session(tmp_dir: str)
+def create_spark_session(tmp_dir: str):
     """Create and configure a Spark session."""
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     spark = (
         SparkSession.builder.appName("datacontract")
         .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
@@ -43,7 +36,7 @@ def create_spark_session(tmp_dir: str) -> SparkSession:
     return spark


-def read_kafka_topic(spark
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server, tmp_dir):
     """Read and process data from a Kafka topic based on the server configuration."""

     logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
@@ -74,6 +67,19 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati


 def process_avro_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, expr
+        from pyspark.sql.avro.functions import from_avro
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     avro_schema = to_avro_schema_json(model_name, model)
     df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
     options = {"mode": "PERMISSIVE"}
@@ -83,6 +89,18 @@ def process_avro_format(df, model_name, model):


 def process_json_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, from_json
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     struct_type = to_struct_type(model.fields)
     df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
         from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
@@ -108,11 +126,51 @@ def get_auth_options():


 def to_struct_type(fields):
+    try:
+        from pyspark.sql.types import StructType
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     """Convert field definitions to Spark StructType."""
     return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])


-def to_struct_field(field_name: str, field: Field)
+def to_struct_field(field_name: str, field: Field):
+    try:
+        from pyspark.sql.types import (
+            StructType,
+            StructField,
+            StringType,
+            DecimalType,
+            DoubleType,
+            IntegerType,
+            LongType,
+            BooleanType,
+            TimestampType,
+            TimestampNTZType,
+            DateType,
+            BinaryType,
+            ArrayType,
+            NullType,
+            DataType,
+        )
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
     """Map field definitions to Spark StructField using match-case."""
     match field.type:
         case "string" | "varchar" | "text":
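pyspark is now imported lazily inside each function, so importing this module no longer requires the [kafka] extra; a missing dependency surfaces as a DataContractException only when a Kafka server is actually tested. A small sketch of what a caller would see (the temp dir is a placeholder):

from datacontract.model.exceptions import DataContractException
from datacontract.engines.soda.connections.kafka import create_spark_session

try:
    spark = create_spark_session(tmp_dir="/tmp/datacontract")
except DataContractException as exc:
    print(exc)  # raised with reason "Install the extra datacontract-cli[kafka] to use kafka"
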
datacontract/engines/soda/connections/snowflake.py
CHANGED
@@ -4,17 +4,20 @@ import yaml


 def to_snowflake_soda_configuration(server):
+    prefix = "DATACONTRACT_SNOWFLAKE_"
+    snowflake_soda_params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
+
+    # backward compatibility
+    if "connection_timeout" not in snowflake_soda_params:
+        snowflake_soda_params["connection_timeout"] = "5"  # minutes
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
-            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
-            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-
-            "connection_timeout": 5,  # minutes
+            **snowflake_soda_params,
         }
     }
     soda_configuration_str = yaml.dump(soda_configuration)
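Any environment variable with the DATACONTRACT_SNOWFLAKE_ prefix is now passed through to the Soda Snowflake data source, lower-cased and with the prefix stripped, so parameters such as warehouse or authenticator no longer need code changes. A sketch of the mapping (values are placeholders):

import os

os.environ["DATACONTRACT_SNOWFLAKE_USERNAME"] = "<user>"
os.environ["DATACONTRACT_SNOWFLAKE_WAREHOUSE"] = "COMPUTE_WH"

prefix = "DATACONTRACT_SNOWFLAKE_"
params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
print(params)  # includes {'username': '<user>', 'warehouse': 'COMPUTE_WH'}; connection_timeout still defaults to "5"
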
datacontract/export/avro_converter.py
CHANGED
@@ -40,11 +40,21 @@ def to_avro_field(field, field_name):
     avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
-
+    is_required_avro = field.required if field.required is not None else True
+    avro_type = to_avro_type(field, field_name)
+    avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+    if avro_field["type"] == "enum":
+        avro_field["type"] = {
+            "type": "enum",
+            "name": field.title,
+            "symbols": field.enum,
+        }

     if field.config:
         if "avroDefault" in field.config:
-
+            if field.config.get("avroType") != "enum":
+                avro_field["default"] = field.config["avroDefault"]

     return avro_field
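Optional fields now export as Avro union types with null. A short sketch using the converter directly (field names are illustrative; a field with required unset still falls back to a plain required type):

from datacontract.model.data_contract_specification import Field
from datacontract.export.avro_converter import to_avro_field

print(to_avro_field(Field(type="string", required=False), "middle_name"))
# expected shape: {'name': 'middle_name', 'type': ['null', 'string']}
print(to_avro_field(Field(type="string", required=True), "last_name"))
# expected shape: {'name': 'last_name', 'type': 'string'}
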
datacontract/export/dbml_converter.py
CHANGED
@@ -3,6 +3,7 @@ from importlib.metadata import version
 from typing import Tuple

 import pytz
+from datacontract.model.exceptions import DataContractException

 import datacontract.model.data_contract_specification as spec
 from datacontract.export.sql_type_converter import convert_to_sql_type
@@ -48,17 +49,7 @@ Using {5} Types for the field types
 {0}
 */
 """.format(generated_info)
-
-    note = """Note project_info {{
-    '''
-    {0}
-    '''
-}}
-    """.format(generated_info)
-
-    return """{0}
-{1}
-    """.format(comment, note)
+    return comment


 def get_version() -> str:
@@ -70,19 +61,18 @@ def get_version() -> str:

 def generate_project_info(contract: spec.DataContractSpecification) -> str:
     return """Project "{0}" {{
-Note:
+Note: '''{1}'''
 }}\n
-    """.format(contract.info.title,
+    """.format(contract.info.title, contract.info.description)


 def generate_table(model_name: str, model: spec.Model, server: spec.Server) -> str:
     result = """Table "{0}" {{
-Note:
-    """.format(model_name,
+Note: {1}
+    """.format(model_name, formatDescription(model.description))

     references = []

-    # Add all the fields
     for field_name, field in model.fields.items():
         ref, field_string = generate_field(field_name, field, model_name, server)
         if ref is not None:
@@ -102,6 +92,30 @@ Note: "{1}"


 def generate_field(field_name: str, field: spec.Field, model_name: str, server: spec.Server) -> Tuple[str, str]:
+    if field.primary:
+        if field.required is not None:
+            if not field.required:
+                raise DataContractException(
+                    type="lint",
+                    name="Primary key fields cannot have required == False.",
+                    result="error",
+                    reason="Primary key fields cannot have required == False.",
+                    engine="datacontract",
+                )
+        else:
+            field.required = True
+        if field.unique is not None:
+            if not field.unique:
+                raise DataContractException(
+                    type="lint",
+                    name="Primary key fields cannot have unique == False",
+                    result="error",
+                    reason="Primary key fields cannot have unique == False.",
+                    engine="datacontract",
+                )
+        else:
+            field.unique = True
+
     field_attrs = []
     if field.primary:
         field_attrs.append("pk")
@@ -115,13 +129,22 @@ def generate_field(field_name: str, field: spec.Field, model_name: str, server:
         field_attrs.append("null")

     if field.description:
-        field_attrs.append(
+        field_attrs.append("""Note: {0}""".format(formatDescription(field.description)))

     field_type = field.type if server is None else convert_to_sql_type(field, server.type)

     field_str = '"{0}" "{1}" [{2}]'.format(field_name, field_type, ",".join(field_attrs))
     ref_str = None
     if (field.references) is not None:
-
-
+        if field.unique:
+            ref_str = "{0}.{1} - {2}".format(model_name, field_name, field.references)
+        else:
+            ref_str = "{0}.{1} > {2}".format(model_name, field_name, field.references)
     return (ref_str, field_str)
+
+
+def formatDescription(input: str) -> str:
+    if "\n" in input or "\r" in input or '"' in input:
+        return "'''{0}'''".format(input)
+    else:
+        return '"{0}"'.format(input)
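Descriptions are now quoted defensively in the generated DBML, and primary-key fields are linted for contradictory required/unique settings. A sketch of the new quoting helper (inputs are illustrative):

from datacontract.export.dbml_converter import formatDescription

print(formatDescription("Order identifier"))  # -> "Order identifier"
print(formatDescription('A "raw" value'))     # -> '''A "raw" value'''
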
datacontract/export/exporter.py
CHANGED
@@ -35,9 +35,10 @@ class ExportFormat(str, Enum):
     bigquery = "bigquery"
     dbml = "dbml"
     spark = "spark"
+    sqlalchemy = "sqlalchemy"

     @classmethod
-    def
+    def get_supported_formats(cls):
         return list(map(lambda c: c.value, cls))

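A quick sketch of the new enum value and the renamed classmethod:

from datacontract.export.exporter import ExportFormat

print(ExportFormat.sqlalchemy.value)         # "sqlalchemy"
print(ExportFormat.get_supported_formats())  # [..., "dbml", "spark", "sqlalchemy"]
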
datacontract/export/exporter_factory.py
CHANGED
@@ -143,3 +143,9 @@ exporter_factory.register_lazy_exporter(
 exporter_factory.register_lazy_exporter(
     name=ExportFormat.spark, module_path="datacontract.export.spark_converter", class_name="SparkExporter"
 )
+
+exporter_factory.register_lazy_exporter(
+    name=ExportFormat.sqlalchemy,
+    module_path="datacontract.export.sqlalchemy_converter",
+    class_name="SQLAlchemyExporter",
+)
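With the lazy registration above, the new format is reachable through the regular export path; the SQLAlchemyExporter module is only imported the first time the format is requested. A hedged usage sketch (the file path is a placeholder, and the entry point is assumed to be the existing DataContract.export):

from datacontract.data_contract import DataContract

code = DataContract(data_contract_file="datacontract.yaml").export("sqlalchemy")
print(code)  # generated SQLAlchemy model code
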
datacontract/export/jsonschema_converter.py
CHANGED
@@ -36,10 +36,7 @@ def to_property(field: Field) -> dict:
     property = {}
     json_type, json_format = convert_type_format(field.type, field.format)
     if json_type is not None:
-
-        property["type"] = json_type
-    else:
-        property["type"] = [json_type, "null"]
+        property["type"] = json_type
     if json_format is not None:
         property["format"] = json_format
     if field.unique:
datacontract/export/spark_converter.py
CHANGED
@@ -123,6 +123,8 @@ def to_data_type(field: Field) -> types.DataType:
         return types.ArrayType(to_data_type(field.items))
     if field_type in ["object", "record", "struct"]:
         return types.StructType(to_struct_type(field.fields))
+    if field_type == "map":
+        return types.MapType(to_data_type(field.keys), to_data_type(field.values))
     if field_type in ["string", "varchar", "text"]:
         return types.StringType()
     if field_type in ["number", "decimal", "numeric"]:
@@ -204,6 +206,8 @@ def print_schema(dtype: types.DataType) -> str:
         return format_struct_type(dtype)
     elif isinstance(dtype, types.ArrayType):
         return f"ArrayType({print_schema(dtype.elementType)})"
+    elif isinstance(dtype, types.MapType):
+        return f"MapType(\n{indent(print_schema(dtype.keyType), 1)}, {print_schema(dtype.valueType)})"
     elif isinstance(dtype, types.DecimalType):
         return f"DecimalType({dtype.precision}, {dtype.scale})"
     else: