datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/engines/soda/connections/kafka.py

@@ -1,43 +1,41 @@
+import atexit
+import logging
 import os
-
-import pyspark.sql.functions as fn
-from pyspark.sql import SparkSession
-from pyspark.sql.avro.functions import from_avro
-from pyspark.sql.functions import from_json, col
-from pyspark.sql.types import (
-    StructType,
-    DataType,
-    NullType,
-    ArrayType,
-    BinaryType,
-    DateType,
-    TimestampNTZType,
-    TimestampType,
-    BooleanType,
-    LongType,
-    IntegerType,
-    DoubleType,
-    DecimalType,
-    StringType,
-    StructField,
-)
+import tempfile
 
 from datacontract.export.avro_converter import to_avro_schema_json
-from datacontract.model.data_contract_specification import
-    DataContractSpecification, Server, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Server
 from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum
+
+
+def create_spark_session():
+    """Create and configure a Spark session."""
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result=ResultEnum.failed,
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
 
+    tmp_dir = tempfile.TemporaryDirectory(prefix="datacontract-cli-spark")
+    atexit.register(tmp_dir.cleanup)
 
-
-    # TODO: Update dependency versions when updating pyspark
-    # TODO: add protobuf library
+    pyspark_version = "3.5.5"  # MUST be the same as in the pyproject.toml
     spark = (
         SparkSession.builder.appName("datacontract")
-        .config("spark.sql.warehouse.dir", tmp_dir
-        .config("spark.streaming.stopGracefullyOnShutdown",
+        .config("spark.sql.warehouse.dir", f"{tmp_dir}/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", "true")
+        .config("spark.ui.enabled", "false")
         .config(
            "spark.jars.packages",
-            "org.apache.spark:spark-sql-kafka-0-10_2.12:
+            f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version},org.apache.spark:spark-avro_2.12:{pyspark_version}",
        )
        .getOrCreate()
    )
@@ -46,107 +44,202 @@ def create_spark_session(tmp_dir) -> SparkSession:
     return spark
 
 
-def read_kafka_topic(spark
-…
-    topic = server.topic
-    auth_options = get_auth_options()
+def read_kafka_topic(spark, data_contract: DataContractSpecification, server: Server):
+    """Read and process data from a Kafka topic based on the server configuration."""
 
-
+    logging.info("Reading data from Kafka server %s topic %s", server.host, server.topic)
     df = (
         spark.read.format("kafka")
-        .options(**
-        .option("kafka.bootstrap.servers", host)
-        .option("subscribe", topic)
+        .options(**get_auth_options())
+        .option("kafka.bootstrap.servers", server.host)
+        .option("subscribe", server.topic)
         .option("startingOffsets", "earliest")
         .load()
     )
-
+
     model_name, model = next(iter(data_contract.models.items()))
-…
+
+    match server.format:
+        case "avro":
+            process_avro_format(df, model_name, model)
+        case "json":
+            process_json_format(df, model_name, model)
+        case _:
+            raise DataContractException(
+                type="test",
+                name="Configuring Kafka checks",
+                result="warning",
+                reason=f"Kafka format '{server.format}' is not supported. Skip executing tests.",
+                engine="datacontract",
+            )
+
+
+def process_avro_format(df, model_name, model):
+    try:
+        from pyspark.sql.avro.functions import from_avro
+        from pyspark.sql.functions import col, expr
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    avro_schema = to_avro_schema_json(model_name, model)
+    df2 = df.withColumn("fixedValue", expr("substring(value, 6, length(value)-5)"))
+    options = {"mode": "PERMISSIVE"}
+    df2.select(from_avro(col("fixedValue"), avro_schema, options).alias("avro")).select(
+        col("avro.*")
+    ).createOrReplaceTempView(model_name)
+
+
+def process_json_format(df, model_name, model):
+    try:
+        from pyspark.sql.functions import col, from_json
+    except ImportError as e:
         raise DataContractException(
-            type="
-…
-            reason=
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
             engine="datacontract",
+            original_exception=e,
         )
 
-…
+    struct_type = to_struct_type(model.fields)
+    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").select(
+        from_json(col("value"), struct_type, {"mode": "PERMISSIVE"}).alias("json")
+    ).select(col("json.*")).createOrReplaceTempView(model_name)
 
 
 def get_auth_options():
+    """Retrieve Kafka authentication options from environment variables."""
     kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
     kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
-…
+    kafka_sasl_mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()
+
+    # Skip authentication if credentials are not provided
+    if not kafka_sasl_username or not kafka_sasl_password:
+        return {}
+
+    # SASL mechanisms supported by Kafka
+    jaas_config = {
+        "PLAIN": (
+            f"org.apache.kafka.common.security.plain.PlainLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-256": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        "SCRAM-SHA-512": (
+            f"org.apache.kafka.common.security.scram.ScramLoginModule required "
+            f'username="{kafka_sasl_username}" password="{kafka_sasl_password}";'
+        ),
+        # Add more mechanisms as needed
+    }
+
+    # Validate SASL mechanism
+    if kafka_sasl_mechanism not in jaas_config:
+        raise ValueError(f"Unsupported SASL mechanism: {kafka_sasl_mechanism}")
+
+    # Return config
+    return {
+        "kafka.sasl.mechanism": kafka_sasl_mechanism,
+        "kafka.security.protocol": "SASL_SSL",
+        "kafka.sasl.jaas.config": jaas_config[kafka_sasl_mechanism],
+    }
 
 
 def to_struct_type(fields):
-…
+    try:
+        from pyspark.sql.types import StructType
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Convert field definitions to Spark StructType."""
+    return StructType([to_struct_field(field_name, field) for field_name, field in fields.items()])
+
+
+def to_struct_field(field_name: str, field: Field):
+    try:
+        from pyspark.sql.types import (
+            ArrayType,
+            BinaryType,
+            BooleanType,
+            DataType,
+            DateType,
+            DecimalType,
+            DoubleType,
+            IntegerType,
+            LongType,
+            NullType,
+            StringType,
+            StructField,
+            StructType,
+            TimestampNTZType,
+            TimestampType,
+        )
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="pyspark is missing",
+            reason="Install the extra datacontract-cli[kafka] to use kafka",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    """Map field definitions to Spark StructField using match-case."""
+    match field.type:
+        case "string" | "varchar" | "text":
+            data_type = StringType()
+        case "number" | "decimal" | "numeric":
+            data_type = DecimalType()
+        case "float" | "double":
+            data_type = DoubleType()
+        case "integer" | "int":
+            data_type = IntegerType()
+        case "long" | "bigint":
+            data_type = LongType()
+        case "boolean":
+            data_type = BooleanType()
+        case "timestamp" | "timestamp_tz":
+            data_type = TimestampType()
+        case "timestamp_ntz":
+            data_type = TimestampNTZType()
+        case "date":
+            data_type = DateType()
+        case "time":
+            data_type = DataType()  # Specific handling for time type
+        case "object" | "record" | "struct":
+            data_type = StructType(
+                [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+            )
+        case "binary":
+            data_type = BinaryType()
+        case "array":
+            element_type = (
+                StructType(
+                    [to_struct_field(sub_field_name, sub_field) for sub_field_name, sub_field in field.fields.items()]
+                )
+                if field.fields
+                else DataType()
+            )
+            data_type = ArrayType(element_type)
+        case "null":
+            data_type = NullType()
+        case _:
+            data_type = DataType()  # Fallback generic DataType
 
     return StructField(field_name, data_type, nullable=not field.required)
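The new `get_auth_options()` drives Kafka authentication entirely from environment variables: if no credentials are set it returns an empty options dict, otherwise the configured SASL mechanism selects a matching JAAS login module. A minimal standalone sketch of that behavior (the env-var names come from the diff above; the function itself is a simplified re-implementation for illustration, not the packaged one):

```python
import os


def kafka_auth_options_sketch() -> dict:
    """Simplified mirror of get_auth_options() as shown in the diff."""
    username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
    password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
    mechanism = os.getenv("DATACONTRACT_KAFKA_SASL_MECHANISM", "PLAIN").upper()

    # No credentials: skip authentication (e.g. a local, unauthenticated broker)
    if not username or not password:
        return {}

    login_module = {
        "PLAIN": "org.apache.kafka.common.security.plain.PlainLoginModule",
        "SCRAM-SHA-256": "org.apache.kafka.common.security.scram.ScramLoginModule",
        "SCRAM-SHA-512": "org.apache.kafka.common.security.scram.ScramLoginModule",
    }
    if mechanism not in login_module:
        raise ValueError(f"Unsupported SASL mechanism: {mechanism}")

    return {
        "kafka.sasl.mechanism": mechanism,
        "kafka.security.protocol": "SASL_SSL",
        "kafka.sasl.jaas.config": (
            f'{login_module[mechanism]} required username="{username}" password="{password}";'
        ),
    }


if __name__ == "__main__":
    # Example values, purely illustrative
    os.environ["DATACONTRACT_KAFKA_SASL_USERNAME"] = "demo-user"
    os.environ["DATACONTRACT_KAFKA_SASL_PASSWORD"] = "demo-secret"
    os.environ["DATACONTRACT_KAFKA_SASL_MECHANISM"] = "SCRAM-SHA-512"
    print(kafka_auth_options_sketch())
```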
datacontract/engines/soda/connections/snowflake.py

@@ -4,17 +4,20 @@ import yaml
 
 
 def to_snowflake_soda_configuration(server):
+    prefix = "DATACONTRACT_SNOWFLAKE_"
+    snowflake_soda_params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
+
+    # backward compatibility
+    if "connection_timeout" not in snowflake_soda_params:
+        snowflake_soda_params["connection_timeout"] = "5"  # minutes
+
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
-            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
-            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-…
-            "connection_timeout": 5,  # minutes
+            **snowflake_soda_params,
         }
     }
     soda_configuration_str = yaml.dump(soda_configuration)
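Instead of reading a fixed set of variables, the Snowflake connection now forwards every `DATACONTRACT_SNOWFLAKE_*` environment variable into the Soda data source, lower-cased and with the prefix stripped, and keeps the old `connection_timeout` default of 5 minutes. A small illustrative sketch of that pass-through (the variable values are made up):

```python
import os

# Example env vars; any DATACONTRACT_SNOWFLAKE_* key would be forwarded the same way.
os.environ["DATACONTRACT_SNOWFLAKE_USERNAME"] = "report_user"
os.environ["DATACONTRACT_SNOWFLAKE_PASSWORD"] = "s3cret"
os.environ["DATACONTRACT_SNOWFLAKE_WAREHOUSE"] = "COMPUTE_WH"

prefix = "DATACONTRACT_SNOWFLAKE_"
params = {k.replace(prefix, "").lower(): v for k, v in os.environ.items() if k.startswith(prefix)}
params.setdefault("connection_timeout", "5")  # minutes, backward-compatible default

# e.g. {'username': 'report_user', 'password': 's3cret', 'warehouse': 'COMPUTE_WH', 'connection_timeout': '5'}
print(params)
```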
datacontract/engines/soda/connections/sqlserver.py (new file)

@@ -0,0 +1,43 @@
+import os
+
+import yaml
+
+from datacontract.model.data_contract_specification import Server
+
+
+def to_sqlserver_soda_configuration(server: Server) -> str:
+    """Serialize server config to soda configuration.
+
+
+    ### Example:
+    type: sqlserver
+    host: host
+    port: '1433'
+    username: simple
+    password: simple_pass
+    database: database
+    schema: dbo
+    trusted_connection: false
+    encrypt: false
+    trust_server_certificate: false
+    driver: ODBC Driver 18 for SQL Server
+    """
+    # with service account key, using an external json file
+    soda_configuration = {
+        f"data_source {server.type}": {
+            "type": "sqlserver",
+            "host": server.host,
+            "port": str(server.port),
+            "username": os.getenv("DATACONTRACT_SQLSERVER_USERNAME", ""),
+            "password": os.getenv("DATACONTRACT_SQLSERVER_PASSWORD", ""),
+            "database": server.database,
+            "schema": server.schema_,
+            "trusted_connection": os.getenv("DATACONTRACT_SQLSERVER_TRUSTED_CONNECTION", False),
+            "trust_server_certificate": os.getenv("DATACONTRACT_SQLSERVER_TRUST_SERVER_CERTIFICATE", False),
+            "encrypt": os.getenv("DATACONTRACT_SQLSERVER_ENCRYPTED_CONNECTION", True),
+            "driver": server.driver,
+        }
+    }
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
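The function dumps the nested dict to YAML for Soda. A rough sketch of the resulting configuration, assuming PyYAML is installed and using hypothetical server values shaped like the keys in the diff above:

```python
import yaml

# Hand-written dict mirroring what to_sqlserver_soda_configuration() would build;
# host, credentials, and database names here are placeholders.
soda_configuration = {
    "data_source sqlserver": {
        "type": "sqlserver",
        "host": "localhost",
        "port": "1433",
        "username": "simple",
        "password": "simple_pass",
        "database": "database",
        "schema": "dbo",
        "trusted_connection": False,
        "trust_server_certificate": False,
        "encrypt": True,
        "driver": "ODBC Driver 18 for SQL Server",
    }
}

# Prints a YAML block under the "data_source sqlserver" key
print(yaml.dump(soda_configuration))
```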
datacontract/engines/soda/connections/trino.py (new file)

@@ -0,0 +1,26 @@
+import os
+
+import yaml
+
+
+def to_trino_soda_configuration(server):
+    password = os.getenv("DATACONTRACT_TRINO_PASSWORD")
+    username = os.getenv("DATACONTRACT_TRINO_USERNAME")
+
+    data_source = {
+        "type": "trino",
+        "host": server.host,
+        "port": str(server.port),
+        "username": username,
+        "password": password,
+        "catalog": server.catalog,
+        "schema": server.schema_,
+    }
+
+    if password is None or password == "":
+        data_source["auth_type"] = "NoAuthentication"  # default is BasicAuthentication
+
+    soda_configuration = {f"data_source {server.type}": data_source}
+
+    soda_configuration_str = yaml.dump(soda_configuration)
+    return soda_configuration_str
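Notable in the new Trino connection is the authentication fallback: when `DATACONTRACT_TRINO_PASSWORD` is empty or unset, the Soda data source is switched to `NoAuthentication`. A small sketch of that logic with made-up host, catalog, and schema values:

```python
import os
import yaml

# Illustrative only: mirrors the fallback in the new trino.py above.
password = os.getenv("DATACONTRACT_TRINO_PASSWORD")

data_source = {
    "type": "trino",
    "host": "trino.example.com",  # placeholder values
    "port": "443",
    "username": os.getenv("DATACONTRACT_TRINO_USERNAME"),
    "password": password,
    "catalog": "hive",
    "schema": "public",
}
if password is None or password == "":
    # Soda's default would be BasicAuthentication
    data_source["auth_type"] = "NoAuthentication"

print(yaml.dump({"data_source trino": data_source}))
```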
datacontract/export/avro_converter.py

@@ -1,8 +1,15 @@
 import json
 
+from datacontract.export.exporter import Exporter, _check_models_for_export
 from datacontract.model.data_contract_specification import Field
 
 
+class AvroExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_avro_schema_json(model_name, model_value)
+
+
 def to_avro_schema(model_name, model) -> dict:
     return to_avro_record(model_name, model.fields, model.description, model.namespace)
 
@@ -33,19 +40,70 @@ def to_avro_field(field, field_name):
     avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
-
+    is_required_avro = field.required if field.required is not None else True
+    avro_type = to_avro_type(field, field_name)
+    avro_field["type"] = avro_type if is_required_avro else ["null", avro_type]
+
+    # Handle enum types - both required and optional
+    if avro_type == "enum" or (isinstance(avro_field["type"], list) and "enum" in avro_field["type"]):
+        enum_def = {
+            "type": "enum",
+            "name": field.title,
+            "symbols": field.enum,
+        }
+        if is_required_avro:
+            avro_field["type"] = enum_def
+        else:
+            # Replace "enum" with the full enum definition in the union
+            avro_field["type"] = ["null", enum_def]
+
+    if field.config:
+        if "avroDefault" in field.config:
+            if field.config.get("avroType") != "enum":
+                avro_field["default"] = field.config["avroDefault"]
+
     return avro_field
 
 
 def to_avro_type(field: Field, field_name: str) -> str | dict:
+    if field.config:
+        if "avroLogicalType" in field.config and "avroType" in field.config:
+            return {"type": field.config["avroType"], "logicalType": field.config["avroLogicalType"]}
+        if "avroLogicalType" in field.config:
+            if field.config["avroLogicalType"] in [
+                "timestamp-millis",
+                "timestamp-micros",
+                "local-timestamp-millis",
+                "local-timestamp-micros",
+                "time-micros",
+            ]:
+                return {"type": "long", "logicalType": field.config["avroLogicalType"]}
+            if field.config["avroLogicalType"] in ["time-millis", "date"]:
+                return {"type": "int", "logicalType": field.config["avroLogicalType"]}
+        if "avroType" in field.config:
+            return field.config["avroType"]
+
+    # Check for enum fields based on presence of enum list and avroType config
+    if field.enum and field.config and field.config.get("avroType") == "enum":
+        return "enum"
+
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
         return "string"
-    elif field.type in ["number", "
+    elif field.type in ["number", "numeric"]:
         # https://avro.apache.org/docs/1.11.1/specification/#decimal
         return "bytes"
-    elif field.type in ["
+    elif field.type in ["decimal"]:
+        typeVal = {"type": "bytes", "logicalType": "decimal"}
+        if field.scale is not None:
+            typeVal["scale"] = field.scale
+        if field.precision is not None:
+            typeVal["precision"] = field.precision
+        return typeVal
+    elif field.type in ["float"]:
+        return "float"
+    elif field.type in ["double"]:
         return "double"
     elif field.type in ["integer", "int"]:
         return "int"
@@ -54,20 +112,26 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
     elif field.type in ["boolean"]:
         return "boolean"
     elif field.type in ["timestamp", "timestamp_tz"]:
-        return "
+        return {"type": "long", "logicalType": "timestamp-millis"}
     elif field.type in ["timestamp_ntz"]:
-        return "
+        return {"type": "long", "logicalType": "local-timestamp-millis"}
     elif field.type in ["date"]:
-        return "int"
+        return {"type": "int", "logicalType": "date"}
     elif field.type in ["time"]:
         return "long"
+    elif field.type in ["map"]:
+        if field.config is not None and "values" in field.config:
+            return {"type": "map", "values": field.config["values"]}
+        else:
+            return "bytes"
     elif field.type in ["object", "record", "struct"]:
+        if field.config is not None and "namespace" in field.config:
+            return to_avro_record(field_name, field.fields, field.description, field.config["namespace"])
         return to_avro_record(field_name, field.fields, field.description, None)
     elif field.type in ["binary"]:
         return "bytes"
     elif field.type in ["array"]:
-
-        return "array"
+        return {"type": "array", "items": to_avro_type(field.items, field_name)}
     elif field.type in ["null"]:
         return "null"
     else:
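The practical effect of the new `to_avro_type()` branches: dates and timestamps now carry Avro logical types, decimal fields keep their precision and scale, and arrays map their item type. A rough sketch of the expected mappings for a few hypothetical fields (the output dicts are hand-written from the rules in the diff above, not produced by calling the package):

```python
import json

# Hypothetical contract fields -> Avro types, per the new branches shown above.
expected = {
    "order_date (type: date)": {"type": "int", "logicalType": "date"},
    "processed_at (type: timestamp)": {"type": "long", "logicalType": "timestamp-millis"},
    "amount (type: decimal, precision 10, scale 2)": {
        "type": "bytes",
        "logicalType": "decimal",
        "scale": 2,
        "precision": 10,
    },
    "tags (type: array of string)": {"type": "array", "items": "string"},
}

print(json.dumps(expected, indent=2))
```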
datacontract/export/avro_idl_converter.py

@@ -3,35 +3,12 @@ from dataclasses import dataclass
 from enum import Enum
 from io import StringIO
 
+from datacontract.export.exporter import Exporter
 from datacontract.lint.resolve import inline_definitions_into_data_contract
-from datacontract.model.data_contract_specification import
-    DataContractSpecification, Field
+from datacontract.model.data_contract_specification import DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException
 
 
-def to_avro_idl(contract: DataContractSpecification) -> str:
-    """Serialize the provided data contract specification into an Avro IDL string.
-
-    The data contract will be serialized as a protocol, with one record type
-    for each contained model. Model fields are mapped one-to-one to Avro IDL
-    record fields.
-    """
-    stream = StringIO()
-    to_avro_idl_stream(contract, stream)
-    return stream.getvalue()
-
-
-def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
-    """Serialize the provided data contract specification into Avro IDL."""
-    ir = _contract_to_avro_idl_ir(contract)
-    if ir.description:
-        stream.write(f"/** {contract.info.description} */\n")
-    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
-    for model_type in ir.model_types:
-        _write_model_type(model_type, stream)
-    stream.write("}\n")
-
-
 class AvroPrimitiveType(Enum):
     int = "int"
     long = "long"
@@ -86,6 +63,7 @@ class AvroIDLProtocol:
     model_types: list[AvroModelType]
 
 
+# TODO use DATACONTRACT_TYPES from datacontract/model/data_contract_specification.py
 avro_primitive_types = set(
     [
         "string",
@@ -108,6 +86,34 @@ avro_primitive_types = set(
 )
 
 
+class AvroIdlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_avro_idl(data_contract)
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
     result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
     match field.type:
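The Avro IDL functions are unchanged in behavior but now sit behind an `AvroIdlExporter` that plugs into the new exporter factory; the contract is still emitted as one `protocol` wrapping one record per model. A minimal standalone sketch of that protocol wrapping, in the spirit of `to_avro_idl_stream()` (record layout, names, and field types here are made up for illustration; the real `_write_model_type` formatting may differ):

```python
from io import StringIO


def write_protocol_sketch(protocol_name: str, records: dict[str, dict[str, str]]) -> str:
    """Write an Avro IDL-style protocol with one record per model (illustrative only)."""
    stream = StringIO()
    stream.write(f"protocol {protocol_name} {{\n")
    for record_name, fields in records.items():
        stream.write(f"    record {record_name} {{\n")
        for field_name, avro_type in fields.items():
            stream.write(f"        {avro_type} {field_name};\n")
        stream.write("    }\n")
    stream.write("}\n")
    return stream.getvalue()


# Hypothetical contract with a single "orders" model
print(write_protocol_sketch("orders", {"orders": {"order_id": "string", "amount": "double"}}))
```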
|