datacontract-cli 0.9.7__py3-none-any.whl → 0.9.8__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of datacontract-cli might be problematic.
- datacontract/breaking/breaking.py +48 -57
- datacontract/cli.py +98 -80
- datacontract/data_contract.py +156 -106
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +5 -1
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +9 -8
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +26 -22
- datacontract/engines/fastjsonschema/check_jsonschema.py +31 -25
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +8 -6
- datacontract/engines/soda/check_soda_execute.py +46 -35
- datacontract/engines/soda/connections/bigquery.py +5 -3
- datacontract/engines/soda/connections/dask.py +0 -1
- datacontract/engines/soda/connections/databricks.py +2 -2
- datacontract/engines/soda/connections/duckdb.py +4 -4
- datacontract/engines/soda/connections/kafka.py +36 -17
- datacontract/engines/soda/connections/postgres.py +3 -3
- datacontract/engines/soda/connections/snowflake.py +4 -4
- datacontract/export/avro_converter.py +3 -7
- datacontract/export/avro_idl_converter.py +65 -42
- datacontract/export/dbt_converter.py +43 -32
- datacontract/export/great_expectations_converter.py +141 -0
- datacontract/export/jsonschema_converter.py +3 -1
- datacontract/export/odcs_converter.py +5 -7
- datacontract/export/protobuf_converter.py +12 -10
- datacontract/export/pydantic_converter.py +140 -0
- datacontract/export/rdf_converter.py +34 -11
- datacontract/export/sodacl_converter.py +24 -24
- datacontract/export/sql_converter.py +20 -9
- datacontract/export/sql_type_converter.py +44 -4
- datacontract/export/terraform_converter.py +4 -3
- datacontract/imports/avro_importer.py +32 -10
- datacontract/imports/sql_importer.py +0 -2
- datacontract/init/download_datacontract_file.py +2 -2
- datacontract/integration/publish_datamesh_manager.py +4 -9
- datacontract/integration/publish_opentelemetry.py +30 -16
- datacontract/lint/files.py +2 -2
- datacontract/lint/lint.py +26 -31
- datacontract/lint/linters/description_linter.py +12 -21
- datacontract/lint/linters/example_model_linter.py +28 -29
- datacontract/lint/linters/field_pattern_linter.py +8 -8
- datacontract/lint/linters/field_reference_linter.py +11 -10
- datacontract/lint/linters/notice_period_linter.py +18 -22
- datacontract/lint/linters/primary_field_linter.py +10 -12
- datacontract/lint/linters/quality_schema_linter.py +16 -20
- datacontract/lint/linters/valid_constraints_linter.py +42 -37
- datacontract/lint/resolve.py +7 -10
- datacontract/lint/schema.py +2 -3
- datacontract/lint/urls.py +4 -5
- datacontract/model/breaking_change.py +2 -1
- datacontract/model/data_contract_specification.py +8 -7
- datacontract/model/exceptions.py +13 -2
- datacontract/model/run.py +1 -1
- datacontract/web.py +3 -7
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.8.dist-info}/METADATA +176 -37
- datacontract_cli-0.9.8.dist-info/RECORD +63 -0
- datacontract_cli-0.9.7.dist-info/RECORD +0 -61
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.8.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.8.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.8.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.8.dist-info}/top_level.txt +0 -0
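Two export modules are new in this release: datacontract/export/great_expectations_converter.py and datacontract/export/pydantic_converter.py (both added with no removals). A minimal sketch of driving an export through the package's Python API follows; the export_format identifiers "great-expectations" and "pydantic-model" are assumptions about how the new converters are registered and should be verified against `datacontract export --help`:

    from datacontract.data_contract import DataContract

    data_contract = DataContract(data_contract_file="datacontract.yaml")
    # Format names below are assumed, not confirmed by this diff.
    expectations = data_contract.export(export_format="great-expectations")
    models_py = data_contract.export(export_format="pydantic-model")
    print(models_py)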
datacontract/engines/soda/check_soda_execute.py

@@ -16,8 +16,7 @@ from datacontract.engines.soda.connections.snowflake import \
 from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.model.data_contract_specification import \
     DataContractSpecification, Server
-from datacontract.model.run import \
-    Run, Check, Log
+from datacontract.model.run import Run, Check, Log


 def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
@@ -34,13 +33,15 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
-            run.checks.append(
-
-
-
-
-
-
+            run.checks.append(
+                Check(
+                    type="general",
+                    name="Check that format is supported",
+                    result="warning",
+                    reason=f"Format {server.format} not yet supported by datacontract CLI",
+                    engine="datacontract",
+                )
+            )
             run.log_warn(f"Format {server.format} not yet supported by datacontract CLI")
             return
     elif server.type == "snowflake":
@@ -73,21 +74,24 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         scan.set_data_source_name(server.type)

     else:
-        run.checks.append(
-
-
-
-
-
-
+        run.checks.append(
+            Check(
+                type="general",
+                name="Check that server type is supported",
+                result="warning",
+                reason=f"Server type {server.type} not yet supported by datacontract CLI",
+                engine="datacontract-cli",
+            )
+        )
         run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
         return

     # Don't check types for json format, as they are checked with json schema
     # Don't check types for avro format, as they are checked with avro schema
     # Don't check types for csv format, as they are hard to detect
+    server_type = server.type
     check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-    sodacl_yaml_str = to_sodacl_yaml(data_contract, check_types)
+    sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
     # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
     scan.add_sodacl_yaml_str(sodacl_yaml_str)

@@ -102,9 +106,12 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     for c in scan_results.get("checks"):
         check = Check(
             type="schema",
-            result="passed"
-
-
+            result="passed"
+            if c.get("outcome") == "pass"
+            else "failed"
+            if c.get("outcome") == "fail"
+            else c.get("outcome"),
+            reason=", ".join(c.get("outcomeReasons")),
             name=c.get("name"),
             model=c.get("table"),
             field=c.get("column"),
@@ -114,21 +121,25 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         run.checks.append(check)

     for log in scan_results.get("logs"):
-        run.logs.append(
-
-
-
-
+        run.logs.append(
+            Log(
+                timestamp=log.get("timestamp"),
+                level=log.get("level"),
+                message=log.get("message"),
+            )
+        )

     if scan.has_error_logs():
         run.log_warn("Engine soda-core has errors. See the logs for details.")
-        run.checks.append(
-
-
-
-
-
-
+        run.checks.append(
+            Check(
+                type="general",
+                name="Execute quality checks",
+                result="warning",
+                reason="Engine soda-core has errors. See the logs for details.",
+                engine="soda-core",
+            )
+        )
         return


@@ -138,10 +149,10 @@ def update_reason(check, c):
         return
     if check.reason is not None and check.reason != "":
         return
-    for block in c[
-        if block[
+    for block in c["diagnostics"]["blocks"]:
+        if block["title"] == "Diagnostics":
             # Extract and print the 'text' value
-            diagnostics_text = block[
+            diagnostics_text = block["text"]
             print(diagnostics_text)
             diagnostics_text_split = diagnostics_text.split(":icon-fail: ")
             if len(diagnostics_text_split) > 1:

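With the changes above, unsupported formats and server types are now recorded as warning-level Check entries on the Run (not only as log lines), and to_sodacl_yaml receives the server type as an extra argument. A hedged sketch of how this surfaces through the Python API; the contract path is a placeholder:

    from datacontract.data_contract import DataContract

    data_contract = DataContract(data_contract_file="datacontract.yaml")
    run = data_contract.test()  # drives check_soda_execute for the selected server
    for check in run.checks:
        # warnings such as "Check that server type is supported" appear here in 0.9.8
        print(check.result, check.name, check.reason)
    print("passed" if run.has_passed() else "failed")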
datacontract/engines/soda/connections/bigquery.py

@@ -1,18 +1,20 @@
 import os
+
 import yaml

+
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
     # with service account key, using an external json file
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "bigquery",
-            "account_info_json_path": os.getenv(
+            "account_info_json_path": os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"),
             "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
             "project_id": server.project,
-            "dataset": server.dataset
+            "dataset": server.dataset,
         }
     }

     soda_configuration_str = yaml.dump(soda_configuration)
-    return soda_configuration_str
+    return soda_configuration_str

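For orientation, a small sketch of calling the updated helper, using only names visible in this hunk; the Server values and the key file path are placeholders:

    import os

    from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
    from datacontract.model.data_contract_specification import Server

    os.environ["DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"] = "/secrets/bigquery-sa.json"  # placeholder
    server = Server(type="bigquery", project="my-project", dataset="my_dataset")  # placeholder values
    print(to_bigquery_soda_configuration(server))
    # Per the hunk, this prints a YAML document keyed "data_source bigquery" with
    # type, account_info_json_path, auth_scopes, project_id and dataset entries.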
datacontract/engines/soda/connections/databricks.py

@@ -11,8 +11,8 @@ def to_databricks_soda_configuration(server):
             "host": server.host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path": os.getenv(
-            "token": os.getenv(
+            "http_path": os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH"),
+            "token": os.getenv("DATACONTRACT_DATABRICKS_TOKEN"),
         }
     }

datacontract/engines/soda/connections/duckdb.py

@@ -15,7 +15,7 @@ def get_duckdb_connection(data_contract, server):
     for model_name in data_contract.models:
         model_path = path
         if "{model}" in model_path:
-            model_path = model_path.format(model
+            model_path = model_path.format(model=model_name)
         logging.info(f"Creating table {model_name} for {model_path}")

         if server.format == "json":
@@ -39,9 +39,9 @@ def get_duckdb_connection(data_contract, server):


 def setup_s3_connection(con, server):
-    s3_region = os.getenv(
-    s3_access_key_id = os.getenv(
-    s3_secret_access_key = os.getenv(
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
     # con.install_extension("httpfs")
     # con.load_extension("httpfs")
     if server.endpointUrl is not None:

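setup_s3_connection reads its credentials from environment variables, as the hunk shows. A brief sketch of providing them before a test run against an S3 server; the values are placeholders:

    import os

    from datacontract.data_contract import DataContract

    os.environ["DATACONTRACT_S3_REGION"] = "eu-central-1"        # placeholder
    os.environ["DATACONTRACT_S3_ACCESS_KEY_ID"] = "AKIA..."      # placeholder
    os.environ["DATACONTRACT_S3_SECRET_ACCESS_KEY"] = "..."      # placeholder
    run = DataContract(data_contract_file="datacontract.yaml").test()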
datacontract/engines/soda/connections/kafka.py

@@ -4,7 +4,23 @@ import pyspark.sql.functions as fn
 from pyspark.sql import SparkSession
 from pyspark.sql.avro.functions import from_avro
 from pyspark.sql.functions import from_json, col
-from pyspark.sql.types import
+from pyspark.sql.types import (
+    StructType,
+    DataType,
+    NullType,
+    ArrayType,
+    BinaryType,
+    DateType,
+    TimestampNTZType,
+    TimestampType,
+    BooleanType,
+    LongType,
+    IntegerType,
+    DoubleType,
+    DecimalType,
+    StringType,
+    StructField,
+)

 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import \
@@ -15,14 +31,18 @@ from datacontract.model.exceptions import DataContractException
 def create_spark_session(tmp_dir) -> SparkSession:
     # TODO: Update dependency versions when updating pyspark
     # TODO: add protobuf library
-    spark =
-        .
-        .config("spark.
-        .config(
-
+    spark = (
+        SparkSession.builder.appName("datacontract")
+        .config("spark.sql.warehouse.dir", tmp_dir + "/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", True)
+        .config(
+            "spark.jars.packages",
+            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+        )
         .getOrCreate()
+    )
     spark.sparkContext.setLogLevel("WARN")
-    print(f
+    print(f"Using PySpark version {spark.version}")
     return spark


@@ -32,14 +52,14 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati
     auth_options = get_auth_options()

     # read full kafka topic
-    df =
-        .read
-        .
-        .
-        .option("
-        .option("
-        .option("startingOffsets", "earliest") \
+    df = (
+        spark.read.format("kafka")
+        .options(**auth_options)
+        .option("kafka.bootstrap.servers", host)
+        .option("subscribe", topic)
+        .option("startingOffsets", "earliest")
         .load()
+    )
     # TODO a warning if none or multiple models
     model_name, model = next(iter(data_contract.models.items()))
     if server.format == "avro":
@@ -73,8 +93,8 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati


 def get_auth_options():
-    kafka_sasl_username = os.getenv(
-    kafka_sasl_password = os.getenv(
+    kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
+    kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
     if kafka_sasl_username is None:
         auth_options = {}
     else:
@@ -130,4 +150,3 @@ def to_struct_field(field_name: str, field: Field) -> StructField:
         data_type = DataType()

     return StructField(field_name, data_type, nullable=not field.required)
-

datacontract/engines/soda/connections/postgres.py

@@ -10,12 +10,12 @@ def to_postgres_soda_configuration(server):
             "type": "postgres",
             "host": server.host,
             "port": str(server.port),
-            "username": os.getenv(
-            "password": os.getenv(
+            "username": os.getenv("DATACONTRACT_POSTGRES_USERNAME"),
+            "password": os.getenv("DATACONTRACT_POSTGRES_PASSWORD"),
             "database": server.database,
             "schema": server.schema_,
         }
     }

     soda_configuration_str = yaml.dump(soda_configuration)
-    return soda_configuration_str
+    return soda_configuration_str

datacontract/engines/soda/connections/snowflake.py

@@ -7,13 +7,13 @@ def to_snowflake_soda_configuration(server):
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv(
-            "password": os.getenv(
-            "role": os.getenv(
+            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
+            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
+            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-            "warehouse": os.getenv(
+            "warehouse": os.getenv("DATACONTRACT_SNOWFLAKE_WAREHOUSE"),
             "connection_timeout": 5, # minutes
         }
     }

datacontract/export/avro_converter.py

@@ -6,16 +6,14 @@ from datacontract.model.data_contract_specification import Field
 def to_avro_schema(model_name, model) -> dict:
     return to_avro_record(model_name, model.fields, model.description)

+
 def to_avro_schema_json(model_name, model) -> str:
     schema = to_avro_schema(model_name, model)
     return json.dumps(schema, indent=2, sort_keys=False)


 def to_avro_record(name, fields, description) -> dict:
-    schema = {
-        "type": "record",
-        "name": name
-    }
+    schema = {"type": "record", "name": name}
     if description is not None:
         schema["doc"] = description
     schema["fields"] = to_avro_fields(fields)
@@ -30,9 +28,7 @@ def to_avro_fields(fields):


 def to_avro_field(field, field_name):
-    avro_field = {
-        "name": field_name
-    }
+    avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
     avro_field["type"] = to_avro_type(field, field_name)

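to_avro_schema_json takes a model name and a model from the contract's models map. A minimal sketch with a hand-built model; constructing Model and Field directly like this is an assumption based on the specification module, real code usually takes them from a parsed DataContractSpecification:

    from datacontract.export.avro_converter import to_avro_schema_json
    from datacontract.model.data_contract_specification import Field, Model

    model = Model(
        description="Orders model",  # becomes the record's "doc"
        fields={"order_id": Field(type="string", required=True)},
    )
    print(to_avro_schema_json("orders", model))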
datacontract/export/avro_idl_converter.py

@@ -1,12 +1,14 @@
-
-from datacontract.lint.resolve import inline_definitions_into_data_contract
+import typing
 from dataclasses import dataclass
 from enum import Enum
-import typing
 from io import StringIO

+from datacontract.lint.resolve import inline_definitions_into_data_contract
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification, Field
 from datacontract.model.exceptions import DataContractException

+
 def to_avro_idl(contract: DataContractSpecification) -> str:
     """Serialize the provided data contract specification into an Avro IDL string.

@@ -18,6 +20,7 @@ def to_avro_idl(contract: DataContractSpecification) -> str:
     to_avro_idl_stream(contract, stream)
     return stream.getvalue()

+
 def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
     """Serialize the provided data contract specification into Avro IDL."""
     ir = _contract_to_avro_idl_ir(contract)
@@ -28,6 +31,7 @@ def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextI
         _write_model_type(model_type, stream)
         stream.write("}\n")

+
 class AvroPrimitiveType(Enum):
     int = "int"
     long = "long"
@@ -38,49 +42,71 @@ class AvroPrimitiveType(Enum):
     null = "null"
     bytes = "bytes"

+
 class AvroLogicalType(Enum):
     decimal = "decimal"
     date = "date"
     time_ms = "time_ms"
     timestamp_ms = "timestamp_ms"

+
 @dataclass
 class AvroField:
     name: str
     required: bool
     description: typing.Optional[str]

+
 @dataclass
 class AvroPrimitiveField(AvroField):
     type: typing.Union[AvroPrimitiveType, AvroLogicalType]

+
 @dataclass
 class AvroComplexField(AvroField):
     subfields: list[AvroField]

+
 @dataclass
 class AvroArrayField(AvroField):
     type: AvroField

+
 @dataclass
 class AvroModelType:
     name: str
     description: typing.Optional[str]
     fields: list[AvroField]

+
 @dataclass
 class AvroIDLProtocol:
     name: typing.Optional[str]
     description: typing.Optional[str]
     model_types: list[AvroModelType]

-
-
-
-
-
-
-
+
+avro_primitive_types = set(
+    [
+        "string",
+        "text",
+        "varchar",
+        "float",
+        "double",
+        "int",
+        "integer",
+        "long",
+        "bigint",
+        "boolean",
+        "timestamp_ntz",
+        "timestamp",
+        "timestamp_tz",
+        "date",
+        "bytes",
+        "null",
+    ]
+)
+

 def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
     result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
@@ -114,10 +140,11 @@ def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimit
                 model=field,
                 reason="Unknown field type {field.type}",
                 result="failed",
-                message="Avro IDL type conversion failed."
+                message="Avro IDL type conversion failed.",
             )
     return result

+
 def _to_avro_idl_type(field_name: str, field: Field) -> AvroField:
     if field.type in avro_primitive_types:
         return _to_avro_primitive_logical_type(field_name, field)
@@ -125,17 +152,14 @@ def _to_avro_idl_type(field_name: str, field: Field) -> AvroField:
     match field.type:
         case "array":
             return AvroArrayField(
-                field_name,
-                field.required,
-                field.description,
-                _to_avro_idl_type(field_name, field.items)
+                field_name, field.required, field.description, _to_avro_idl_type(field_name, field.items)
             )
         case "object" | "record" | "struct":
             return AvroComplexField(
                 field_name,
                 field.required,
                 field.description,
-                [_to_avro_idl_type(field_name, field) for (field_name, field) in field.fields.items()]
+                [_to_avro_idl_type(field_name, field) for (field_name, field) in field.fields.items()],
             )
         case _:
             raise DataContractException(
@@ -144,56 +168,55 @@ def _to_avro_idl_type(field_name: str, field: Field) -> AvroField:
                 model=type,
                 reason="Unknown Data Contract field type",
                 result="failed",
-                message="Avro IDL type conversion failed."
+                message="Avro IDL type conversion failed.",
             )


 def _generate_field_types(contract: DataContractSpecification) -> list[AvroField]:
     result = []
-    for
-        for
+    for _, model in contract.models.items():
+        for field_name, field in model.fields.items():
             result.append(_to_avro_idl_type(field_name, field))
     return result

+
 def generate_model_types(contract: DataContractSpecification) -> list[AvroModelType]:
     result = []
-    for
-        result.append(
-            name=model_name,
-
-            fields=_generate_field_types(contract)
-        ))
+    for model_name, model in contract.models.items():
+        result.append(
+            AvroModelType(name=model_name, description=model.description, fields=_generate_field_types(contract))
+        )
     return result

+
 def _model_name_to_identifier(model_name: str):
-    return "".join([word.title() for word in
+    return "".join([word.title() for word in model_name.split()])

-def _contract_to_avro_idl_ir(contract: DataContractSpecification) -> AvroIDLProtocol:

+def _contract_to_avro_idl_ir(contract: DataContractSpecification) -> AvroIDLProtocol:
     """Convert models into an intermediate representation for later serialization into Avro IDL.

-
-
+    Each model is converted to a record containing a field for each model field.
+    """
     inlined_contract = contract.model_copy()
     inline_definitions_into_data_contract(inlined_contract)
-    protocol_name =
-
-
-
-
-
-    return AvroIDLProtocol(name=protocol_name,
-                           description=description,
-                           model_types=generate_model_types(inlined_contract))
+    protocol_name = _model_name_to_identifier(contract.info.title) if contract.info and contract.info.title else None
+    description = contract.info.description if contract.info and contract.info.description else None
+    return AvroIDLProtocol(
+        name=protocol_name, description=description, model_types=generate_model_types(inlined_contract)
+    )
+

 def _write_indent(indent: int, stream: typing.TextIO):
     stream.write(" " * indent)

+
 def _write_field_description(field: AvroField, indent: int, stream: typing.TextIO):
     if field.description:
         _write_indent(indent, stream)
         stream.write(f"/** {field.description} */\n")

+
 def _write_field_type_definition(field: AvroField, indent: int, stream: typing.TextIO) -> str:
     # Write any extra information (such as record type definition) and return
     # the name of the generated type. Writes descriptions only for record
@@ -215,7 +238,7 @@ def _write_field_type_definition(field: AvroField, indent: int, stream: typing.T
         for subfield in subfields:
             subfield_types.append(_write_field_type_definition(subfield, indent + 1, stream))
         # Reference all defined record types.
-        for
+        for field, subfield_type in zip(field.subfields, subfield_types):
             _write_field_description(field, indent + 1, stream)
             _write_indent(indent + 1, stream)
             stream.write(f"{subfield_type} {field.name};\n")
@@ -234,15 +257,15 @@ def _write_field_type_definition(field: AvroField, indent: int, stream: typing.T
         case _:
             raise RuntimeError("Unknown Avro field type {field}")

-
-
-    stream: typing.TextIO):
+
+def _write_field(field: AvroField, indent, stream: typing.TextIO):
     # Start of recursion.
     typename = _write_field_type_definition(field, indent, stream)
     _write_field_description(field, indent, stream)
     _write_indent(indent, stream)
     stream.write(f"{typename} {field.name};\n")

+
 def _write_model_type(model: AvroModelType, stream: typing.TextIO):
     # Called once for each model
     if model.description: