datacontract-cli 0.9.7__py3-none-any.whl → 0.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/breaking/breaking.py +48 -57
- datacontract/cli.py +100 -80
- datacontract/data_contract.py +178 -128
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +5 -1
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +9 -8
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +26 -22
- datacontract/engines/fastjsonschema/check_jsonschema.py +31 -25
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +8 -6
- datacontract/engines/soda/check_soda_execute.py +58 -36
- datacontract/engines/soda/connections/bigquery.py +5 -3
- datacontract/engines/soda/connections/dask.py +0 -1
- datacontract/engines/soda/connections/databricks.py +2 -2
- datacontract/engines/soda/connections/duckdb.py +25 -8
- datacontract/engines/soda/connections/kafka.py +36 -17
- datacontract/engines/soda/connections/postgres.py +3 -3
- datacontract/engines/soda/connections/snowflake.py +4 -4
- datacontract/export/avro_converter.py +9 -11
- datacontract/export/avro_idl_converter.py +65 -42
- datacontract/export/csv_type_converter.py +36 -0
- datacontract/export/dbt_converter.py +43 -32
- datacontract/export/great_expectations_converter.py +141 -0
- datacontract/export/html_export.py +46 -0
- datacontract/export/jsonschema_converter.py +3 -1
- datacontract/export/odcs_converter.py +5 -7
- datacontract/export/protobuf_converter.py +12 -10
- datacontract/export/pydantic_converter.py +131 -0
- datacontract/export/rdf_converter.py +34 -11
- datacontract/export/sodacl_converter.py +118 -21
- datacontract/export/sql_converter.py +30 -8
- datacontract/export/sql_type_converter.py +44 -4
- datacontract/export/terraform_converter.py +4 -3
- datacontract/imports/avro_importer.py +65 -18
- datacontract/imports/sql_importer.py +0 -2
- datacontract/init/download_datacontract_file.py +2 -2
- datacontract/integration/publish_datamesh_manager.py +6 -12
- datacontract/integration/publish_opentelemetry.py +30 -16
- datacontract/lint/files.py +2 -2
- datacontract/lint/lint.py +26 -31
- datacontract/lint/linters/description_linter.py +12 -21
- datacontract/lint/linters/example_model_linter.py +28 -29
- datacontract/lint/linters/field_pattern_linter.py +8 -8
- datacontract/lint/linters/field_reference_linter.py +11 -10
- datacontract/lint/linters/notice_period_linter.py +18 -22
- datacontract/lint/linters/quality_schema_linter.py +16 -20
- datacontract/lint/linters/valid_constraints_linter.py +42 -37
- datacontract/lint/resolve.py +50 -14
- datacontract/lint/schema.py +2 -3
- datacontract/lint/urls.py +4 -5
- datacontract/model/breaking_change.py +2 -1
- datacontract/model/data_contract_specification.py +8 -7
- datacontract/model/exceptions.py +13 -2
- datacontract/model/run.py +3 -2
- datacontract/web.py +3 -7
- datacontract_cli-0.9.9.dist-info/METADATA +951 -0
- datacontract_cli-0.9.9.dist-info/RECORD +64 -0
- datacontract/lint/linters/primary_field_linter.py +0 -30
- datacontract_cli-0.9.7.dist-info/METADATA +0 -603
- datacontract_cli-0.9.7.dist-info/RECORD +0 -61
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/WHEEL +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.9.7.dist-info → datacontract_cli-0.9.9.dist-info}/top_level.txt +0 -0
datacontract/engines/fastjsonschema/check_jsonschema.py

@@ -25,7 +25,7 @@ def validate_json_stream(model_name, validate, json_stream):
                 model=model_name,
                 reason=e.message,
                 engine="jsonschema",
-                original_exception=e
+                original_exception=e,
             )


@@ -79,16 +79,16 @@ def process_local_file(run, server, model_name, validate):
         return process_directory(run, path, server, model_name, validate)
     else:
         logging.info(f"Processing file {path}")
-        with open(path,
+        with open(path, "r") as file:
             process_json_file(run, model_name, validate, file, server.delimiter)


 def process_directory(run, path, server, model_name, validate):
     success = True
     for filename in os.listdir(path):
-        if filename.endswith(
+        if filename.endswith(".json"):  # or make this a parameter
             file_path = os.path.join(path, filename)
-            with open(file_path,
+            with open(file_path, "r") as file:
                 if not process_json_file(run, model_name, validate, file, server.delimiter):
                     success = False
                     break
@@ -127,13 +127,15 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:

     # Early exit conditions
     if server.format != "json":
-        run.checks.append(
-
-
-
-
-
-
+        run.checks.append(
+            Check(
+                type="schema",
+                name="Check that JSON has valid schema",
+                result="warning",
+                reason="Server format is not 'json'. Skip validating jsonschema.",
+                engine="jsonschema",
+            )
+        )
         run.log_warn("jsonschema: Server format is not 'json'. Skip jsonschema checks.")
         return

@@ -155,21 +157,25 @@ def check_jsonschema(run: Run, data_contract: DataContractSpecification, server:
     elif server.type == "s3":
         process_s3_file(server, model_name, validate)
     else:
-        run.checks.append(
+        run.checks.append(
+            Check(
+                type="schema",
+                name="Check that JSON has valid schema",
+                model=model_name,
+                result="warn",
+                reason=f"Server type {server.type} not supported",
+                engine="jsonschema",
+            )
+        )
+        return
+
+    run.checks.append(
+        Check(
             type="schema",
             name="Check that JSON has valid schema",
             model=model_name,
-            result="
-            reason=
+            result="passed",
+            reason="All JSON entries are valid.",
             engine="jsonschema",
-        )
-
-
-    run.checks.append(Check(
-        type="schema",
-        name="Check that JSON has valid schema",
-        model=model_name,
-        result="passed",
-        reason="All JSON entries are valid.",
-        engine="jsonschema",
-    ))
+        )
+    )
datacontract/engines/fastjsonschema/s3/s3_read_files.py

@@ -14,9 +14,11 @@ def yield_s3_files(s3_endpoint_url, s3_location):


 def s3_fs(s3_endpoint_url):
-    aws_access_key_id = os.getenv(
-    aws_secret_access_key = os.getenv(
-    return s3fs.S3FileSystem(
-
-
-
+    aws_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    aws_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
+    return s3fs.S3FileSystem(
+        key=aws_access_key_id,
+        secret=aws_secret_access_key,
+        anon=aws_access_key_id is None,
+        client_kwargs={"endpoint_url": s3_endpoint_url},
+    )
datacontract/engines/soda/check_soda_execute.py

@@ -1,5 +1,6 @@
 import logging

+from pyspark.sql import SparkSession
 from soda.scan import Scan

 from datacontract.engines.soda.connections.bigquery import \
@@ -16,11 +17,12 @@ from datacontract.engines.soda.connections.snowflake import \
 from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.model.data_contract_specification import \
     DataContractSpecification, Server
-from datacontract.model.run import
-    Run, Check, Log
+from datacontract.model.run import Run, Check, Log


-def check_soda_execute(
+def check_soda_execute(
+    run: Run, data_contract: DataContractSpecification, server: Server, spark: SparkSession, tmp_dir
+):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
         return
@@ -34,13 +36,15 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
-            run.checks.append(
-
-
-
-
-
-
+            run.checks.append(
+                Check(
+                    type="general",
+                    name="Check that format is supported",
+                    result="warning",
+                    reason=f"Format {server.format} not yet supported by datacontract CLI",
+                    engine="datacontract",
+                )
+            )
             run.log_warn(f"Format {server.format} not yet supported by datacontract CLI")
             return
     elif server.type == "snowflake":
@@ -73,21 +77,25 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
         scan.set_data_source_name(server.type)

     else:
-        run.checks.append(
-
-
-
-
-
-
+        run.checks.append(
+            Check(
+                type="general",
+                name="Check that server type is supported",
+                result="warning",
+                reason=f"Server type {server.type} not yet supported by datacontract CLI",
+                engine="datacontract-cli",
+            )
+        )
         run.log_warn(f"Server type {server.type} not yet supported by datacontract CLI")
         return

     # Don't check types for json format, as they are checked with json schema
     # Don't check types for avro format, as they are checked with avro schema
     # Don't check types for csv format, as they are hard to detect
+    server_type = server.type
     check_types = server.format != "json" and server.format != "csv" and server.format != "avro"
-
+
+    sodacl_yaml_str = to_sodacl_yaml(data_contract, server_type, check_types)
     # print("sodacl_yaml_str:\n" + sodacl_yaml_str)
     scan.add_sodacl_yaml_str(sodacl_yaml_str)

@@ -102,46 +110,60 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
     for c in scan_results.get("checks"):
         check = Check(
             type="schema",
-            result=
-
-            reason=', '.join(c.get("outcomeReasons")),
+            result=to_result(c),
+            reason=", ".join(c.get("outcomeReasons")),
             name=c.get("name"),
             model=c.get("table"),
             field=c.get("column"),
             engine="soda-core",
+            diagnostics=c.get("diagnostics"),
         )
         update_reason(check, c)
         run.checks.append(check)

     for log in scan_results.get("logs"):
-        run.logs.append(
-
-
-
-
+        run.logs.append(
+            Log(
+                timestamp=log.get("timestamp"),
+                level=log.get("level"),
+                message=log.get("message"),
+            )
+        )

     if scan.has_error_logs():
         run.log_warn("Engine soda-core has errors. See the logs for details.")
-        run.checks.append(
-
-
-
-
-
-
+        run.checks.append(
+            Check(
+                type="general",
+                name="Execute quality checks",
+                result="warning",
+                reason="Engine soda-core has errors. See the logs for details.",
+                engine="soda-core",
+            )
+        )
         return


+def to_result(c) -> str:
+    soda_outcome = c.get("outcome")
+    if soda_outcome == "pass":
+        return "passed"
+    elif soda_outcome == "fail":
+        return "failed"
+    else:
+        return soda_outcome
+
+
 def update_reason(check, c):
     """Try to find a reason in diagnostics"""
     if check.result == "passed":
         return
     if check.reason is not None and check.reason != "":
         return
-    for block in c[
-        if block[
+    for block in c["diagnostics"]["blocks"]:
+        if block["title"] == "Diagnostics":
             # Extract and print the 'text' value
-            diagnostics_text = block[
+            diagnostics_text = block["text"]
             print(diagnostics_text)
             diagnostics_text_split = diagnostics_text.split(":icon-fail: ")
             if len(diagnostics_text_split) > 1:
datacontract/engines/soda/connections/bigquery.py

@@ -1,18 +1,20 @@
 import os
+
 import yaml

+
 # https://docs.soda.io/soda/connect-bigquery.html#authentication-methods
 def to_bigquery_soda_configuration(server):
     # with service account key, using an external json file
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "bigquery",
-            "account_info_json_path": os.getenv(
+            "account_info_json_path": os.getenv("DATACONTRACT_BIGQUERY_ACCOUNT_INFO_JSON_PATH"),
             "auth_scopes": ["https://www.googleapis.com/auth/bigquery"],
             "project_id": server.project,
-            "dataset": server.dataset
+            "dataset": server.dataset,
         }
     }

     soda_configuration_str = yaml.dump(soda_configuration)
-    return soda_configuration_str
+    return soda_configuration_str
datacontract/engines/soda/connections/databricks.py

@@ -11,8 +11,8 @@ def to_databricks_soda_configuration(server):
             "host": server.host,
             "catalog": server.catalog,
             "schema": server.schema_,
-            "http_path": os.getenv(
-            "token": os.getenv(
+            "http_path": os.getenv("DATACONTRACT_DATABRICKS_HTTP_PATH"),
+            "token": os.getenv("DATACONTRACT_DATABRICKS_TOKEN"),
         }
     }

datacontract/engines/soda/connections/duckdb.py

@@ -2,6 +2,7 @@ import logging
 import os

 import duckdb
+from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type


 def get_duckdb_connection(data_contract, server):
@@ -12,10 +13,10 @@ def get_duckdb_connection(data_contract, server):
     if server.type == "s3":
         path = server.location
         setup_s3_connection(con, server)
-    for model_name in data_contract.models:
+    for model_name, model in data_contract.models.items():
         model_path = path
         if "{model}" in model_path:
-            model_path = model_path.format(model
+            model_path = model_path.format(model=model_name)
         logging.info(f"Creating table {model_name} for {model_path}")

         if server.format == "json":
@@ -32,16 +33,32 @@ def get_duckdb_connection(data_contract, server):
                 CREATE VIEW "{model_name}" AS SELECT * FROM read_parquet('{model_path}', hive_partitioning=1);
                 """)
         elif server.format == "csv":
-
-
-
+            columns = to_csv_types(model)
+            if columns is None:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
+                )
+            else:
+                con.sql(
+                    f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
+                )
     return con


+def to_csv_types(model) -> dict:
+    if model is None:
+        return None
+    columns = {}
+    # ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+    for field_name, field in model.fields.items():
+        columns[field_name] = convert_to_duckdb_csv_type(field)
+    return columns
+
+
 def setup_s3_connection(con, server):
-    s3_region = os.getenv(
-    s3_access_key_id = os.getenv(
-    s3_secret_access_key = os.getenv(
+    s3_region = os.getenv("DATACONTRACT_S3_REGION")
+    s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
+    s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
     # con.install_extension("httpfs")
     # con.load_extension("httpfs")
     if server.endpointUrl is not None:
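The new to_csv_types helper above delegates the per-field mapping to convert_to_duckdb_csv_type from the newly added datacontract/export/csv_type_converter.py, whose body is not included in this diff. The following is only a rough sketch of what such a mapping could look like, assuming plain data contract type strings and the DuckDB CSV reader types listed in the comment above (SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR); it is illustrative, not the packaged implementation.

# Illustrative sketch only. The real csv_type_converter.py (+36 lines) is not shown in this diff.
# Maps a data contract field to one of DuckDB's CSV reader types.
_TYPE_MAP = {
    "string": "VARCHAR",
    "varchar": "VARCHAR",
    "text": "VARCHAR",
    "boolean": "BOOLEAN",
    "int": "BIGINT",
    "integer": "BIGINT",
    "long": "BIGINT",
    "bigint": "BIGINT",
    "float": "DOUBLE",
    "double": "DOUBLE",
    "date": "DATE",
    "time": "TIME",
    "timestamp": "TIMESTAMP",
    "null": "SQLNULL",
}


def convert_to_duckdb_csv_type(field) -> str:
    # Unknown or missing types fall back to VARCHAR so the CSV can still be read.
    if field is None or field.type is None:
        return "VARCHAR"
    return _TYPE_MAP.get(field.type.lower(), "VARCHAR")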
datacontract/engines/soda/connections/kafka.py

@@ -4,7 +4,23 @@ import pyspark.sql.functions as fn
 from pyspark.sql import SparkSession
 from pyspark.sql.avro.functions import from_avro
 from pyspark.sql.functions import from_json, col
-from pyspark.sql.types import
+from pyspark.sql.types import (
+    StructType,
+    DataType,
+    NullType,
+    ArrayType,
+    BinaryType,
+    DateType,
+    TimestampNTZType,
+    TimestampType,
+    BooleanType,
+    LongType,
+    IntegerType,
+    DoubleType,
+    DecimalType,
+    StringType,
+    StructField,
+)

 from datacontract.export.avro_converter import to_avro_schema_json
 from datacontract.model.data_contract_specification import \
@@ -15,14 +31,18 @@ from datacontract.model.exceptions import DataContractException
 def create_spark_session(tmp_dir) -> SparkSession:
     # TODO: Update dependency versions when updating pyspark
     # TODO: add protobuf library
-    spark =
-        .
-        .config("spark.
-        .config(
-
+    spark = (
+        SparkSession.builder.appName("datacontract")
+        .config("spark.sql.warehouse.dir", tmp_dir + "/spark-warehouse")
+        .config("spark.streaming.stopGracefullyOnShutdown", True)
+        .config(
+            "spark.jars.packages",
+            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0",
+        )
         .getOrCreate()
+    )
     spark.sparkContext.setLogLevel("WARN")
-    print(f
+    print(f"Using PySpark version {spark.version}")
     return spark


@@ -32,14 +52,14 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati
     auth_options = get_auth_options()

     # read full kafka topic
-    df =
-        .read
-        .
-        .
-        .option("
-        .option("
-        .option("startingOffsets", "earliest") \
+    df = (
+        spark.read.format("kafka")
+        .options(**auth_options)
+        .option("kafka.bootstrap.servers", host)
+        .option("subscribe", topic)
+        .option("startingOffsets", "earliest")
         .load()
+    )
     # TODO a warning if none or multiple models
     model_name, model = next(iter(data_contract.models.items()))
     if server.format == "avro":
@@ -73,8 +93,8 @@ def read_kafka_topic(spark: SparkSession, data_contract: DataContractSpecificati


 def get_auth_options():
-    kafka_sasl_username = os.getenv(
-    kafka_sasl_password = os.getenv(
+    kafka_sasl_username = os.getenv("DATACONTRACT_KAFKA_SASL_USERNAME")
+    kafka_sasl_password = os.getenv("DATACONTRACT_KAFKA_SASL_PASSWORD")
     if kafka_sasl_username is None:
         auth_options = {}
     else:
@@ -130,4 +150,3 @@ def to_struct_field(field_name: str, field: Field) -> StructField:
         data_type = DataType()

     return StructField(field_name, data_type, nullable=not field.required)
-
datacontract/engines/soda/connections/postgres.py

@@ -10,12 +10,12 @@ def to_postgres_soda_configuration(server):
             "type": "postgres",
             "host": server.host,
             "port": str(server.port),
-            "username": os.getenv(
-            "password": os.getenv(
+            "username": os.getenv("DATACONTRACT_POSTGRES_USERNAME"),
+            "password": os.getenv("DATACONTRACT_POSTGRES_PASSWORD"),
             "database": server.database,
             "schema": server.schema_,
         }
     }

     soda_configuration_str = yaml.dump(soda_configuration)
-    return soda_configuration_str
+    return soda_configuration_str
datacontract/engines/soda/connections/snowflake.py

@@ -7,13 +7,13 @@ def to_snowflake_soda_configuration(server):
     soda_configuration = {
         f"data_source {server.type}": {
             "type": "snowflake",
-            "username": os.getenv(
-            "password": os.getenv(
-            "role": os.getenv(
+            "username": os.getenv("DATACONTRACT_SNOWFLAKE_USERNAME"),
+            "password": os.getenv("DATACONTRACT_SNOWFLAKE_PASSWORD"),
+            "role": os.getenv("DATACONTRACT_SNOWFLAKE_ROLE"),
             "account": server.account,
             "database": server.database,
             "schema": server.schema_,
-            "warehouse": os.getenv(
+            "warehouse": os.getenv("DATACONTRACT_SNOWFLAKE_WAREHOUSE"),
             "connection_timeout": 5,  # minutes
         }
     }
datacontract/export/avro_converter.py

@@ -4,20 +4,20 @@ from datacontract.model.data_contract_specification import Field


 def to_avro_schema(model_name, model) -> dict:
-    return to_avro_record(model_name, model.fields, model.description)
+    return to_avro_record(model_name, model.fields, model.description, model.namespace)
+

 def to_avro_schema_json(model_name, model) -> str:
     schema = to_avro_schema(model_name, model)
     return json.dumps(schema, indent=2, sort_keys=False)


-def to_avro_record(name, fields, description) -> dict:
-    schema = {
-        "type": "record",
-        "name": name
-    }
+def to_avro_record(name, fields, description, namespace) -> dict:
+    schema = {"type": "record", "name": name}
     if description is not None:
         schema["doc"] = description
+    if namespace is not None:
+        schema["namespace"] = namespace
     schema["fields"] = to_avro_fields(fields)
     return schema

@@ -30,16 +30,14 @@ def to_avro_fields(fields):


 def to_avro_field(field, field_name):
-    avro_field = {
-        "name": field_name
-    }
+    avro_field = {"name": field_name}
     if field.description is not None:
         avro_field["doc"] = field.description
     avro_field["type"] = to_avro_type(field, field_name)
     return avro_field


-def to_avro_type(field: Field, field_name: str):
+def to_avro_type(field: Field, field_name: str) -> str | dict:
     if field.type is None:
         return "null"
     if field.type in ["string", "varchar", "text"]:
@@ -64,7 +62,7 @@ def to_avro_type(field: Field, field_name: str):
     elif field.type in ["time"]:
         return "long"
     elif field.type in ["object", "record", "struct"]:
-        return to_avro_record(field_name, field.fields, field.description)
+        return to_avro_record(field_name, field.fields, field.description, None)
     elif field.type in ["binary"]:
         return "bytes"
     elif field.type in ["array"]: