datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/export/pydantic_converter.py

@@ -2,6 +2,12 @@ import ast
 import typing
 
 import datacontract.model.data_contract_specification as spec
+from datacontract.export.exporter import Exporter
+
+
+class PydanticExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        return to_pydantic_model_str(data_contract)
 
 
 def to_pydantic_model_str(contract: spec.DataContractSpecification) -> str:
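The recurring pattern in this release is visible here: every output format now subclasses `Exporter` and implements a single `export(data_contract, model, server, sql_server_type, export_args)` method. A minimal, hypothetical exporter following that contract (the class and its output are illustrative, not part of the package):

```python
from datacontract.export.exporter import Exporter


class ModelNamesExporter(Exporter):
    """Hypothetical exporter: emits one line per model defined in the contract."""

    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
        # data_contract is a DataContractSpecification; its models attribute maps
        # model names to Model objects (see the spark/sql converters below).
        return "\n".join(sorted(data_contract.models.keys()))
```

The new `exporter_factory.py` presumably registers these exporter classes by format name; the registration call itself is not shown in this diff.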
datacontract/export/rdf_converter.py

@@ -1,8 +1,15 @@
 from pydantic import BaseModel
-from rdflib import
+from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef
 
-from datacontract.
-
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification
+
+
+class RdfExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
+        self.dict_args = export_args
+        rdf_base = self.dict_args.get("rdf_base")
+        return to_rdf_n3(data_contract_spec=data_contract, base=rdf_base)
 
 
 def is_literal(property_name):

@@ -50,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))
 
-    dc = Namespace("https://datacontract.com/DataContractSpecification/
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.2.1/Extension/")
 
     g.bind("dc", dc)
    g.bind("dcx", dcx)

@@ -141,7 +148,7 @@ def add_info(contract, info, graph, dc, dcx):
     graph.add((bnode_info, dc.version, Literal(info.version)))
 
     # add owner
-    owner =
+    owner = Literal(info.owner)
     graph.add((bnode_info, dc.owner, owner))
 
     # add contact
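`RdfExporter` is the only exporter here that reads an option from `export_args` (`rdf_base`), which it forwards as the base IRI. A small sketch of driving the converter directly; constructing the pydantic specification model straight from YAML is an assumption, the CLI normally resolves and validates the contract first:

```python
import yaml

from datacontract.export.rdf_converter import to_rdf
from datacontract.model.data_contract_specification import DataContractSpecification

# Assumption: datacontract.yaml parses cleanly into the pydantic specification model.
with open("datacontract.yaml") as f:
    spec = DataContractSpecification(**yaml.safe_load(f))

graph = to_rdf(spec, base="https://example.com/contracts/")  # returns an rdflib Graph
print(graph.serialize(format="n3"))  # to_rdf_n3 presumably wraps this serialization
```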
datacontract/export/sodacl_converter.py

@@ -1,190 +1,38 @@
 import yaml
 
-from datacontract.
-from datacontract.
-
-
-
-
-
-) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            checks.append(check_field_min_length(field_name, field.minLength))
-        if field.maxLength is not None:
-            checks.append(check_field_max_length(field_name, field.maxLength))
-        if field.minimum is not None:
-            checks.append(check_field_minimum(field_name, field.minimum))
-        if field.maximum is not None:
-            checks.append(check_field_maximum(field_name, field.maximum))
-        if field.exclusiveMinimum is not None:
-            checks.append(check_field_minimum(field_name, field.exclusiveMinimum))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMinimum))
-        if field.exclusiveMaximum is not None:
-            checks.append(check_field_maximum(field_name, field.exclusiveMaximum))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMaximum))
-        if field.pattern is not None:
-            checks.append(check_field_regex(field_name, field.pattern))
-        if field.enum is not None and len(field.enum) > 0:
-            checks.append(check_field_enum(field_name, field.enum))
-        # TODO references: str = None
-        # TODO format
-
-    return f"checks for {model_key}", checks
-
-
-def check_field_is_present(field_name):
-    return {
-        "schema": {
-            "name": f"Check that field {field_name} is present",
-            "fail": {
-                "when required column missing": [field_name],
-            },
-        }
-    }
-
-
-def check_field_type(field_name: str, type: str):
-    return {
-        "schema": {
-            "name": f"Check that field {field_name} has type {type}",
-            "fail": {"when wrong column type": {field_name: type}},
-        }
-    }
-
-
-def check_field_required(field_name: str, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-
-    return {f"missing_count({field_name}) = 0": {"name": f"Check that required field {field_name} has no null values"}}
-
-
-def check_field_unique(field_name, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"duplicate_count({field_name}) = 0": {"name": f"Check that unique field {field_name} has no duplicate values"}
-    }
-
-
-def check_field_min_length(field_name, min_length, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a min length of {min}",
-            "valid min length": min_length,
-        }
-    }
-
-
-def check_field_max_length(field_name, max_length, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a max length of {max_length}",
-            "valid max length": max_length,
-        }
-    }
-
-
-def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a minimum of {min}",
-            "valid min": minimum,
-        }
-    }
-
-
-def check_field_maximum(field_name, maximum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a maximum of {maximum}",
-            "valid max": maximum,
-        }
-    }
-
-
-def check_field_not_equal(field_name, value, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} is not equal to {value}",
-            "invalid values": [value],
-        }
-    }
-
-
-def check_field_enum(field_name, enum, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} only contains enum values {enum}",
-            "valid values": enum,
-        }
-    }
-
-
-def check_field_regex(field_name, pattern, quote_field_name: bool = False):
-    if quote_field_name:
-        field_name = f'"{field_name}"'
-    return {
-        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} matches regex pattern {pattern}",
-            "valid regex": pattern,
-        }
-    }
-
-
-def add_quality_checks(sodacl, data_contract_spec):
-    if data_contract_spec.quality is None:
-        return
-    if data_contract_spec.quality.type is None:
-        return
-    if data_contract_spec.quality.type.lower() != "sodacl":
-        return
-    if isinstance(data_contract_spec.quality.specification, str):
-        quality_specification = yaml.safe_load(data_contract_spec.quality.specification)
-    else:
-        quality_specification = data_contract_spec.quality.specification
-    for key, checks in quality_specification.items():
-        if key in sodacl:
-            for check in checks:
-                sodacl[key].append(check)
-        else:
-            sodacl[key] = checks
+from datacontract.engines.data_contract_checks import create_checks
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
+from datacontract.model.run import Run
+
+
+class SodaExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
+        run = Run.create_run()
+        server = get_server(data_contract, server)
+        run.checks.extend(create_checks(data_contract, server))
+        return to_sodacl_yaml(run)
+
+
+def to_sodacl_yaml(run: Run) -> str:
+    sodacl_dict = {}
+    for run_check in run.checks:
+        if run_check.engine != "soda" or run_check.language != "sodacl":
+            continue
+        check_yaml_str = run_check.implementation
+        check_yaml_dict = yaml.safe_load(check_yaml_str)
+        for key, value in check_yaml_dict.items():
+            if key in sodacl_dict:
+                if isinstance(sodacl_dict[key], list) and isinstance(value, list):
+                    sodacl_dict[key].extend(value)
+                else:
+                    sodacl_dict[key].update(value)
+            else:
+                sodacl_dict[key] = value
+    return yaml.dump(sodacl_dict)
+
+
+def get_server(data_contract_specification: DataContractSpecification, server_name: str = None) -> Server | None:
+    if server_name is None:
+        return None
+    return data_contract_specification.servers.get(server_name)
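The rewritten SodaCL export no longer builds checks itself; it asks `create_checks` for the generated checks and then merges each check's SodaCL snippet by top-level key: lists under the same `checks for <table>` key are concatenated, nested mappings are updated. The merge rule in isolation, using plain YAML snippets instead of `Run`/`Check` objects:

```python
import yaml

snippets = [
    "checks for orders:\n- row_count > 0",
    "checks for orders:\n- missing_count(order_id) = 0",
]

sodacl: dict = {}
for snippet in snippets:
    for key, value in yaml.safe_load(snippet).items():
        if key not in sodacl:
            sodacl[key] = value
        elif isinstance(sodacl[key], list) and isinstance(value, list):
            sodacl[key].extend(value)  # same table key: concatenate the check lists
        else:
            sodacl[key].update(value)  # otherwise merge mappings

print(yaml.dump(sodacl))
# checks for orders:
# - row_count > 0
# - missing_count(order_id) = 0
```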
datacontract/export/spark_converter.py

@@ -0,0 +1,245 @@
+import json
+
+from pyspark.sql import types
+
+from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Field,
+    Model,
+)
+
+
+class SparkExporter(Exporter):
+    """
+    Exporter class for exporting data contracts to Spark schemas.
+    """
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model,
+        server,
+        sql_server_type,
+        export_args,
+    ) -> dict[str, types.StructType]:
+        """
+        Export the given data contract to Spark schemas.
+
+        Args:
+            data_contract (DataContractSpecification): The data contract specification.
+            model: Not used in this implementation.
+            server: Not used in this implementation.
+            sql_server_type: Not used in this implementation.
+            export_args: Additional arguments for export.
+
+        Returns:
+            dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas.
+        """
+        return to_spark(data_contract)
+
+
+def to_spark(contract: DataContractSpecification) -> str:
+    """
+    Converts a DataContractSpecification into a Spark schema string.
+
+    Args:
+        contract (DataContractSpecification): The data contract specification containing models.
+
+    Returns:
+        str: A string representation of the Spark schema for each model in the contract.
+    """
+    return "\n\n".join(
+        f"{model_name} = {print_schema(to_spark_schema(model))}" for model_name, model in contract.models.items()
+    )
+
+
+def to_spark_dict(contract: DataContractSpecification) -> dict[str, types.StructType]:
+    """
+    Convert a data contract specification to Spark schemas.
+
+    Args:
+        contract (DataContractSpecification): The data contract specification.
+
+    Returns:
+        dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas.
+    """
+    return {model_name: to_spark_schema(model) for model_name, model in contract.models.items()}
+
+
+def to_spark_schema(model: Model) -> types.StructType:
+    """
+    Convert a model to a Spark schema.
+
+    Args:
+        model (Model): The model to convert.
+
+    Returns:
+        types.StructType: The corresponding Spark schema.
+    """
+    return to_struct_type(model.fields)
+
+
+def to_struct_type(fields: dict[str, Field]) -> types.StructType:
+    """
+    Convert a dictionary of fields to a Spark StructType.
+
+    Args:
+        fields (dict[str, Field]): The fields to convert.
+
+    Returns:
+        types.StructType: The corresponding Spark StructType.
+    """
+    struct_fields = [to_struct_field(field, field_name) for field_name, field in fields.items()]
+    return types.StructType(struct_fields)
+
+
+def to_struct_field(field: Field, field_name: str) -> types.StructField:
+    """
+    Convert a field to a Spark StructField.
+
+    Args:
+        field (Field): The field to convert.
+        field_name (str): The name of the field.
+
+    Returns:
+        types.StructField: The corresponding Spark StructField.
+    """
+    data_type = to_spark_data_type(field)
+    metadata = to_spark_metadata(field)
+    return types.StructField(name=field_name, dataType=data_type, nullable=not field.required, metadata=metadata)
+
+
+def to_spark_data_type(field: Field) -> types.DataType:
+    """
+    Convert a field to a Spark DataType.
+
+    Args:
+        field (Field): The field to convert.
+
+    Returns:
+        types.DataType: The corresponding Spark DataType.
+    """
+    field_type = field.type
+    if field_type is None or field_type in ["null"]:
+        return types.NullType()
+    if field_type == "array":
+        return types.ArrayType(to_spark_data_type(field.items))
+    if field_type in ["object", "record", "struct"]:
+        return types.StructType(to_struct_type(field.fields))
+    if field_type == "map":
+        return types.MapType(to_spark_data_type(field.keys), to_spark_data_type(field.values))
+    if field_type == "variant":
+        return types.VariantType()
+    if field_type in ["string", "varchar", "text"]:
+        return types.StringType()
+    if field_type in ["number", "decimal", "numeric"]:
+        precision = field.precision if field.precision is not None else 38
+        scale = field.scale if field.scale is not None else 0
+        return types.DecimalType(precision=precision, scale=scale)
+    if field_type in ["integer", "int"]:
+        return types.IntegerType()
+    if field_type == "long":
+        return types.LongType()
+    if field_type == "float":
+        return types.FloatType()
+    if field_type == "double":
+        return types.DoubleType()
+    if field_type == "boolean":
+        return types.BooleanType()
+    if field_type in ["timestamp", "timestamp_tz"]:
+        return types.TimestampType()
+    if field_type == "timestamp_ntz":
+        return types.TimestampNTZType()
+    if field_type == "date":
+        return types.DateType()
+    if field_type == "bytes":
+        return types.BinaryType()
+    return types.StringType()  # default if no condition is met
+
+
+def to_spark_metadata(field: Field) -> dict[str, str]:
+    """
+    Convert a field to a Spark metadata dictonary.
+
+    Args:
+        field (Field): The field to convert.
+
+    Returns:
+        dict: dictionary that can be supplied to Spark as metadata for a StructField
+    """
+
+    metadata = {}
+    if field.description:
+        metadata["comment"] = field.description
+
+    return metadata
+
+
+def print_schema(dtype: types.DataType) -> str:
+    """
+    Converts a PySpark DataType schema to its equivalent code representation.
+
+    Args:
+        dtype (types.DataType): The PySpark DataType schema to be converted.
+
+    Returns:
+        str: The code representation of the PySpark DataType schema.
+    """
+
+    def indent(text: str, level: int) -> str:
+        """
+        Indents each line of the given text by a specified number of levels.
+
+        Args:
+            text (str): The text to be indented.
+            level (int): The number of indentation levels.
+
+        Returns:
+            str: The indented text.
+        """
+        return "\n".join([f"{' ' * level}{line}" for line in text.split("\n")])
+
+    def repr_column(column: types.StructField) -> str:
+        """
+        Converts a PySpark StructField to its code representation.
+
+        Args:
+            column (types.StructField): The StructField to be converted.
+
+        Returns:
+            str: The code representation of the StructField.
+        """
+        name = f'"{column.name}"'
+        data_type = indent(print_schema(column.dataType), 1)
+        nullable = indent(f"{column.nullable}", 1)
+        if column.metadata:
+            metadata = indent(f"{json.dumps(column.metadata)}", 1)
+            return f"StructField({name},\n{data_type},\n{nullable},\n{metadata}\n)"
+        else:
+            return f"StructField({name},\n{data_type},\n{nullable}\n)"
+
+    def format_struct_type(struct_type: types.StructType) -> str:
+        """
+        Converts a PySpark StructType to its code representation.
+
+        Args:
+            struct_type (types.StructType): The StructType to be converted.
+
+        Returns:
+            str: The code representation of the StructType.
+        """
+        fields = ",\n".join([indent(repr_column(field), 1) for field in struct_type.fields])
+        return f"StructType([\n{fields}\n])"
+
+    if isinstance(dtype, types.StructType):
+        return format_struct_type(dtype)
+    elif isinstance(dtype, types.ArrayType):
+        return f"ArrayType({print_schema(dtype.elementType)})"
+    elif isinstance(dtype, types.MapType):
+        return f"MapType(\n{indent(print_schema(dtype.keyType), 1)}, {print_schema(dtype.valueType)})"
+    elif isinstance(dtype, types.DecimalType):
+        return f"DecimalType({dtype.precision}, {dtype.scale})"
+    else:
+        dtype_str = str(dtype)
+        return dtype_str if dtype_str.endswith("()") else f"{dtype_str}()"
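Since `to_spark_schema` only needs a `Model` with `Field` objects, it can be exercised without a full contract. Constructing `Field`/`Model` directly with keyword arguments is an assumption about the pydantic models (normally they come from a parsed `datacontract.yaml`), and PySpark must be installed:

```python
from datacontract.export.spark_converter import print_schema, to_spark_schema
from datacontract.model.data_contract_specification import Field, Model

# Assumption: the pydantic models accept these keyword arguments directly.
orders = Model(
    description="Webshop orders",
    fields={
        "order_id": Field(type="string", required=True),
        "amount": Field(type="decimal", precision=10, scale=2),
        "tags": Field(type="array", items=Field(type="string")),
    },
)

schema = to_spark_schema(orders)  # pyspark.sql.types.StructType
print(print_schema(schema))       # code-style rendering, as used by to_spark()
```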
datacontract/export/sql_converter.py

@@ -1,7 +1,29 @@
+from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
 from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import DataContractSpecification, Model
 
 
+class SqlExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
+        server_type = _determine_sql_server_type(
+            data_contract,
+            sql_server_type,
+        )
+        return to_sql_ddl(data_contract, server_type, export_args.get("server"))
+
+
+class SqlQueryExporter(Exporter):
+    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        server_type = _determine_sql_server_type(data_contract, sql_server_type, export_args.get("server"))
+        return to_sql_query(
+            data_contract,
+            model_name,
+            model_value,
+            server_type,
+        )
+
+
 def to_sql_query(
     data_contract_spec: DataContractSpecification, model_name: str, model_value: Model, server_type: str = "snowflake"
 ) -> str:

@@ -37,7 +59,9 @@ def _to_sql_query(model_name, model_value, server_type) -> str:
     return result
 
 
-def to_sql_ddl(
+def to_sql_ddl(
+    data_contract_spec: DataContractSpecification, server_type: str = "snowflake", server: str = None
+) -> str:
     if data_contract_spec is None:
         return ""
     if data_contract_spec.models is None or len(data_contract_spec.models) == 0:

@@ -45,7 +69,12 @@ def to_sql_ddl(data_contract_spec: DataContractSpecification, server_type: str =
 
     table_prefix = ""
 
-
+    if server is None:
+        servers = data_contract_spec.servers
+    else:
+        servers = {server: data_contract_spec.servers[server]}
+
+    for server_name, server in iter(servers.items()):
         if server.type == "snowflake":
             server_type = "snowflake"
             break

@@ -63,6 +92,7 @@ def to_sql_ddl(data_contract_spec: DataContractSpecification, server_type: str =
     result = ""
     result += f"-- Data Contract: {data_contract_spec.id}\n"
     result += f"-- SQL Dialect: {server_type}\n"
+
     for model_name, model in iter(data_contract_spec.models.items()):
         result += _to_sql_table(table_prefix + model_name, model, server_type)
 

@@ -83,10 +113,12 @@ def _to_sql_table(model_name, model, server_type="snowflake"):
         result += f" {field_name} {type}"
         if field.required:
             result += " not null"
-        if field.primary:
+        if field.primaryKey or field.primary:
             result += " primary key"
         if server_type == "databricks" and field.description is not None:
             result += f' COMMENT "{_escape(field.description)}"'
+        if server_type == "snowflake" and field.description is not None:
+            result += f" COMMENT '{_escape(field.description)}'"
         if current_field_index < fields:
             result += ","
         result += "\n"

@@ -94,6 +126,8 @@ def _to_sql_table(model_name, model, server_type="snowflake"):
     result += ")"
     if server_type == "databricks" and model.description is not None:
         result += f' COMMENT "{_escape(model.description)}"'
+    if server_type == "snowflake" and model.description is not None:
+        result += f" COMMENT='{_escape(model.description)}'"
     result += ";\n"
     return result
 
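The DDL path gains Snowflake `COMMENT` clauses and honors `field.primaryKey` in addition to the older `field.primary`. A sketch of calling `to_sql_ddl` on a hand-built specification; as above, the inline construction of the models and the exact rendered DDL are assumptions:

```python
from datacontract.export.sql_converter import to_sql_ddl
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model

# Assumption: keyword construction of the pydantic models mirrors the YAML structure.
spec = DataContractSpecification(
    id="orders-v1",
    models={
        "orders": Model(
            description="All webshop orders",
            fields={
                "order_id": Field(type="string", required=True, primaryKey=True),
                "amount": Field(type="decimal", description="Total order amount"),
            },
        )
    },
)

# No servers defined, so the server_type argument decides the dialect.
print(to_sql_ddl(spec, server_type="snowflake"))
# -- Data Contract: orders-v1
# -- SQL Dialect: snowflake
# ... CREATE TABLE statement with COMMENT 'Total order amount' on the column
# ... and COMMENT='All webshop orders' on the table (exact output not verified)
```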