datacontract-cli 0.10.14__py3-none-any.whl → 0.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic. Click here for more details.
- datacontract/breaking/breaking.py +229 -11
- datacontract/breaking/breaking_rules.py +24 -0
- datacontract/catalog/catalog.py +1 -1
- datacontract/cli.py +100 -33
- datacontract/data_contract.py +26 -4
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
- datacontract/engines/soda/check_soda_execute.py +7 -5
- datacontract/engines/soda/connections/duckdb.py +1 -0
- datacontract/engines/soda/connections/kafka.py +12 -12
- datacontract/export/avro_idl_converter.py +1 -2
- datacontract/export/bigquery_converter.py +4 -3
- datacontract/export/data_caterer_converter.py +1 -1
- datacontract/export/dbml_converter.py +2 -4
- datacontract/export/dbt_converter.py +45 -39
- datacontract/export/exporter.py +2 -1
- datacontract/export/exporter_factory.py +7 -2
- datacontract/export/go_converter.py +3 -2
- datacontract/export/great_expectations_converter.py +202 -40
- datacontract/export/html_export.py +1 -1
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +3 -2
- datacontract/export/odcs_v2_exporter.py +1 -1
- datacontract/export/odcs_v3_exporter.py +44 -30
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +1 -1
- datacontract/export/rdf_converter.py +4 -5
- datacontract/export/sodacl_converter.py +9 -4
- datacontract/export/spark_converter.py +7 -6
- datacontract/export/sql_converter.py +1 -2
- datacontract/export/sqlalchemy_converter.py +1 -2
- datacontract/export/terraform_converter.py +1 -1
- datacontract/imports/avro_importer.py +1 -1
- datacontract/imports/bigquery_importer.py +1 -1
- datacontract/imports/dbml_importer.py +2 -2
- datacontract/imports/dbt_importer.py +80 -15
- datacontract/imports/glue_importer.py +5 -3
- datacontract/imports/iceberg_importer.py +17 -7
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +7 -1
- datacontract/imports/jsonschema_importer.py +3 -2
- datacontract/imports/odcs_v2_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +7 -2
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/spark_importer.py +2 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/imports/unity_importer.py +3 -3
- datacontract/integration/opentelemetry.py +0 -1
- datacontract/lint/lint.py +2 -1
- datacontract/lint/linters/description_linter.py +1 -0
- datacontract/lint/linters/example_model_linter.py +1 -0
- datacontract/lint/linters/field_pattern_linter.py +1 -0
- datacontract/lint/linters/field_reference_linter.py +1 -0
- datacontract/lint/linters/notice_period_linter.py +1 -0
- datacontract/lint/linters/quality_schema_linter.py +1 -0
- datacontract/lint/linters/valid_constraints_linter.py +1 -0
- datacontract/lint/resolve.py +7 -3
- datacontract/lint/schema.py +1 -1
- datacontract/model/data_contract_specification.py +13 -6
- datacontract/model/run.py +21 -12
- datacontract/templates/index.html +6 -6
- datacontract/web.py +2 -3
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/METADATA +163 -60
- datacontract_cli-0.10.16.dist-info/RECORD +106 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/WHEEL +1 -1
- datacontract_cli-0.10.14.dist-info/RECORD +0 -103
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/top_level.txt +0 -0
|
@@ -1,49 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module provides functionalities to export data contracts to Great Expectations suites.
|
|
3
|
+
It includes definitions for exporting different types of data (pandas, Spark, SQL) into
|
|
4
|
+
Great Expectations expectations format.
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
import json
|
|
2
|
-
from
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any, Dict, List
|
|
3
10
|
|
|
4
11
|
import yaml
|
|
5
12
|
|
|
6
|
-
from datacontract.
|
|
7
|
-
|
|
13
|
+
from datacontract.export.exporter import (
|
|
14
|
+
Exporter,
|
|
15
|
+
_check_models_for_export,
|
|
16
|
+
)
|
|
17
|
+
from datacontract.export.pandas_type_converter import convert_to_pandas_type
|
|
18
|
+
from datacontract.export.spark_converter import to_spark_data_type
|
|
19
|
+
from datacontract.export.sql_type_converter import convert_to_sql_type
|
|
20
|
+
from datacontract.model.data_contract_specification import (
|
|
21
|
+
DataContractSpecification,
|
|
22
|
+
Field,
|
|
23
|
+
Quality,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class GreatExpectationsEngine(Enum):
    """Execution engines an exported expectation suite can be generated for.

    Each member's value is the plain string form of the engine name, so a
    caller holding a string can compare it against ``<member>.value``.
    """

    # Pandas engine.
    pandas = "pandas"
    # Spark engine.
    spark = "spark"
    # SQL engine.
    sql = "sql"
|
|
39
|
+
|
|
8
40
|
|
|
41
|
+
class GreatExpectationsExporter(Exporter):
    """Exporter that renders one data contract model as a Great Expectations suite."""

    def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
        """Export a single model of the contract as a Great Expectations suite.

        Args:
            data_contract (DataContractSpecification): The data contract specification.
            model (str): Name of the model to export.
            server (str): Server information; not read by this method.
            sql_server_type (str): SQL dialect name; ``"auto"`` is replaced by ``"snowflake"``.
            export_args (dict): Extra options; ``"suite_name"`` and ``"engine"`` are read
                with ``.get`` and may be absent.

        Returns:
            dict: Whatever ``to_great_expectations`` returns.
            NOTE(review): that function is annotated as returning a JSON ``str``,
            which disagrees with the ``dict`` annotation here — confirm which is intended.
        """
        suite_name = export_args.get("suite_name")
        target_engine = export_args.get("engine")
        # Only the resolved model name is used; the model itself is looked up
        # again by to_great_expectations.
        model_name, _ = _check_models_for_export(data_contract, model, self.export_format)
        if sql_server_type == "auto":
            sql_server_type = "snowflake"
        return to_great_expectations(data_contract, model_name, suite_name, target_engine, sql_server_type)
|
|
17
67
|
|
|
18
68
|
|
|
19
|
-
def to_great_expectations(
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
69
|
+
def to_great_expectations(
    data_contract_spec: DataContractSpecification,
    model_key: str,
    expectation_suite_name: str | None = None,
    engine: str | None = None,
    sql_server_type: str = "snowflake",
) -> str:
    """Build the Great Expectations suite for one model of a data contract.

    Args:
        data_contract_spec (DataContractSpecification): The data contract specification.
        model_key (str): Key of the model inside ``data_contract_spec.models``.
        expectation_suite_name (str | None): Suite name; when empty or None it
            defaults to ``"<model_key>.<contract version>"``.
        engine (str | None): Optional engine type (e.g., "pandas", "spark").
        sql_server_type (str): The type of SQL server (default is "snowflake").

    Returns:
        str: JSON string of the Great Expectations suite.
    """
    # Fall back to "<model>.<version>" when the caller supplied no suite name.
    suite_name = expectation_suite_name or f"{model_key}.{data_contract_spec.info.version}"

    model_value = data_contract_spec.models.get(model_key)
    quality_checks = get_quality_checks(data_contract_spec.quality)

    # Field-derived expectations come first, then the contract's quality checks.
    expectations = [
        *model_to_expectations(model_value.fields, engine, sql_server_type),
        *checks_to_expectations(quality_checks, model_key),
    ]
    return to_suite(expectations, suite_name)
|
|
34
100
|
|
|
35
101
|
|
|
36
|
-
def to_suite(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
)
|
|
102
|
+
def to_suite(expectations: List[Dict[str, Any]], expectation_suite_name: str) -> str:
|
|
103
|
+
"""Converts a list of expectations to a JSON-formatted suite.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
expectations (List[Dict[str, Any]]): List of expectations.
|
|
107
|
+
expectation_suite_name (str): Name of the expectation suite.
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
str: JSON string of the expectation suite.
|
|
111
|
+
"""
|
|
41
112
|
return json.dumps(
|
|
42
113
|
{
|
|
43
114
|
"data_asset_type": "null",
|
|
44
|
-
"expectation_suite_name":
|
|
45
|
-
model_key=model_key, contract_version=contract_version
|
|
46
|
-
),
|
|
115
|
+
"expectation_suite_name": expectation_suite_name,
|
|
47
116
|
"expectations": expectations,
|
|
48
117
|
"meta": {},
|
|
49
118
|
},
|
|
@@ -51,22 +120,53 @@ def to_suite(
|
|
|
51
120
|
)
|
|
52
121
|
|
|
53
122
|
|
|
54
|
-
def model_to_expectations(fields: Dict[str, Field]) -> List[Dict[str, Any]]:
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
123
|
+
def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_server_type: str) -> List[Dict[str, Any]]:
    """Collect the expectations derived from a model's fields.

    The column-order expectation is added first, followed by the per-field
    expectations in the iteration order of ``fields``.

    Args:
        fields (Dict[str, Field]): Dictionary of model fields.
        engine (str | None): Engine type (e.g., "pandas", "spark").
        sql_server_type (str): SQL server type.

    Returns:
        List[Dict[str, Any]]: List of expectations.
    """
    collected: List[Dict[str, Any]] = []
    # Both helpers append to the list they are handed.
    add_column_order_exp(fields, collected)
    for name, spec in fields.items():
        add_field_expectations(name, spec, collected, engine, sql_server_type)
    return collected
|
|
65
139
|
|
|
66
140
|
|
|
67
|
-
def add_field_expectations(
|
|
141
|
+
def add_field_expectations(
|
|
142
|
+
field_name,
|
|
143
|
+
field: Field,
|
|
144
|
+
expectations: List[Dict[str, Any]],
|
|
145
|
+
engine: str | None,
|
|
146
|
+
sql_server_type: str,
|
|
147
|
+
) -> List[Dict[str, Any]]:
|
|
148
|
+
"""Adds expectations for a specific field based on its properties.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
field_name (str): The name of the field.
|
|
152
|
+
field (Field): The field object.
|
|
153
|
+
expectations (List[Dict[str, Any]]): The expectations list to update.
|
|
154
|
+
engine (str | None): Engine type (e.g., "pandas", "spark").
|
|
155
|
+
sql_server_type (str): SQL server type.
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
List[Dict[str, Any]]: Updated list of expectations.
|
|
159
|
+
"""
|
|
68
160
|
if field.type is not None:
|
|
69
|
-
|
|
161
|
+
if engine == GreatExpectationsEngine.spark.value:
|
|
162
|
+
field_type = to_spark_data_type(field).__class__.__name__
|
|
163
|
+
elif engine == GreatExpectationsEngine.pandas.value:
|
|
164
|
+
field_type = convert_to_pandas_type(field)
|
|
165
|
+
elif engine == GreatExpectationsEngine.sql.value:
|
|
166
|
+
field_type = convert_to_sql_type(field, sql_server_type)
|
|
167
|
+
else:
|
|
168
|
+
field_type = field.type
|
|
169
|
+
expectations.append(to_column_types_exp(field_name, field_type))
|
|
70
170
|
if field.unique:
|
|
71
171
|
expectations.append(to_column_unique_exp(field_name))
|
|
72
172
|
if field.maxLength is not None or field.minLength is not None:
|
|
@@ -74,11 +174,16 @@ def add_field_expectations(field_name, field: Field, expectations: List[Dict[str
|
|
|
74
174
|
if field.minimum is not None or field.maximum is not None:
|
|
75
175
|
expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))
|
|
76
176
|
|
|
77
|
-
# TODO: all constraints
|
|
78
177
|
return expectations
|
|
79
178
|
|
|
80
179
|
|
|
81
180
|
def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str, Any]]):
|
|
181
|
+
"""Adds expectation for column ordering.
|
|
182
|
+
|
|
183
|
+
Args:
|
|
184
|
+
fields (Dict[str, Field]): Dictionary of fields.
|
|
185
|
+
expectations (List[Dict[str, Any]]): The expectations list to update.
|
|
186
|
+
"""
|
|
82
187
|
expectations.append(
|
|
83
188
|
{
|
|
84
189
|
"expectation_type": "expect_table_columns_to_match_ordered_list",
|
|
@@ -89,6 +194,15 @@ def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str,
|
|
|
89
194
|
|
|
90
195
|
|
|
91
196
|
def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
|
|
197
|
+
"""Creates a column type expectation.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
field_name (str): The name of the field.
|
|
201
|
+
field_type (str): The type of the field.
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
Dict[str, Any]: Column type expectation.
|
|
205
|
+
"""
|
|
92
206
|
return {
|
|
93
207
|
"expectation_type": "expect_column_values_to_be_of_type",
|
|
94
208
|
"kwargs": {"column": field_name, "type_": field_type},
|
|
@@ -97,18 +211,54 @@ def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
|
|
|
97
211
|
|
|
98
212
|
|
|
99
213
|
def to_column_unique_exp(field_name) -> Dict[str, Any]:
    """Build the ``expect_column_values_to_be_unique`` expectation for a column.

    Args:
        field_name (str): The name of the field.

    Returns:
        Dict[str, Any]: Expectation with ``expectation_type``, ``kwargs`` and an empty ``meta``.
    """
    expectation: Dict[str, Any] = {"expectation_type": "expect_column_values_to_be_unique"}
    expectation["kwargs"] = {"column": field_name}
    expectation["meta"] = {}
    return expectation
|
|
101
227
|
|
|
102
228
|
|
|
103
229
|
def to_column_length_exp(field_name, min_length, max_length) -> Dict[str, Any]:
    """Build the ``expect_column_value_lengths_to_be_between`` expectation for a column.

    Args:
        field_name (str): The name of the field.
        min_length (int | None): Minimum length, passed through as ``min_value``.
        max_length (int | None): Maximum length, passed through as ``max_value``.

    Returns:
        Dict[str, Any]: Column length expectation.
    """
    length_kwargs = {"column": field_name, "min_value": min_length, "max_value": max_length}
    return {
        "expectation_type": "expect_column_value_lengths_to_be_between",
        "kwargs": length_kwargs,
        "meta": {},
    }
|
|
109
249
|
|
|
110
250
|
|
|
111
251
|
def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
|
|
252
|
+
"""Creates a column min-max value expectation.
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
field_name (str): The name of the field.
|
|
256
|
+
minimum (float | None): Minimum value.
|
|
257
|
+
maximum (float | None): Maximum value.
|
|
258
|
+
|
|
259
|
+
Returns:
|
|
260
|
+
Dict[str, Any]: Column min-max value expectation.
|
|
261
|
+
"""
|
|
112
262
|
return {
|
|
113
263
|
"expectation_type": "expect_column_values_to_be_between",
|
|
114
264
|
"kwargs": {"column": field_name, "min_value": minimum, "max_value": maximum},
|
|
@@ -117,6 +267,14 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
|
|
|
117
267
|
|
|
118
268
|
|
|
119
269
|
def get_quality_checks(quality: Quality) -> Dict[str, Any]:
|
|
270
|
+
"""Retrieves quality checks defined in a data contract.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
quality (Quality): Quality object from the data contract.
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
Dict[str, Any]: Dictionary of quality checks.
|
|
277
|
+
"""
|
|
120
278
|
if quality is None:
|
|
121
279
|
return {}
|
|
122
280
|
if quality.type is None:
|
|
@@ -131,11 +289,14 @@ def get_quality_checks(quality: Quality) -> Dict[str, Any]:
|
|
|
131
289
|
|
|
132
290
|
|
|
133
291
|
def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
|
|
134
|
-
"""
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
292
|
+
"""Converts quality checks to a list of expectations.
|
|
293
|
+
|
|
294
|
+
Args:
|
|
295
|
+
quality_checks (Dict[str, Any]): Dictionary of quality checks by model.
|
|
296
|
+
model_key (str): The model key.
|
|
297
|
+
|
|
298
|
+
Returns:
|
|
299
|
+
List[Dict[str, Any]]: List of expectations for the model.
|
|
139
300
|
"""
|
|
140
301
|
if quality_checks is None or model_key not in quality_checks:
|
|
141
302
|
return []
|
|
@@ -148,3 +309,4 @@ def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> Li
|
|
|
148
309
|
if isinstance(model_quality_checks, str):
|
|
149
310
|
expectation_list = json.loads(model_quality_checks)
|
|
150
311
|
return expectation_list
|
|
312
|
+
return []
|
|
@@ -7,8 +7,8 @@ import pytz
|
|
|
7
7
|
import yaml
|
|
8
8
|
from jinja2 import Environment, PackageLoader, select_autoescape
|
|
9
9
|
|
|
10
|
-
from datacontract.model.data_contract_specification import DataContractSpecification
|
|
11
10
|
from datacontract.export.exporter import Exporter
|
|
11
|
+
from datacontract.model.data_contract_specification import DataContractSpecification
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class HtmlExporter(Exporter):
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from pyiceberg import types
|
|
2
|
+
from pyiceberg.schema import Schema, assign_fresh_schema_ids
|
|
3
|
+
|
|
4
|
+
from datacontract.export.exporter import Exporter
|
|
5
|
+
from datacontract.model.data_contract_specification import (
|
|
6
|
+
DataContractSpecification,
|
|
7
|
+
Field,
|
|
8
|
+
Model,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class IcebergExporter(Exporter):
    """Exporter that turns a data contract model into an Iceberg schema."""

    def export(self, data_contract: DataContractSpecification, model, server, sql_server_type, export_args):
        """Export one model of the data contract as an Iceberg JSON schema.

        Args:
            data_contract (DataContractSpecification): The data contract specification.
            model: The model to export, currently just supports one model.
            server: Not used in this implementation.
            sql_server_type: Not used in this implementation.
            export_args: Not read by this method.

        Returns:
            str: A string representation of the Iceberg json schema.
        """
        # All of the work happens in the module-level converter.
        return to_iceberg(data_contract, model)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def to_iceberg(contract: DataContractSpecification, model: str) -> str:
    """Serialize one model of a data contract as an Iceberg JSON schema string.

    The JSON string follows https://iceberg.apache.org/spec/#appendix-c-json-serialization.

    Args:
        contract (DataContractSpecification): The data contract specification containing models.
        model: Name of the model to export. ``None`` or ``"all"`` selects the
            contract's only model.

    Returns:
        str: A string representation of the Iceberg json schema.

    Raises:
        Exception: If ``model`` names a model that is not in the contract, or if
            ``model`` is ``None``/``"all"`` and the contract does not hold exactly one model.
    """
    models = contract.models

    if model is not None and model != "all":
        # An explicit model name must exist in the contract.
        if model not in models:
            raise Exception(f"model {model} not found in contract")
        return to_iceberg_schema(models[model]).model_dump_json()

    model_count = len(models)
    if model_count != 1:
        # Iceberg doesn't have a way to combine multiple models into a single schema, an alternative would be to export json lines
        raise Exception(f"Can only output one model at a time, found {model_count} models")

    (only_model,) = models.values()
    return to_iceberg_schema(only_model).model_dump_json()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def to_iceberg_schema(model: Model) -> Schema:
    """Convert a data contract model to an Iceberg ``Schema``.

    Every contract field becomes one top-level Iceberg field. Fields flagged
    as ``primaryKey`` are recorded as the schema's identifier fields.

    Args:
        model (Model): The model to convert.

    Returns:
        Schema: The Iceberg schema with freshly assigned field IDs and the
        primary-key fields listed in ``identifier_field_ids``.
    """
    iceberg_fields = []
    primary_key_names = set()
    for field_name, spec_field in model.fields.items():
        iceberg_field = make_field(field_name, spec_field)
        iceberg_fields.append(iceberg_field)

        if spec_field.primaryKey:
            primary_key_names.add(iceberg_field.name)

    # make_field creates every field with id 0; apply non-0 field IDs so the
    # identifier fields can be referenced by id.
    schema = assign_fresh_schema_ids(Schema(*iceberg_fields))
    schema.identifier_field_ids.extend(
        field.field_id for field in schema.fields if field.name in primary_key_names
    )

    return schema
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def make_field(field_name, field):
    """Build the Iceberg ``NestedField`` for one contract field.

    Args:
        field_name (str): Name given to the Iceberg field.
        field (Field): Contract field supplying the type and the ``required`` flag.

    Returns:
        types.NestedField: The Iceberg field, created with the placeholder ``field_id=0``.
    """
    field_type = get_field_type(field)

    # Note: might want to re-populate field_id from config['icebergFieldId'] if it exists, however, it gets
    # complicated since field_ids impact the list and map element_ids, and the importer is not keeping track of those.
    # Even if IDs are re-constituted, it seems like the SDK code would still reset them before any operation against a catalog,
    # so it's likely not worth it.

    # Note 2: field_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values.
    # also, the Iceberg sdk catalog code will re-set the fieldIDs prior to executing any table operations on the schema
    # ref: https://github.com/apache/iceberg-python/pull/1072

    # `is True` hands Iceberg a strict bool: anything other than an explicit
    # True (including an unset flag) is treated as "not required".
    # NOTE(review): presumably Field.required is None when the contract omits it — confirm against the Field model.
    return types.NestedField(field_id=0, name=field_name, field_type=field_type, required=field.required is True)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def make_list(item):
    """Build the Iceberg ``ListType`` whose elements are described by ``item``.

    Args:
        item (Field): Contract field describing the list elements.

    Returns:
        types.ListType: The Iceberg list type, created with the placeholder ``element_id=0``.
    """
    field_type = get_field_type(item)

    # element_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field)
    # `is True` hands Iceberg a strict bool: anything other than an explicit
    # True (including an unset flag) is treated as "not required".
    # NOTE(review): presumably Field.required is None when the contract omits it — confirm against the Field model.
    return types.ListType(element_id=0, element_type=field_type, element_required=item.required is True)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def make_map(field):
    """Build the Iceberg ``MapType`` for a contract field of type ``map``.

    Args:
        field (Field): Contract field whose ``keys`` and ``values`` describe the
            map's key and value types.

    Returns:
        types.MapType: The Iceberg map type, created with placeholder ``key_id=0`` and ``value_id=0``.
    """
    key_type = get_field_type(field.keys)
    value_type = get_field_type(field.values)

    # key_id and value_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field)
    # `is True` hands Iceberg a strict bool: anything other than an explicit
    # True (including an unset flag) is treated as "not required".
    # NOTE(review): presumably Field.required is None when the contract omits it — confirm against the Field model.
    return types.MapType(
        key_id=0,
        key_type=key_type,
        value_id=0,
        value_type=value_type,
        value_required=field.values.required is True,
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def to_struct_type(fields: dict[str, Field]) -> types.StructType:
    """Build an Iceberg ``StructType`` from a mapping of contract fields.

    Args:
        fields (dict[str, Field]): The fields to convert, keyed by field name.

    Returns:
        types.StructType: Struct holding one Iceberg field per entry, in the mapping's order.
    """
    members = [make_field(name, spec) for name, spec in fields.items()]
    return types.StructType(*members)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_field_type(field: Field) -> types.IcebergType:
|
|
146
|
+
"""
|
|
147
|
+
Convert a field to a Iceberg IcebergType.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
field (Field): The field to convert.
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
types.IcebergType: The corresponding Iceberg IcebergType.
|
|
154
|
+
"""
|
|
155
|
+
field_type = field.type
|
|
156
|
+
if field_type is None or field_type in ["null"]:
|
|
157
|
+
return types.NullType()
|
|
158
|
+
if field_type == "array":
|
|
159
|
+
return make_list(field.items)
|
|
160
|
+
if field_type == "map":
|
|
161
|
+
return make_map(field)
|
|
162
|
+
if field_type in ["object", "record", "struct"]:
|
|
163
|
+
return to_struct_type(field.fields)
|
|
164
|
+
if field_type in ["string", "varchar", "text"]:
|
|
165
|
+
return types.StringType()
|
|
166
|
+
if field_type in ["number", "decimal", "numeric"]:
|
|
167
|
+
precision = field.precision if field.precision is not None else 38
|
|
168
|
+
scale = field.scale if field.scale is not None else 0
|
|
169
|
+
return types.DecimalType(precision=precision, scale=scale)
|
|
170
|
+
if field_type in ["integer", "int"]:
|
|
171
|
+
return types.IntegerType()
|
|
172
|
+
if field_type in ["bigint", "long"]:
|
|
173
|
+
return types.LongType()
|
|
174
|
+
if field_type == "float":
|
|
175
|
+
return types.FloatType()
|
|
176
|
+
if field_type == "double":
|
|
177
|
+
return types.DoubleType()
|
|
178
|
+
if field_type == "boolean":
|
|
179
|
+
return types.BooleanType()
|
|
180
|
+
if field_type in ["timestamp", "timestamp_tz"]:
|
|
181
|
+
return types.TimestamptzType()
|
|
182
|
+
if field_type == "timestamp_ntz":
|
|
183
|
+
return types.TimestampType()
|
|
184
|
+
if field_type == "date":
|
|
185
|
+
return types.DateType()
|
|
186
|
+
if field_type == "bytes":
|
|
187
|
+
return types.BinaryType()
|
|
188
|
+
return types.BinaryType()
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from typing import Dict
|
|
3
3
|
|
|
4
|
-
from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
|
|
5
|
-
|
|
6
4
|
from datacontract.export.exporter import Exporter, _check_models_for_export
|
|
5
|
+
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class JsonSchemaExporter(Exporter):
|
|
@@ -51,6 +50,8 @@ def to_property(field: Field) -> dict:
|
|
|
51
50
|
property["type"] = json_type
|
|
52
51
|
if json_format is not None:
|
|
53
52
|
property["format"] = json_format
|
|
53
|
+
if field.primaryKey:
|
|
54
|
+
property["primaryKey"] = field.primaryKey
|
|
54
55
|
if field.unique:
|
|
55
56
|
property["unique"] = True
|
|
56
57
|
if json_type == "object":
|
|
@@ -2,8 +2,8 @@ from typing import Dict
|
|
|
2
2
|
|
|
3
3
|
import yaml
|
|
4
4
|
|
|
5
|
-
from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
|
|
6
5
|
from datacontract.export.exporter import Exporter
|
|
6
|
+
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class OdcsV2Exporter(Exporter):
|
|
@@ -3,7 +3,7 @@ from typing import Dict
|
|
|
3
3
|
import yaml
|
|
4
4
|
|
|
5
5
|
from datacontract.export.exporter import Exporter
|
|
6
|
-
from datacontract.model.data_contract_specification import DataContractSpecification,
|
|
6
|
+
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class OdcsV3Exporter(Exporter):
|
|
@@ -148,6 +148,10 @@ def to_odcs_schema(model_key, model_value: Model) -> dict:
|
|
|
148
148
|
if properties:
|
|
149
149
|
odcs_table["properties"] = properties
|
|
150
150
|
|
|
151
|
+
model_quality = to_odcs_quality_list(model_value.quality)
|
|
152
|
+
if len(model_quality) > 0:
|
|
153
|
+
odcs_table["quality"] = model_quality
|
|
154
|
+
|
|
151
155
|
odcs_table["customProperties"] = []
|
|
152
156
|
if model_value.model_extra is not None:
|
|
153
157
|
for key, value in model_value.model_extra.items():
|
|
@@ -257,38 +261,48 @@ def to_property(field_name: str, field: Field) -> dict:
|
|
|
257
261
|
del property["logicalTypeOptions"]
|
|
258
262
|
|
|
259
263
|
if field.quality is not None:
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
quality_dict = {"type": quality.type}
|
|
263
|
-
if quality.description is not None:
|
|
264
|
-
quality_dict["description"] = quality.description
|
|
265
|
-
if quality.query is not None:
|
|
266
|
-
quality_dict["query"] = quality.query
|
|
267
|
-
# dialect is not supported in v3.0.0
|
|
268
|
-
if quality.mustBe is not None:
|
|
269
|
-
quality_dict["mustBe"] = quality.mustBe
|
|
270
|
-
if quality.mustNotBe is not None:
|
|
271
|
-
quality_dict["mustNotBe"] = quality.mustNotBe
|
|
272
|
-
if quality.mustBeGreaterThan is not None:
|
|
273
|
-
quality_dict["mustBeGreaterThan"] = quality.mustBeGreaterThan
|
|
274
|
-
if quality.mustBeGreaterThanOrEqualTo is not None:
|
|
275
|
-
quality_dict["mustBeGreaterThanOrEqualTo"] = quality.mustBeGreaterThanOrEqualTo
|
|
276
|
-
if quality.mustBeLessThan is not None:
|
|
277
|
-
quality_dict["mustBeLessThan"] = quality.mustBeLessThan
|
|
278
|
-
if quality.mustBeLessThanOrEqualTo is not None:
|
|
279
|
-
quality_dict["mustBeLessThanOrEqualTo"] = quality.mustBeLessThanOrEqualTo
|
|
280
|
-
if quality.mustBeBetween is not None:
|
|
281
|
-
quality_dict["mustBeBetween"] = quality.mustBeBetween
|
|
282
|
-
if quality.mustNotBeBetween is not None:
|
|
283
|
-
quality_dict["mustNotBeBetween"] = quality.mustNotBeBetween
|
|
284
|
-
if quality.engine is not None:
|
|
285
|
-
quality_dict["engine"] = quality.engine
|
|
286
|
-
if quality.implementation is not None:
|
|
287
|
-
quality_dict["implementation"] = quality.implementation
|
|
288
|
-
quality_property.append(quality_dict)
|
|
264
|
+
quality_list = field.quality
|
|
265
|
+
quality_property = to_odcs_quality_list(quality_list)
|
|
289
266
|
if len(quality_property) > 0:
|
|
290
267
|
property["quality"] = quality_property
|
|
291
268
|
|
|
292
269
|
# todo enum
|
|
293
270
|
|
|
294
271
|
return property
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def to_odcs_quality_list(quality_list):
    """Convert a list of quality definitions to their ODCS dictionary form.

    Args:
        quality_list: Iterable of quality objects, or ``None``.

    Returns:
        list: One dictionary per quality entry, in input order; an empty list
        when ``quality_list`` is ``None`` or empty.
    """
    # A missing quality section (None) is treated like an empty one instead of
    # failing on iteration.
    if not quality_list:
        return []
    return [to_odcs_quality(quality) for quality in quality_list]
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def to_odcs_quality(quality):
    """Convert one quality definition to its ODCS dictionary form.

    ``type`` is always emitted; every other attribute is copied only when it
    is not ``None``, in the fixed order listed below.

    Args:
        quality: Quality object exposing ``type`` and the optional attributes below.

    Returns:
        dict: The ODCS quality entry.
    """
    # dialect is not supported in v3.0.0, so it is deliberately absent here.
    optional_attributes = (
        "description",
        "query",
        "mustBe",
        "mustNotBe",
        "mustBeGreaterThan",
        "mustBeGreaterThanOrEqualTo",
        "mustBeLessThan",
        "mustBeLessThanOrEqualTo",
        "mustBeBetween",
        "mustNotBeBetween",
        "engine",
        "implementation",
    )
    quality_dict = {"type": quality.type}
    for attribute in optional_attributes:
        value = getattr(quality, attribute)
        if value is not None:
            quality_dict[attribute] = value
    return quality_dict
|