datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
|
@@ -1,40 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module provides functionalities to export data contracts to Great Expectations suites.
|
|
3
|
+
It includes definitions for exporting different types of data (pandas, Spark, SQL) into
|
|
4
|
+
Great Expectations expectations format.
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
import json
|
|
2
|
-
from
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any, Dict, List
|
|
3
10
|
|
|
4
11
|
import yaml
|
|
5
12
|
|
|
6
|
-
from datacontract.
|
|
7
|
-
|
|
13
|
+
from datacontract.export.exporter import (
|
|
14
|
+
Exporter,
|
|
15
|
+
_check_models_for_export,
|
|
16
|
+
)
|
|
17
|
+
from datacontract.model.data_contract_specification import (
|
|
18
|
+
DataContractSpecification,
|
|
19
|
+
DeprecatedQuality,
|
|
20
|
+
Field,
|
|
21
|
+
Quality,
|
|
22
|
+
)
|
|
23
|
+
|
|
8
24
|
|
|
25
|
+
class GreatExpectationsEngine(Enum):
    """Engines for which column type expectations can be generated.

    Each member's value is the plain string that is compared against the
    ``engine`` argument when field types are translated.

    Attributes:
        pandas (str): The pandas engine.
        spark (str): The Spark engine.
        sql (str): The SQL engine.
    """

    # Declaration order is kept: it is also the iteration order of the enum.
    pandas = "pandas"
    spark = "spark"
    sql = "sql"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class GreatExpectationsExporter(Exporter):
    """Exporter that renders one data contract model as a Great Expectations suite.

    Methods:
        export: Converts a data contract model to a Great Expectations suite.
    """

    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
        """Export a data contract model as a Great Expectations suite.

        Args:
            data_contract (DataContractSpecification): The data contract specification.
            model (str): The model name to export.
            server (str): The server information; not used by this exporter.
            sql_server_type (str): Type of SQL server (e.g., "snowflake"). The value
                "auto" is replaced by "snowflake".
            export_args (dict): Additional arguments for export; the optional keys
                "suite_name" and "engine" are read.

        Returns:
            str: The suite as returned by ``to_great_expectations`` (a JSON string).
        """
        expectation_suite_name = export_args.get("suite_name")
        engine = export_args.get("engine")
        # Only the resolved model name is needed here; the model itself is looked
        # up again by name inside to_great_expectations.
        model_name, _ = _check_models_for_export(data_contract, model, self.export_format)
        if sql_server_type == "auto":
            sql_server_type = "snowflake"
        return to_great_expectations(data_contract, model_name, expectation_suite_name, engine, sql_server_type)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def to_great_expectations(
    data_contract_spec: DataContractSpecification,
    model_key: str,
    expectation_suite_name: str | None = None,
    engine: str | None = None,
    sql_server_type: str = "snowflake",
) -> str:
    """Build the Great Expectations suite for one model of a data contract.

    Expectations are collected in this order: the model-level quality checks,
    the expectations derived from the model's fields, and finally the checks
    taken from the deprecated contract-level quality section.

    Args:
        data_contract_spec (DataContractSpecification): The data contract specification.
        model_key (str): Key of the model inside ``data_contract_spec.models``.
        expectation_suite_name (str | None): Suite name; when empty or None the
            name ``"<model_key>.<contract version>"`` is used.
        engine (str | None): Optional engine type (e.g., "pandas", "spark").
        sql_server_type (str): The type of SQL server (default is "snowflake").

    Returns:
        str: JSON string of the Great Expectations suite.
    """
    suite_name = expectation_suite_name or f"{model_key}.{data_contract_spec.info.version}"
    model_value = data_contract_spec.models.get(model_key)

    # Support for Deprecated Quality (contract-level quality section).
    deprecated_checks = get_deprecated_quality_checks(data_contract_spec.quality)

    expectations = [
        *get_quality_checks(model_value.quality),
        *model_to_expectations(model_value.fields, engine, sql_server_type),
        *checks_to_expectations(deprecated_checks, model_key),
    ]
    return to_suite(expectations, suite_name)
|
|
25
104
|
|
|
26
105
|
|
|
27
|
-
def to_suite(
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
)
|
|
106
|
+
def to_suite(expectations: List[Dict[str, Any]], expectation_suite_name: str) -> str:
|
|
107
|
+
"""Converts a list of expectations to a JSON-formatted suite.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
expectations (List[Dict[str, Any]]): List of expectations.
|
|
111
|
+
expectation_suite_name (str): Name of the expectation suite.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
str: JSON string of the expectation suite.
|
|
115
|
+
"""
|
|
32
116
|
return json.dumps(
|
|
33
117
|
{
|
|
34
118
|
"data_asset_type": "null",
|
|
35
|
-
"expectation_suite_name":
|
|
36
|
-
model_key=model_key, contract_version=contract_version
|
|
37
|
-
),
|
|
119
|
+
"expectation_suite_name": expectation_suite_name,
|
|
38
120
|
"expectations": expectations,
|
|
39
121
|
"meta": {},
|
|
40
122
|
},
|
|
@@ -42,34 +124,79 @@ def to_suite(
|
|
|
42
124
|
)
|
|
43
125
|
|
|
44
126
|
|
|
45
|
-
def model_to_expectations(fields: Dict[str, Field]) -> List[Dict[str, Any]]:
|
|
46
|
-
"""
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
127
|
+
def model_to_expectations(fields: Dict[str, Field], engine: str | None, sql_server_type: str) -> List[Dict[str, Any]]:
    """Derive the expectations for all fields of a model.

    The column-order expectation comes first; then, field by field, the
    expectations from the field's properties followed by the field's own
    quality checks.

    Args:
        fields (Dict[str, Field]): Dictionary of model fields.
        engine (str | None): Engine type (e.g., "pandas", "spark").
        sql_server_type (str): SQL server type.

    Returns:
        List[Dict[str, Any]]: List of expectations.
    """
    collected: List[Dict[str, Any]] = []
    add_column_order_exp(fields, collected)
    for name, spec in fields.items():
        add_field_expectations(name, spec, collected, engine, sql_server_type)
        collected += get_quality_checks(spec.quality, name)
    return collected
|
|
56
144
|
|
|
57
145
|
|
|
58
|
-
def add_field_expectations(
|
|
146
|
+
def add_field_expectations(
    field_name,
    field: Field,
    expectations: List[Dict[str, Any]],
    engine: str | None,
    sql_server_type: str,
) -> List[Dict[str, Any]]:
    """Append the expectations implied by one field's declared properties.

    A type expectation is added when the field declares a type. The type name
    is translated for the Spark, pandas or SQL engine; for any other engine
    value the contract's type is used unchanged. Further expectations follow
    for ``unique``, ``minLength``/``maxLength``, ``minimum``/``maximum`` and a
    non-empty ``enum``.

    Args:
        field_name (str): The name of the field.
        field (Field): The field object.
        expectations (List[Dict[str, Any]]): The expectations list to update in place.
        engine (str | None): Engine type (e.g., "pandas", "spark").
        sql_server_type (str): SQL server type, passed to the SQL type converter.

    Returns:
        List[Dict[str, Any]]: The same ``expectations`` list that was passed in.
    """
    if field.type is not None:
        # The converters are imported lazily, only for the engine that was requested.
        if engine == GreatExpectationsEngine.spark.value:
            from datacontract.export.spark_converter import to_spark_data_type

            # Only the class name of the converted Spark type is recorded.
            resolved_type = to_spark_data_type(field).__class__.__name__
        elif engine == GreatExpectationsEngine.pandas.value:
            from datacontract.export.pandas_type_converter import convert_to_pandas_type

            resolved_type = convert_to_pandas_type(field)
        elif engine == GreatExpectationsEngine.sql.value:
            from datacontract.export.sql_type_converter import convert_to_sql_type

            resolved_type = convert_to_sql_type(field, sql_server_type)
        else:
            resolved_type = field.type
        expectations.append(to_column_types_exp(field_name, resolved_type))

    if field.unique:
        expectations.append(to_column_unique_exp(field_name))

    has_length_bound = field.maxLength is not None or field.minLength is not None
    if has_length_bound:
        expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength))

    has_value_bound = field.minimum is not None or field.maximum is not None
    if has_value_bound:
        expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))

    if field.enum is not None and len(field.enum) > 0:
        expectations.append(to_column_enum_exp(field_name, field.enum))

    return expectations
|
|
70
191
|
|
|
71
192
|
|
|
72
193
|
def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str, Any]]):
|
|
194
|
+
"""Adds expectation for column ordering.
|
|
195
|
+
|
|
196
|
+
Args:
|
|
197
|
+
fields (Dict[str, Field]): Dictionary of fields.
|
|
198
|
+
expectations (List[Dict[str, Any]]): The expectations list to update.
|
|
199
|
+
"""
|
|
73
200
|
expectations.append(
|
|
74
201
|
{
|
|
75
202
|
"expectation_type": "expect_table_columns_to_match_ordered_list",
|
|
@@ -80,6 +207,15 @@ def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str,
|
|
|
80
207
|
|
|
81
208
|
|
|
82
209
|
def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
|
|
210
|
+
"""Creates a column type expectation.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
field_name (str): The name of the field.
|
|
214
|
+
field_type (str): The type of the field.
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
Dict[str, Any]: Column type expectation.
|
|
218
|
+
"""
|
|
83
219
|
return {
|
|
84
220
|
"expectation_type": "expect_column_values_to_be_of_type",
|
|
85
221
|
"kwargs": {"column": field_name, "type_": field_type},
|
|
@@ -88,18 +224,54 @@ def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
|
|
|
88
224
|
|
|
89
225
|
|
|
90
226
|
def to_column_unique_exp(field_name) -> Dict[str, Any]:
    """Build the expectation that all values of a column are unique.

    Args:
        field_name (str): The name of the field.

    Returns:
        Dict[str, Any]: An ``expect_column_values_to_be_unique`` expectation
        with empty ``meta``.
    """
    return dict(
        expectation_type="expect_column_values_to_be_unique",
        kwargs=dict(column=field_name),
        meta={},
    )
|
|
92
240
|
|
|
93
241
|
|
|
94
242
|
def to_column_length_exp(field_name, min_length, max_length) -> Dict[str, Any]:
    """Build the expectation that bounds the length of a column's values.

    Args:
        field_name (str): The name of the field.
        min_length (int | None): Minimum length, passed through as ``min_value``.
        max_length (int | None): Maximum length, passed through as ``max_value``.

    Returns:
        Dict[str, Any]: An ``expect_column_value_lengths_to_be_between``
        expectation with empty ``meta``.
    """
    bounds = {"column": field_name, "min_value": min_length, "max_value": max_length}
    return dict(
        expectation_type="expect_column_value_lengths_to_be_between",
        kwargs=bounds,
        meta={},
    )
|
|
100
262
|
|
|
101
263
|
|
|
102
264
|
def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
|
|
265
|
+
"""Creates a column min-max value expectation.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
field_name (str): The name of the field.
|
|
269
|
+
minimum (float | None): Minimum value.
|
|
270
|
+
maximum (float | None): Maximum value.
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
Dict[str, Any]: Column min-max value expectation.
|
|
274
|
+
"""
|
|
103
275
|
return {
|
|
104
276
|
"expectation_type": "expect_column_values_to_be_between",
|
|
105
277
|
"kwargs": {"column": field_name, "min_value": minimum, "max_value": maximum},
|
|
@@ -107,7 +279,32 @@ def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
|
|
|
107
279
|
}
|
|
108
280
|
|
|
109
281
|
|
|
110
|
-
def
|
|
282
|
+
def to_column_enum_exp(field_name, enum_list: List[str]) -> Dict[str, Any]:
    """Build the expectation that a column only contains the given values.

    Args:
        field_name (str): The name of the field.
        enum_list (List[str]): The allowed values; the list is placed into the
            expectation as-is, not copied.

    Returns:
        Dict[str, Any]: An ``expect_column_values_to_be_in_set`` expectation
        with empty ``meta``.
    """
    return dict(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs=dict(column=field_name, value_set=enum_list),
        meta={},
    )
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def get_deprecated_quality_checks(quality: DeprecatedQuality) -> Dict[str, Any]:
|
|
300
|
+
"""Retrieves quality checks defined in a data contract.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
quality (Quality): Quality object from the data contract.
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
Dict[str, Any]: Dictionary of quality checks.
|
|
307
|
+
"""
|
|
111
308
|
if quality is None:
|
|
112
309
|
return {}
|
|
113
310
|
if quality.type is None:
|
|
@@ -121,12 +318,35 @@ def get_quality_checks(quality: Quality) -> Dict[str, Any]:
|
|
|
121
318
|
return quality_specification
|
|
122
319
|
|
|
123
320
|
|
|
124
|
-
def
|
|
321
|
+
def get_quality_checks(qualities: List[Quality], field_name: str | None = None) -> List[Dict[str, Any]]:
    """Collect the Great Expectations checks from a list of quality entries.

    Only entries whose ``engine`` equals "great-expectations" (compared
    case-insensitively) are used; the entry's ``implementation`` becomes the
    expectation.

    Args:
        qualities (List[Quality]): Quality entries of a model or field. ``None``
            is treated like an empty list.
        field_name (str | None): Field name if the quality list is attached to a
            specific field. When given, it is stored under the "column" key of
            every expectation that is a dict.

    Returns:
        List[Dict[str, Any]]: The expectations, in the order of ``qualities``.
    """
    expectations = []
    for quality in qualities or []:
        if quality is None or quality.engine is None:
            continue
        if quality.engine.lower() != "great-expectations":
            continue
        expectation = quality.implementation
        if field_name is not None and isinstance(expectation, dict):
            # Work on a shallow copy so that exporting does not write the
            # "column" key back into the contract's own implementation dict.
            expectation = {**expectation, "column": field_name}
        expectations.append(expectation)
    return expectations
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
|
|
342
|
+
"""Converts quality checks to a list of expectations.
|
|
343
|
+
|
|
344
|
+
Args:
|
|
345
|
+
quality_checks (Dict[str, Any]): Dictionary of quality checks by model.
|
|
346
|
+
model_key (str): The model key.
|
|
347
|
+
|
|
348
|
+
Returns:
|
|
349
|
+
List[Dict[str, Any]]: List of expectations for the model.
|
|
130
350
|
"""
|
|
131
351
|
if quality_checks is None or model_key not in quality_checks:
|
|
132
352
|
return []
|
|
@@ -139,3 +359,4 @@ def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> Li
|
|
|
139
359
|
if isinstance(model_quality_checks, str):
|
|
140
360
|
expectation_list = json.loads(model_quality_checks)
|
|
141
361
|
return expectation_list
|
|
362
|
+
return []
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
from importlib.metadata import version
|
|
4
|
+
|
|
5
|
+
import jinja_partials
|
|
6
|
+
import pytz
|
|
7
|
+
import yaml
|
|
8
|
+
from jinja2 import Environment, PackageLoader, select_autoescape
|
|
9
|
+
from open_data_contract_standard.model import OpenDataContractStandard
|
|
10
|
+
|
|
11
|
+
from datacontract.export.exporter import Exporter
|
|
12
|
+
from datacontract.export.mermaid_exporter import to_mermaid
|
|
13
|
+
from datacontract.model.data_contract_specification import DataContractSpecification
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HtmlExporter(Exporter):
    """Exporter that renders a data contract as an HTML page."""

    def export(self, data_contract, model, server, sql_server_type, export_args) -> str:
        """Render the data contract as HTML.

        Only ``data_contract`` is used; ``model``, ``server``,
        ``sql_server_type`` and ``export_args`` are ignored.

        Returns:
            str: The HTML document returned by ``to_html``.
        """
        return to_html(data_contract)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def to_html(data_contract_spec: DataContractSpecification | OpenDataContractStandard) -> str:
    """Render a data contract as an HTML page.

    The page is rendered from the templates packaged under
    ``datacontract/templates``: ``datacontract_odcs.html`` for an
    ``OpenDataContractStandard`` contract and ``datacontract.html`` otherwise.

    Args:
        data_contract_spec: The contract to render.

    Returns:
        str: The rendered HTML.
    """
    # Load templates from templates folder
    package_loader = PackageLoader("datacontract", "templates")
    # NOTE(review): enabled_extensions is given as a plain string. Jinja's
    # select_autoescape iterates this argument, so the string is presumably
    # split into single characters and template names ending in ".html" would
    # not match - confirm, and consider ("html",) once the templates are known
    # to mark trusted content as safe.
    autoescape = select_autoescape(
        enabled_extensions="html",
        default_for_string=True,
    )
    env = Environment(loader=package_loader, autoescape=autoescape)
    # Set up for partials
    jinja_partials.register_environment(env)

    # Load the required template
    # needs to be included in /MANIFEST.in
    is_odcs = isinstance(data_contract_spec, OpenDataContractStandard)
    template = env.get_template("datacontract_odcs.html" if is_odcs else "datacontract.html")

    # The stylesheet is read as raw text and handed to the template as a value.
    style_content, _, _ = package_loader.get_source(env, "style/output.css")

    # Only DataContractSpecification contracts are inspected for a quality section.
    quality_specification = None
    if isinstance(data_contract_spec, DataContractSpecification):
        quality = data_contract_spec.quality
        if quality is not None:
            if isinstance(quality.specification, str):
                # Already text: use it verbatim.
                quality_specification = quality.specification
            elif quality.type == "great-expectations":
                quality_specification = yaml.dump(quality.specification, sort_keys=False, default_style="|")
            else:
                quality_specification = yaml.dump(quality.specification, sort_keys=False)

    datacontract_yaml = data_contract_spec.to_yaml()

    # Get the mermaid diagram
    mermaid_diagram = to_mermaid(data_contract_spec)

    # Render the template with necessary data
    return template.render(
        datacontract=data_contract_spec,
        quality_specification=quality_specification,
        style=style_content,
        datacontract_yaml=datacontract_yaml,
        formatted_date=_formatted_date(),
        datacontract_cli_version=get_version(),
        mermaid_diagram=mermaid_diagram,
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _formatted_date() -> str:
    """Return the current UTC time as text, e.g. ``05 Mar 2024 14:07:09 UTC``."""
    utc_now = datetime.datetime.now(pytz.timezone("UTC"))
    return utc_now.strftime("%d %b %Y %H:%M:%S UTC")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_version() -> str:
    """Return the installed version of the ``datacontract_cli`` distribution.

    Returns:
        str: The version string, or "" when it cannot be determined.
    """
    try:
        return version("datacontract_cli")
    except Exception as e:
        # Best effort: the version is optional information, so any failure is
        # only logged. The exception needs a %s placeholder; an extra argument
        # without one makes the logging module report a formatting error when
        # debug logging is enabled.
        logging.debug("Ignoring exception: %s", e)
        return ""
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from pyiceberg import types
|
|
2
|
+
from pyiceberg.schema import Schema, assign_fresh_schema_ids
|
|
3
|
+
|
|
4
|
+
from datacontract.export.exporter import Exporter
|
|
5
|
+
from datacontract.model.data_contract_specification import (
|
|
6
|
+
DataContractSpecification,
|
|
7
|
+
Field,
|
|
8
|
+
Model,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class IcebergExporter(Exporter):
    """Exporter that turns one model of a data contract into an Iceberg schema."""

    def export(self, data_contract: DataContractSpecification, model, server, sql_server_type, export_args):
        """Export the given data contract model to an Iceberg schema.

        Args:
            data_contract (DataContractSpecification): The data contract specification.
            model: The model to export, currently just supports one model.
            server: Not used in this implementation.
            sql_server_type: Not used in this implementation.
            export_args: Not used in this implementation.

        Returns:
            str: A string representation of the Iceberg json schema.
        """
        return to_iceberg(data_contract, model)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def to_iceberg(contract: DataContractSpecification, model: str) -> str:
    """Serialize one model of a data contract as an Iceberg JSON schema.

    The JSON string follows https://iceberg.apache.org/spec/#appendix-c-json-serialization.

    Args:
        contract (DataContractSpecification): The data contract specification containing models.
        model: Name of the model to export. ``None`` or "all" selects the
            contract's only model.

    Returns:
        str: A string representation of the Iceberg json schema.

    Raises:
        Exception: If ``model`` is ``None``/"all" and the contract does not
            contain exactly one model, or if the named model does not exist.
    """
    if model is None or model == "all":
        model_count = len(contract.models.items())
        if model_count != 1:
            # Iceberg doesn't have a way to combine multiple models into a single schema, an alternative would be to export json lines
            raise Exception(f"Can only output one model at a time, found {model_count} models")
        (only_model,) = contract.models.values()
        schema = to_iceberg_schema(only_model)
    elif model in contract.models:
        schema = to_iceberg_schema(contract.models[model])
    else:
        raise Exception(f"model {model} not found in contract")

    return schema.model_dump_json()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def to_iceberg_schema(model: Model) -> Schema:
    """Convert a model to an Iceberg schema.

    Every field of the model becomes a top-level schema field. Fields flagged
    as ``primaryKey`` are registered as identifier fields of the schema.

    Args:
        model (Model): The model to convert.

    Returns:
        Schema: The Iceberg schema with freshly assigned field IDs.
    """
    iceberg_fields = []
    primary_keys = set()
    for field_name, spec_field in model.fields.items():
        iceberg_field = make_field(field_name, spec_field)
        iceberg_fields.append(iceberg_field)

        if spec_field.primaryKey:
            primary_keys.add(iceberg_field.name)

    schema = Schema(*iceberg_fields)

    # apply non-0 field IDs so we can set the identifier fields for the schema
    schema = assign_fresh_schema_ids(schema)
    for field in schema.fields:
        if field.name in primary_keys:
            schema.identifier_field_ids.append(field.field_id)

    return schema
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def make_field(field_name, field):
    """Build the Iceberg ``NestedField`` for one contract field.

    The field is marked required only when ``field.required`` is exactly ``True``.
    """
    iceberg_type = get_field_type(field)
    is_required = field.required is True

    # Note: might want to re-populate field_id from config['icebergFieldId'] if it exists, however, it gets
    # complicated since field_ids impact the list and map element_ids, and the importer is not keeping track of those.
    # Even if IDs are re-constituted, it seems like the SDK code would still reset them before any operation against a catalog,
    # so it's likely not worth it.

    # Note 2: field_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values.
    # also, the Iceberg sdk catalog code will re-set the fieldIDs prior to executing any table operations on the schema
    # ref: https://github.com/apache/iceberg-python/pull/1072
    return types.NestedField(field_id=0, name=field_name, field_type=iceberg_type, required=is_required)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def make_list(item):
    """Build the Iceberg ``ListType`` whose elements are described by ``item``.

    Elements are marked required only when ``item.required`` is exactly ``True``.
    """
    element_type = get_field_type(item)
    elements_required = item.required is True

    # element_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field)
    return types.ListType(element_id=0, element_type=element_type, element_required=elements_required)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def make_map(field):
    """Build the Iceberg ``MapType`` for a map field.

    Key and value types are taken from ``field.keys`` and ``field.values``;
    values are marked required only when ``field.values.required`` is exactly ``True``.
    """
    key_type = get_field_type(field.keys)
    value_type = get_field_type(field.values)
    values_required = field.values.required is True

    # key_id and value_id defaults to 0 to signify that the exporter is not attempting to populate meaningful values (see #make_field)
    return types.MapType(
        key_id=0,
        key_type=key_type,
        value_id=0,
        value_type=value_type,
        value_required=values_required,
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def to_struct_type(fields: dict[str, Field]) -> types.StructType:
    """Convert a dictionary of fields to an Iceberg StructType.

    Args:
        fields (dict[str, Field]): The fields to convert, keyed by field name.

    Returns:
        types.StructType: A struct with one nested field per entry, in the
        iteration order of ``fields``.
    """
    members = [make_field(name, spec) for name, spec in fields.items()]
    return types.StructType(*members)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_field_type(field: Field) -> types.IcebergType:
    """Map a contract field to its Iceberg type.

    Container types ("array", "map", "object"/"record"/"struct") are converted
    recursively. Decimal-like types use the field's precision and scale, with
    defaults of 38 and 0. Any type name not listed here falls back to
    ``BinaryType``.

    Args:
        field (Field): The field to convert.

    Returns:
        types.IcebergType: The corresponding Iceberg type.
    """
    declared = field.type
    if declared is None or declared == "null":
        return types.NullType()

    # Container types recurse into their element / member definitions.
    if declared == "array":
        return make_list(field.items)
    if declared == "map":
        return make_map(field)
    if declared in ("object", "record", "struct"):
        return to_struct_type(field.fields)

    if declared in ("number", "decimal", "numeric"):
        precision = 38 if field.precision is None else field.precision
        scale = 0 if field.scale is None else field.scale
        return types.DecimalType(precision=precision, scale=scale)

    # Types that need no parameters; a plain "timestamp" is mapped to the
    # timezone-aware Iceberg type, "timestamp_ntz" to the one without zone.
    simple_types = {
        "string": types.StringType,
        "varchar": types.StringType,
        "text": types.StringType,
        "integer": types.IntegerType,
        "int": types.IntegerType,
        "bigint": types.LongType,
        "long": types.LongType,
        "float": types.FloatType,
        "double": types.DoubleType,
        "boolean": types.BooleanType,
        "timestamp": types.TimestamptzType,
        "timestamp_tz": types.TimestamptzType,
        "timestamp_ntz": types.TimestampType,
        "date": types.DateType,
        "bytes": types.BinaryType,
    }
    # Unknown type names are exported as binary, exactly like "bytes".
    return simple_types.get(declared, types.BinaryType)()
|