datacontract-cli 0.9.6.post2__py3-none-any.whl → 0.9.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacontract-cli might be problematic.
- datacontract/breaking/breaking.py +139 -63
- datacontract/breaking/breaking_rules.py +71 -54
- datacontract/cli.py +138 -45
- datacontract/data_contract.py +316 -78
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +5 -1
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +9 -8
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +26 -22
- datacontract/engines/fastjsonschema/check_jsonschema.py +31 -25
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +8 -6
- datacontract/engines/soda/check_soda_execute.py +46 -35
- datacontract/engines/soda/connections/bigquery.py +5 -3
- datacontract/engines/soda/connections/dask.py +0 -1
- datacontract/engines/soda/connections/databricks.py +2 -2
- datacontract/engines/soda/connections/duckdb.py +4 -4
- datacontract/engines/soda/connections/kafka.py +36 -17
- datacontract/engines/soda/connections/postgres.py +3 -3
- datacontract/engines/soda/connections/snowflake.py +4 -4
- datacontract/export/avro_converter.py +3 -7
- datacontract/export/avro_idl_converter.py +280 -0
- datacontract/export/dbt_converter.py +55 -80
- datacontract/export/great_expectations_converter.py +141 -0
- datacontract/export/jsonschema_converter.py +3 -1
- datacontract/export/odcs_converter.py +10 -12
- datacontract/export/protobuf_converter.py +99 -0
- datacontract/export/pydantic_converter.py +140 -0
- datacontract/export/rdf_converter.py +35 -12
- datacontract/export/sodacl_converter.py +24 -24
- datacontract/export/sql_converter.py +93 -0
- datacontract/export/sql_type_converter.py +131 -0
- datacontract/export/terraform_converter.py +71 -0
- datacontract/imports/avro_importer.py +106 -0
- datacontract/imports/sql_importer.py +0 -2
- datacontract/init/download_datacontract_file.py +2 -2
- datacontract/integration/publish_datamesh_manager.py +4 -9
- datacontract/integration/publish_opentelemetry.py +107 -0
- datacontract/lint/files.py +2 -2
- datacontract/lint/lint.py +46 -31
- datacontract/lint/linters/description_linter.py +34 -0
- datacontract/lint/linters/example_model_linter.py +67 -43
- datacontract/lint/linters/field_pattern_linter.py +34 -0
- datacontract/lint/linters/field_reference_linter.py +38 -0
- datacontract/lint/linters/notice_period_linter.py +55 -0
- datacontract/lint/linters/primary_field_linter.py +28 -0
- datacontract/lint/linters/quality_schema_linter.py +52 -0
- datacontract/lint/linters/valid_constraints_linter.py +99 -0
- datacontract/lint/resolve.py +53 -8
- datacontract/lint/schema.py +2 -3
- datacontract/lint/urls.py +4 -5
- datacontract/model/breaking_change.py +27 -5
- datacontract/model/data_contract_specification.py +45 -25
- datacontract/model/exceptions.py +13 -2
- datacontract/model/run.py +1 -1
- datacontract/web.py +5 -8
- {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/METADATA +207 -35
- datacontract_cli-0.9.8.dist-info/RECORD +63 -0
- {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/WHEEL +1 -1
- datacontract_cli-0.9.6.post2.dist-info/RECORD +0 -47
- {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ datacontract/export/avro_idl_converter.py
@@ -0,0 +1,280 @@
+import typing
+from dataclasses import dataclass
+from enum import Enum
+from io import StringIO
+
+from datacontract.lint.resolve import inline_definitions_into_data_contract
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification, Field
+from datacontract.model.exceptions import DataContractException
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
+class AvroPrimitiveType(Enum):
+    int = "int"
+    long = "long"
+    string = "string"
+    boolean = "boolean"
+    float = "float"
+    double = "double"
+    null = "null"
+    bytes = "bytes"
+
+
+class AvroLogicalType(Enum):
+    decimal = "decimal"
+    date = "date"
+    time_ms = "time_ms"
+    timestamp_ms = "timestamp_ms"
+
+
+@dataclass
+class AvroField:
+    name: str
+    required: bool
+    description: typing.Optional[str]
+
+
+@dataclass
+class AvroPrimitiveField(AvroField):
+    type: typing.Union[AvroPrimitiveType, AvroLogicalType]
+
+
+@dataclass
+class AvroComplexField(AvroField):
+    subfields: list[AvroField]
+
+
+@dataclass
+class AvroArrayField(AvroField):
+    type: AvroField
+
+
+@dataclass
+class AvroModelType:
+    name: str
+    description: typing.Optional[str]
+    fields: list[AvroField]
+
+
+@dataclass
+class AvroIDLProtocol:
+    name: typing.Optional[str]
+    description: typing.Optional[str]
+    model_types: list[AvroModelType]
+
+
+avro_primitive_types = set(
+    [
+        "string",
+        "text",
+        "varchar",
+        "float",
+        "double",
+        "int",
+        "integer",
+        "long",
+        "bigint",
+        "boolean",
+        "timestamp_ntz",
+        "timestamp",
+        "timestamp_tz",
+        "date",
+        "bytes",
+        "null",
+    ]
+)
+
+
+def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
+    result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
+    match field.type:
+        case "string" | "text" | "varchar":
+            result.type = AvroPrimitiveType.string
+        case "float":
+            result.type = AvroPrimitiveType.float
+        case "double":
+            result.type = AvroPrimitiveType.double
+        case "int" | "integer":
+            result.type = AvroPrimitiveType.int
+        case "long" | "bigint":
+            result.type = AvroPrimitiveType.long
+        case "boolean":
+            result.type = AvroPrimitiveType.boolean
+        case "timestamp" | "timestamp_tz":
+            result.type = AvroPrimitiveType.string
+        case "timestamp_ntz":
+            result.type = AvroLogicalType.timestamp_ms
+        case "date":
+            result.type = AvroLogicalType.date
+        case "bytes":
+            result.type = AvroPrimitiveType.bytes
+        case "null":
+            result.type = AvroPrimitiveType.null
+        case _:
+            raise DataContractException(
+                type="general",
+                name="avro-idl-export",
+                model=field,
+                reason="Unknown field type {field.type}",
+                result="failed",
+                message="Avro IDL type conversion failed.",
+            )
+    return result
+
+
+def _to_avro_idl_type(field_name: str, field: Field) -> AvroField:
+    if field.type in avro_primitive_types:
+        return _to_avro_primitive_logical_type(field_name, field)
+    else:
+        match field.type:
+            case "array":
+                return AvroArrayField(
+                    field_name, field.required, field.description, _to_avro_idl_type(field_name, field.items)
+                )
+            case "object" | "record" | "struct":
+                return AvroComplexField(
+                    field_name,
+                    field.required,
+                    field.description,
+                    [_to_avro_idl_type(field_name, field) for (field_name, field) in field.fields.items()],
+                )
+            case _:
+                raise DataContractException(
+                    type="general",
+                    name="avro-idl-export",
+                    model=type,
+                    reason="Unknown Data Contract field type",
+                    result="failed",
+                    message="Avro IDL type conversion failed.",
+                )
+
+
+def _generate_field_types(contract: DataContractSpecification) -> list[AvroField]:
+    result = []
+    for _, model in contract.models.items():
+        for field_name, field in model.fields.items():
+            result.append(_to_avro_idl_type(field_name, field))
+    return result
+
+
+def generate_model_types(contract: DataContractSpecification) -> list[AvroModelType]:
+    result = []
+    for model_name, model in contract.models.items():
+        result.append(
+            AvroModelType(name=model_name, description=model.description, fields=_generate_field_types(contract))
+        )
+    return result
+
+
+def _model_name_to_identifier(model_name: str):
+    return "".join([word.title() for word in model_name.split()])
+
+
+def _contract_to_avro_idl_ir(contract: DataContractSpecification) -> AvroIDLProtocol:
+    """Convert models into an intermediate representation for later serialization into Avro IDL.
+
+    Each model is converted to a record containing a field for each model field.
+    """
+    inlined_contract = contract.model_copy()
+    inline_definitions_into_data_contract(inlined_contract)
+    protocol_name = _model_name_to_identifier(contract.info.title) if contract.info and contract.info.title else None
+    description = contract.info.description if contract.info and contract.info.description else None
+    return AvroIDLProtocol(
+        name=protocol_name, description=description, model_types=generate_model_types(inlined_contract)
+    )
+
+
+def _write_indent(indent: int, stream: typing.TextIO):
+    stream.write("    " * indent)
+
+
+def _write_field_description(field: AvroField, indent: int, stream: typing.TextIO):
+    if field.description:
+        _write_indent(indent, stream)
+        stream.write(f"/** {field.description} */\n")
+
+
+def _write_field_type_definition(field: AvroField, indent: int, stream: typing.TextIO) -> str:
+    # Write any extra information (such as record type definition) and return
+    # the name of the generated type. Writes descriptions only for record
+    # types. This leads to record types being described twice, once on the
+    # record definition, and once on use. The alternative (detect when the
+    # complex field type is not used in an array or another complex type) is
+    # significantly more complex to implement.
+    match field:
+        case AvroPrimitiveField(name, required, _, typ) if required is True:
+            return typ.value
+        case AvroPrimitiveField(name, required, _, typ):
+            return typ.value + "?"
+        case AvroComplexField(name, required, _, subfields):
+            _write_field_description(field, indent, stream)
+            _write_indent(indent, stream)
+            stream.write(f"record {name}_type {{\n")
+            subfield_types = []
+            # Recursively define records for all subfields if necessary
+            for subfield in subfields:
+                subfield_types.append(_write_field_type_definition(subfield, indent + 1, stream))
+            # Reference all defined record types.
+            for field, subfield_type in zip(field.subfields, subfield_types):
+                _write_field_description(field, indent + 1, stream)
+                _write_indent(indent + 1, stream)
+                stream.write(f"{subfield_type} {field.name};\n")
+            _write_indent(indent, stream)
+            stream.write("}\n")
+            if required is True:
+                return f"{name}_type"
+            else:
+                return f"{name}_type?"
+        case AvroArrayField(name, required, _, item_type):
+            subfield_type = _write_field_type_definition(item_type, indent, stream)
+            if required is True:
+                return f"array<{subfield_type}>"
+            else:
+                return f"array<{subfield_type}>?"
+        case _:
+            raise RuntimeError("Unknown Avro field type {field}")
+
+
+def _write_field(field: AvroField, indent, stream: typing.TextIO):
+    # Start of recursion.
+    typename = _write_field_type_definition(field, indent, stream)
+    _write_field_description(field, indent, stream)
+    _write_indent(indent, stream)
+    stream.write(f"{typename} {field.name};\n")
+
+
+def _write_model_type(model: AvroModelType, stream: typing.TextIO):
+    # Called once for each model
+    if model.description:
+        _write_indent(1, stream)
+        stream.write(f"/** {model.description} */\n")
+    _write_indent(1, stream)
+    stream.write(f"record {model.name} {{\n")
+    # Called for each model field
+    for field in model.fields:
+        _write_field(field, 2, stream)
+    _write_indent(1, stream)
+    stream.write("}\n")
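
For orientation, here is a minimal sketch (not part of the release) of driving the new exporter from Python. It assumes the pydantic classes Info, Model and Field in datacontract.model.data_contract_specification accept keyword construction as shown; all contract names and values below are purely illustrative.

from datacontract.export.avro_idl_converter import to_avro_idl
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Illustrative contract; keyword construction of Info/Model/Field is an
# assumption about the pydantic models, not something shown in this diff.
contract = DataContractSpecification(
    info=Info(title="orders latest", description="Order events"),
    models={
        "orders": Model(
            description="One record per order",
            fields={
                "order_id": Field(type="string", required=True),
                "order_total": Field(type="long", required=False),
            },
        )
    },
)

# Expected shape of the output: a protocol named after the title
# ("OrdersLatest") with one record per model; optional fields are rendered
# with a trailing "?".
print(to_avro_idl(contract))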
--- datacontract/export/dbt_converter.py
+++ datacontract/export/dbt_converter.py
@@ -2,14 +2,11 @@ from typing import Dict
 
 import yaml
 
+from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import \
     DataContractSpecification, Model, Field
 
 
-# snowflake data types:
-# https://docs.snowflake.com/en/sql-reference/data-types.html
-
-
 def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
     dbt = {
         "version": 2,
@@ -18,18 +15,17 @@ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
     for model_key, model_value in data_contract_spec.models.items():
         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec)
         dbt["models"].append(dbt_model)
-    return yaml.dump(dbt, indent=2, sort_keys=False)
+    return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
-def to_dbt_staging_sql(data_contract_spec: DataContractSpecification):
+def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
     if data_contract_spec.models is None or len(data_contract_spec.models.items()) != 1:
-        print(
+        print("Export to dbt-staging-sql currently only works with exactly one model in the data contract.")
         return ""
 
     id = data_contract_spec.id
-    model_name, model = next(iter(data_contract_spec.models.items()))
     columns = []
-    for field_name, field in
+    for field_name, field in model_value.fields.items():
         # TODO escape SQL reserved key words, probably dependent on server type
         columns.append(field_name)
     return f"""
@@ -40,15 +36,10 @@ def to_dbt_staging_sql(data_contract_spec: DataContractSpecification):
 
 
 def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
-    source = {
-        "name": data_contract_spec.id,
-        "tables": []
-    }
+    source = {"name": data_contract_spec.id, "tables": []}
     dbt = {
         "version": 2,
-        "sources": [
-            source
-        ],
+        "sources": [source],
     }
     if data_contract_spec.info.owner is not None:
         source["meta"] = {"owner": data_contract_spec.info.owner}
@@ -62,7 +53,7 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
     for model_key, model_value in data_contract_spec.models.items():
         dbt_model = _to_dbt_source_table(model_key, model_value)
         source["tables"].append(dbt_model)
-    return yaml.dump(dbt, indent=2, sort_keys=False)
+    return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
 def _to_dbt_source_table(model_key, model_value: Model) -> dict:
@@ -83,20 +74,14 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
         "name": model_key,
     }
     model_type = _to_dbt_model_type(model_value.type)
-    dbt_model["config"] = {
-        "meta": {
-            "data_contract": data_contract_spec.id
-        }
-    }
+    dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}}
     dbt_model["config"]["materialized"] = model_type
 
     if data_contract_spec.info.owner is not None:
         dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner
 
     if _supports_constraints(model_type):
-        dbt_model["config"]["contract"] = {
-            "enforced": True
-        }
+        dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
     columns = _to_columns(model_value.fields, _supports_constraints(model_type), True)
@@ -133,14 +118,14 @@ def _to_columns(fields: Dict[str, Field], supports_constraints: bool, supports_d
 
 def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool) -> dict:
     column = {}
-    dbt_type =
+    dbt_type = convert_to_sql_type(field, "snowflake")
     if dbt_type is not None:
         if supports_datatype:
             column["data_type"] = dbt_type
         else:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {
-
+                {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
+            )
     if field.description is not None:
         column["description"] = field.description
     if field.required:
@@ -162,7 +147,8 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         if field.maxLength is not None:
             length_test["max_value"] = field.maxLength
         column.setdefault("tests", []).append(
-            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
+            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
+        )
     if field.pii is not None:
         column.setdefault("meta", {})["pii"] = field.pii
     if field.classification is not None:
@@ -172,71 +158,60 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
     if field.pattern is not None:
         # Beware, the data contract pattern is a regex, not a like pattern
         column.setdefault("tests", []).append(
-            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
-
+            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
+        )
+    if (
+        field.minimum is not None
+        or field.maximum is not None
+        and field.exclusiveMinimum is None
+        and field.exclusiveMaximum is None
+    ):
         range_test = {}
         if field.minimum is not None:
            range_test["min_value"] = field.minimum
        if field.maximum is not None:
            range_test["max_value"] = field.maximum
        column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
-    elif
+    elif (
+        field.exclusiveMinimum is not None
+        or field.exclusiveMaximum is not None
+        and field.minimum is None
+        and field.maximum is None
+    ):
         range_test = {}
-        if field.
-            range_test["min_value"] = field.
-        if field.
-            range_test["max_value"] = field.
+        if field.exclusiveMinimum is not None:
+            range_test["min_value"] = field.exclusiveMinimum
+        if field.exclusiveMaximum is not None:
+            range_test["max_value"] = field.exclusiveMaximum
         range_test["strictly"] = True
         column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     else:
         if field.minimum is not None:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
+                {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
+            )
         if field.maximum is not None:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
-
-
-
-
-
-
+                {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
+            )
+        if field.exclusiveMinimum is not None:
+            column.setdefault("tests", []).append(
+                {
+                    "dbt_expectations.expect_column_values_to_be_between": {
+                        "min_value": field.exclusiveMinimum,
+                        "strictly": True,
+                    }
+                }
+            )
+        if field.exclusiveMaximum is not None:
+            column.setdefault("tests", []).append(
+                {
+                    "dbt_expectations.expect_column_values_to_be_between": {
+                        "max_value": field.exclusiveMaximum,
+                        "strictly": True,
+                    }
+                }
+            )
 
     # TODO: all constraints
     return column
-
-
-def _convert_type_to_snowflake(type) -> None | str:
-    # currently optimized for snowflake
-    # LEARNING: data contract has no direct support for CHAR,CHARACTER
-    # LEARNING: data contract has no support for "date-time", "datetime", "time"
-    # LEARNING: No precision and scale support in data contract
-    # LEARNING: no support for any
-    # GEOGRAPHY and GEOMETRY are not supported by the mapping
-    if type is None:
-        return None
-    if type.lower() in ["string", "varchar", "text"]:
-        return type.upper()  # STRING, TEXT, VARCHAR are all the same in snowflake
-    if type.lower() in ["timestamp", "timestamp_tz"]:
-        return "TIMESTAMP_TZ"
-    if type.lower() in ["timestamp_ntz"]:
-        return "TIMESTAMP_NTZ"
-    if type.lower() in ["date"]:
-        return "DATE"
-    if type.lower() in ["time"]:
-        return "TIME"
-    if type.lower() in ["number", "decimal", "numeric"]:
-        return "NUMBER"  # precision and scale not supported by data contract
-    if type.lower() in ["float", "double"]:
-        return "FLOAT"
-    if type.lower() in ["integer", "int", "long", "bigint"]:
-        return "NUMBER"  # always NUMBER(38,0)
-    if type.lower() in ["boolean"]:
-        return "BOOLEAN"
-    if type.lower() in ["object", "record", "struct"]:
-        return "OBJECT"
-    if type.lower() in ["bytes"]:
-        return "BINARY"
-    if type.lower() in ["array"]:
-        return "ARRAY"
-    return None
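
The dbt exporter now delegates type mapping to the new datacontract/export/sql_type_converter.py and requires callers of to_dbt_staging_sql to pass the model explicitly. Below is a hedged usage sketch under the assumption that the pydantic classes Info, Model and Field accept keyword construction; all names and values are illustrative, not taken from the release.

from datacontract.export.dbt_converter import to_dbt_models_yaml, to_dbt_staging_sql
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Illustrative single-model contract (constructor usage is an assumption).
spec = DataContractSpecification(
    id="orders-contract",
    info=Info(title="Orders", version="1.0.0", owner="checkout-team"),
    models={
        "orders": Model(
            type="table",
            description="One row per order",
            fields={
                "order_id": Field(type="string", required=True, unique=True),
                "amount": Field(type="decimal", minimum=0),
            },
        )
    },
)

# dbt models.yml; note the dump now uses allow_unicode=True.
print(to_dbt_models_yaml(spec))

# to_dbt_staging_sql now takes the model explicitly instead of unpacking the
# single model itself, so callers select it first:
model_name, model_value = next(iter(spec.models.items()))
print(to_dbt_staging_sql(spec, model_name, model_value))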
--- /dev/null
+++ datacontract/export/great_expectations_converter.py
@@ -0,0 +1,141 @@
+import json
+from typing import Dict, List, Any
+
+import yaml
+
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification, Field, Quality
+
+
+def to_great_expectations(data_contract_spec: DataContractSpecification, model_key: str) -> str:
+    """
+    Convert each model in the contract to a Great Expectation suite
+    @param data_contract_spec: data contract to export to great expectations
+    @param model_key: model to great expectations to
+    @return: a dictionary of great expectation suites
+    """
+    expectations = []
+    model_value = data_contract_spec.models.get(model_key)
+    quality_checks = get_quality_checks(data_contract_spec.quality)
+    expectations.extend(model_to_expectations(model_value.fields))
+    expectations.extend(checks_to_expectations(quality_checks, model_key))
+    model_expectation_suite = to_suite(model_key, data_contract_spec.info.version, expectations)
+
+    return model_expectation_suite
+
+
+def to_suite(
+    model_key: str,
+    contract_version: str,
+    expectations: List[Dict[str, Any]],
+) -> str:
+    return json.dumps(
+        {
+            "data_asset_type": "null",
+            "expectation_suite_name": "user-defined.{model_key}.{contract_version}".format(
+                model_key=model_key, contract_version=contract_version
+            ),
+            "expectations": expectations,
+            "meta": {},
+        },
+        indent=2,
+    )
+
+
+def model_to_expectations(fields: Dict[str, Field]) -> List[Dict[str, Any]]:
+    """
+    Convert the model information to expectations
+    @param fields: model field
+    @return: list of expectations
+    """
+    expectations = []
+    add_column_order_exp(fields, expectations)
+    for field_name, field in fields.items():
+        add_field_expectations(field_name, field, expectations)
+    return expectations
+
+
+def add_field_expectations(field_name, field: Field, expectations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    if field.type is not None:
+        expectations.append(to_column_types_exp(field_name, field.type))
+    if field.unique is not None:
+        expectations.append(to_column_unique_exp(field_name))
+    if field.maxLength is not None or field.minLength is not None:
+        expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength))
+    if field.minimum is not None or field.maximum is not None:
+        expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))
+
+    # TODO: all constraints
+    return expectations
+
+
+def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str, Any]]):
+    expectations.append(
+        {
+            "expectation_type": "expect_table_columns_to_match_ordered_list",
+            "kwargs": {"column_list": list(fields.keys())},
+            "meta": {},
+        }
+    )
+
+
+def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_values_to_be_of_type",
+        "kwargs": {"column": field_name, "type_": field_type},
+        "meta": {},
+    }
+
+
+def to_column_unique_exp(field_name) -> Dict[str, Any]:
+    return {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": field_name}, "meta": {}}
+
+
+def to_column_length_exp(field_name, min_length, max_length) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_value_lengths_to_be_between",
+        "kwargs": {"column": field_name, "min_value": min_length, "max_value": max_length},
+        "meta": {},
+    }
+
+
+def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_values_to_be_between",
+        "kwargs": {"column": field_name, "min_value": minimum, "max_value": maximum},
+        "meta": {},
+    }
+
+
+def get_quality_checks(quality: Quality) -> Dict[str, Any]:
+    if quality is None:
+        return {}
+    if quality.type is None:
+        return {}
+    if quality.type.lower() != "great-expectations":
+        return {}
+    if isinstance(quality.specification, str):
+        quality_specification = yaml.safe_load(quality.specification)
+    else:
+        quality_specification = quality.specification
+    return quality_specification
+
+
+def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
+    """
+    Get the quality definition for each model to the model expectation list
+    @param quality_checks: dictionary of quality checks by model
+    @param model_key: id of the model
+    @return: the list of expectations for that model
+    """
+    if quality_checks is None or model_key not in quality_checks:
+        return []
+
+    model_quality_checks = quality_checks[model_key]
+
+    if model_quality_checks is None:
+        return []
+
+    if isinstance(model_quality_checks, str):
+        expectation_list = json.loads(model_quality_checks)
+        return expectation_list
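
A hedged sketch of calling the new Great Expectations exporter follows; keyword construction of the pydantic classes and every field constraint shown are assumptions for illustration only.

from datacontract.export.great_expectations_converter import to_great_expectations
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Info,
    Model,
)

# Illustrative contract with one model and a couple of constraints.
spec = DataContractSpecification(
    info=Info(title="Orders", version="1.0.0"),
    models={
        "orders": Model(
            fields={
                "order_id": Field(type="string", required=True, unique=True),
                "amount": Field(type="double", minimum=0, maximum=1000000),
            },
        )
    },
)

# Returns a JSON string for a suite named "user-defined.orders.1.0.0" with a
# column-order expectation plus type/unique/min-max expectations per field.
print(to_great_expectations(spec, "orders"))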
--- datacontract/export/jsonschema_converter.py
+++ datacontract/export/jsonschema_converter.py
@@ -12,16 +12,18 @@ def to_jsonschemas(data_contract_spec: DataContractSpecification):
         jsonschmemas[model_key] = jsonschema
     return jsonschmemas
 
+
 def to_jsonschema_json(model_key, model_value: Model) -> str:
     jsonschema = to_jsonschema(model_key, model_value)
     return json.dumps(jsonschema, indent=2)
 
+
 def to_jsonschema(model_key, model_value: Model) -> dict:
     return {
         "$schema": "http://json-schema.org/draft-07/schema#",
         "type": "object",
         "properties": to_properties(model_value.fields),
-        "required": to_required(model_value.fields)
+        "required": to_required(model_value.fields),
     }
 
 
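
The JSON Schema exporter's behavior is unchanged by this diff (only blank lines and a trailing comma were adjusted); for reference, a small illustrative call, assuming Model and Field accept keyword construction:

from datacontract.export.jsonschema_converter import to_jsonschema_json
from datacontract.model.data_contract_specification import Field, Model

# Illustrative model; the field names are hypothetical.
model = Model(
    fields={
        "order_id": Field(type="string", required=True),
        "note": Field(type="string", required=False),
    }
)

# Produces a draft-07 object schema with "properties" for both fields; the
# "required" list is expected to contain only the required field.
print(to_jsonschema_json("orders", model))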