datacontract-cli 0.9.6.post2__py3-none-any.whl → 0.9.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli might be problematic.

Files changed (60)
  1. datacontract/breaking/breaking.py +139 -63
  2. datacontract/breaking/breaking_rules.py +71 -54
  3. datacontract/cli.py +138 -45
  4. datacontract/data_contract.py +316 -78
  5. datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +5 -1
  6. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +9 -8
  7. datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +26 -22
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +31 -25
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +8 -6
  10. datacontract/engines/soda/check_soda_execute.py +46 -35
  11. datacontract/engines/soda/connections/bigquery.py +5 -3
  12. datacontract/engines/soda/connections/dask.py +0 -1
  13. datacontract/engines/soda/connections/databricks.py +2 -2
  14. datacontract/engines/soda/connections/duckdb.py +4 -4
  15. datacontract/engines/soda/connections/kafka.py +36 -17
  16. datacontract/engines/soda/connections/postgres.py +3 -3
  17. datacontract/engines/soda/connections/snowflake.py +4 -4
  18. datacontract/export/avro_converter.py +3 -7
  19. datacontract/export/avro_idl_converter.py +280 -0
  20. datacontract/export/dbt_converter.py +55 -80
  21. datacontract/export/great_expectations_converter.py +141 -0
  22. datacontract/export/jsonschema_converter.py +3 -1
  23. datacontract/export/odcs_converter.py +10 -12
  24. datacontract/export/protobuf_converter.py +99 -0
  25. datacontract/export/pydantic_converter.py +140 -0
  26. datacontract/export/rdf_converter.py +35 -12
  27. datacontract/export/sodacl_converter.py +24 -24
  28. datacontract/export/sql_converter.py +93 -0
  29. datacontract/export/sql_type_converter.py +131 -0
  30. datacontract/export/terraform_converter.py +71 -0
  31. datacontract/imports/avro_importer.py +106 -0
  32. datacontract/imports/sql_importer.py +0 -2
  33. datacontract/init/download_datacontract_file.py +2 -2
  34. datacontract/integration/publish_datamesh_manager.py +4 -9
  35. datacontract/integration/publish_opentelemetry.py +107 -0
  36. datacontract/lint/files.py +2 -2
  37. datacontract/lint/lint.py +46 -31
  38. datacontract/lint/linters/description_linter.py +34 -0
  39. datacontract/lint/linters/example_model_linter.py +67 -43
  40. datacontract/lint/linters/field_pattern_linter.py +34 -0
  41. datacontract/lint/linters/field_reference_linter.py +38 -0
  42. datacontract/lint/linters/notice_period_linter.py +55 -0
  43. datacontract/lint/linters/primary_field_linter.py +28 -0
  44. datacontract/lint/linters/quality_schema_linter.py +52 -0
  45. datacontract/lint/linters/valid_constraints_linter.py +99 -0
  46. datacontract/lint/resolve.py +53 -8
  47. datacontract/lint/schema.py +2 -3
  48. datacontract/lint/urls.py +4 -5
  49. datacontract/model/breaking_change.py +27 -5
  50. datacontract/model/data_contract_specification.py +45 -25
  51. datacontract/model/exceptions.py +13 -2
  52. datacontract/model/run.py +1 -1
  53. datacontract/web.py +5 -8
  54. {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/METADATA +207 -35
  55. datacontract_cli-0.9.8.dist-info/RECORD +63 -0
  56. {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/WHEEL +1 -1
  57. datacontract_cli-0.9.6.post2.dist-info/RECORD +0 -47
  58. {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/LICENSE +0 -0
  59. {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/entry_points.txt +0 -0
  60. {datacontract_cli-0.9.6.post2.dist-info → datacontract_cli-0.9.8.dist-info}/top_level.txt +0 -0
datacontract/export/avro_idl_converter.py
@@ -0,0 +1,280 @@
+import typing
+from dataclasses import dataclass
+from enum import Enum
+from io import StringIO
+
+from datacontract.lint.resolve import inline_definitions_into_data_contract
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification, Field
+from datacontract.model.exceptions import DataContractException
+
+
+def to_avro_idl(contract: DataContractSpecification) -> str:
+    """Serialize the provided data contract specification into an Avro IDL string.
+
+    The data contract will be serialized as a protocol, with one record type
+    for each contained model. Model fields are mapped one-to-one to Avro IDL
+    record fields.
+    """
+    stream = StringIO()
+    to_avro_idl_stream(contract, stream)
+    return stream.getvalue()
+
+
+def to_avro_idl_stream(contract: DataContractSpecification, stream: typing.TextIO):
+    """Serialize the provided data contract specification into Avro IDL."""
+    ir = _contract_to_avro_idl_ir(contract)
+    if ir.description:
+        stream.write(f"/** {contract.info.description} */\n")
+    stream.write(f"protocol {ir.name or 'Unnamed'} {{\n")
+    for model_type in ir.model_types:
+        _write_model_type(model_type, stream)
+    stream.write("}\n")
+
+
+class AvroPrimitiveType(Enum):
+    int = "int"
+    long = "long"
+    string = "string"
+    boolean = "boolean"
+    float = "float"
+    double = "double"
+    null = "null"
+    bytes = "bytes"
+
+
+class AvroLogicalType(Enum):
+    decimal = "decimal"
+    date = "date"
+    time_ms = "time_ms"
+    timestamp_ms = "timestamp_ms"
+
+
+@dataclass
+class AvroField:
+    name: str
+    required: bool
+    description: typing.Optional[str]
+
+
+@dataclass
+class AvroPrimitiveField(AvroField):
+    type: typing.Union[AvroPrimitiveType, AvroLogicalType]
+
+
+@dataclass
+class AvroComplexField(AvroField):
+    subfields: list[AvroField]
+
+
+@dataclass
+class AvroArrayField(AvroField):
+    type: AvroField
+
+
+@dataclass
+class AvroModelType:
+    name: str
+    description: typing.Optional[str]
+    fields: list[AvroField]
+
+
+@dataclass
+class AvroIDLProtocol:
+    name: typing.Optional[str]
+    description: typing.Optional[str]
+    model_types: list[AvroModelType]
+
+
+avro_primitive_types = set(
+    [
+        "string",
+        "text",
+        "varchar",
+        "float",
+        "double",
+        "int",
+        "integer",
+        "long",
+        "bigint",
+        "boolean",
+        "timestamp_ntz",
+        "timestamp",
+        "timestamp_tz",
+        "date",
+        "bytes",
+        "null",
+    ]
+)
+
+
+def _to_avro_primitive_logical_type(field_name: str, field: Field) -> AvroPrimitiveField:
+    result = AvroPrimitiveField(field_name, field.required, field.description, AvroPrimitiveType.string)
+    match field.type:
+        case "string" | "text" | "varchar":
+            result.type = AvroPrimitiveType.string
+        case "float":
+            result.type = AvroPrimitiveType.float
+        case "double":
+            result.type = AvroPrimitiveType.double
+        case "int" | "integer":
+            result.type = AvroPrimitiveType.int
+        case "long" | "bigint":
+            result.type = AvroPrimitiveType.long
+        case "boolean":
+            result.type = AvroPrimitiveType.boolean
+        case "timestamp" | "timestamp_tz":
+            result.type = AvroPrimitiveType.string
+        case "timestamp_ntz":
+            result.type = AvroLogicalType.timestamp_ms
+        case "date":
+            result.type = AvroLogicalType.date
+        case "bytes":
+            result.type = AvroPrimitiveType.bytes
+        case "null":
+            result.type = AvroPrimitiveType.null
+        case _:
+            raise DataContractException(
+                type="general",
+                name="avro-idl-export",
+                model=field,
+                reason="Unknown field type {field.type}",
+                result="failed",
+                message="Avro IDL type conversion failed.",
+            )
+    return result
+
+
+def _to_avro_idl_type(field_name: str, field: Field) -> AvroField:
+    if field.type in avro_primitive_types:
+        return _to_avro_primitive_logical_type(field_name, field)
+    else:
+        match field.type:
+            case "array":
+                return AvroArrayField(
+                    field_name, field.required, field.description, _to_avro_idl_type(field_name, field.items)
+                )
+            case "object" | "record" | "struct":
+                return AvroComplexField(
+                    field_name,
+                    field.required,
+                    field.description,
+                    [_to_avro_idl_type(field_name, field) for (field_name, field) in field.fields.items()],
+                )
+            case _:
+                raise DataContractException(
+                    type="general",
+                    name="avro-idl-export",
+                    model=type,
+                    reason="Unknown Data Contract field type",
+                    result="failed",
+                    message="Avro IDL type conversion failed.",
+                )
+
+
+def _generate_field_types(contract: DataContractSpecification) -> list[AvroField]:
+    result = []
+    for _, model in contract.models.items():
+        for field_name, field in model.fields.items():
+            result.append(_to_avro_idl_type(field_name, field))
+    return result
+
+
+def generate_model_types(contract: DataContractSpecification) -> list[AvroModelType]:
+    result = []
+    for model_name, model in contract.models.items():
+        result.append(
+            AvroModelType(name=model_name, description=model.description, fields=_generate_field_types(contract))
+        )
+    return result
+
+
+def _model_name_to_identifier(model_name: str):
+    return "".join([word.title() for word in model_name.split()])
+
+
+def _contract_to_avro_idl_ir(contract: DataContractSpecification) -> AvroIDLProtocol:
+    """Convert models into an intermediate representation for later serialization into Avro IDL.
+
+    Each model is converted to a record containing a field for each model field.
+    """
+    inlined_contract = contract.model_copy()
+    inline_definitions_into_data_contract(inlined_contract)
+    protocol_name = _model_name_to_identifier(contract.info.title) if contract.info and contract.info.title else None
+    description = contract.info.description if contract.info and contract.info.description else None
+    return AvroIDLProtocol(
+        name=protocol_name, description=description, model_types=generate_model_types(inlined_contract)
+    )
+
+
+def _write_indent(indent: int, stream: typing.TextIO):
+    stream.write(" " * indent)
+
+
+def _write_field_description(field: AvroField, indent: int, stream: typing.TextIO):
+    if field.description:
+        _write_indent(indent, stream)
+        stream.write(f"/** {field.description} */\n")
+
+
+def _write_field_type_definition(field: AvroField, indent: int, stream: typing.TextIO) -> str:
+    # Write any extra information (such as record type definition) and return
+    # the name of the generated type. Writes descriptions only for record
+    # types. This leads to record types being described twice, once on the
+    # record definition, and once on use. The alternative (detect when the
+    # complex field type is not used in an array or another complex type) is
+    # significantly more complex to implement.
+    match field:
+        case AvroPrimitiveField(name, required, _, typ) if required is True:
+            return typ.value
+        case AvroPrimitiveField(name, required, _, typ):
+            return typ.value + "?"
+        case AvroComplexField(name, required, _, subfields):
+            _write_field_description(field, indent, stream)
+            _write_indent(indent, stream)
+            stream.write(f"record {name}_type {{\n")
+            subfield_types = []
+            # Recursively define records for all subfields if necessary
+            for subfield in subfields:
+                subfield_types.append(_write_field_type_definition(subfield, indent + 1, stream))
+            # Reference all defined record types.
+            for field, subfield_type in zip(field.subfields, subfield_types):
+                _write_field_description(field, indent + 1, stream)
+                _write_indent(indent + 1, stream)
+                stream.write(f"{subfield_type} {field.name};\n")
+            _write_indent(indent, stream)
+            stream.write("}\n")
+            if required is True:
+                return f"{name}_type"
+            else:
+                return f"{name}_type?"
+        case AvroArrayField(name, required, _, item_type):
+            subfield_type = _write_field_type_definition(item_type, indent, stream)
+            if required is True:
+                return f"array<{subfield_type}>"
+            else:
+                return f"array<{subfield_type}>?"
+        case _:
+            raise RuntimeError("Unknown Avro field type {field}")
+
+
+def _write_field(field: AvroField, indent, stream: typing.TextIO):
+    # Start of recursion.
+    typename = _write_field_type_definition(field, indent, stream)
+    _write_field_description(field, indent, stream)
+    _write_indent(indent, stream)
+    stream.write(f"{typename} {field.name};\n")
+
+
+def _write_model_type(model: AvroModelType, stream: typing.TextIO):
+    # Called once for each model
+    if model.description:
+        _write_indent(1, stream)
+        stream.write(f"/** {model.description} */\n")
+    _write_indent(1, stream)
+    stream.write(f"record {model.name} {{\n")
+    # Called for each model field
+    for field in model.fields:
+        _write_field(field, 2, stream)
+    _write_indent(1, stream)
+    stream.write("}\n")
datacontract/export/dbt_converter.py
@@ -2,14 +2,11 @@ from typing import Dict
 
 import yaml
 
+from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import \
     DataContractSpecification, Model, Field
 
 
-# snowflake data types:
-# https://docs.snowflake.com/en/sql-reference/data-types.html
-
-
 def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
     dbt = {
         "version": 2,
@@ -18,18 +15,17 @@ def to_dbt_models_yaml(data_contract_spec: DataContractSpecification):
     for model_key, model_value in data_contract_spec.models.items():
         dbt_model = _to_dbt_model(model_key, model_value, data_contract_spec)
         dbt["models"].append(dbt_model)
-    return yaml.dump(dbt, indent=2, sort_keys=False)
+    return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
-def to_dbt_staging_sql(data_contract_spec: DataContractSpecification):
+def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name: str, model_value: Model) -> str:
     if data_contract_spec.models is None or len(data_contract_spec.models.items()) != 1:
-        print(f"Export to dbt-staging-sql currently only works with exactly one model in the data contract.")
+        print("Export to dbt-staging-sql currently only works with exactly one model in the data contract.")
         return ""
 
     id = data_contract_spec.id
-    model_name, model = next(iter(data_contract_spec.models.items()))
     columns = []
-    for field_name, field in model.fields.items():
+    for field_name, field in model_value.fields.items():
         # TODO escape SQL reserved key words, probably dependent on server type
         columns.append(field_name)
     return f"""
@@ -40,15 +36,10 @@
 
 
 def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
-    source = {
-        "name": data_contract_spec.id,
-        "tables": []
-    }
+    source = {"name": data_contract_spec.id, "tables": []}
     dbt = {
         "version": 2,
-        "sources": [
-            source
-        ],
+        "sources": [source],
     }
     if data_contract_spec.info.owner is not None:
         source["meta"] = {"owner": data_contract_spec.info.owner}
@@ -62,7 +53,7 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
     for model_key, model_value in data_contract_spec.models.items():
         dbt_model = _to_dbt_source_table(model_key, model_value)
         source["tables"].append(dbt_model)
-    return yaml.dump(dbt, indent=2, sort_keys=False)
+    return yaml.dump(dbt, indent=2, sort_keys=False, allow_unicode=True)
 
 
 def _to_dbt_source_table(model_key, model_value: Model) -> dict:
@@ -83,20 +74,14 @@ def _to_dbt_model(model_key, model_value: Model, data_contract_spec: DataContrac
         "name": model_key,
     }
     model_type = _to_dbt_model_type(model_value.type)
-    dbt_model["config"] = {
-        "meta": {
-            "data_contract": data_contract_spec.id
-        }
-    }
+    dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}}
     dbt_model["config"]["materialized"] = model_type
 
     if data_contract_spec.info.owner is not None:
         dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner
 
     if _supports_constraints(model_type):
-        dbt_model["config"]["contract"] = {
-            "enforced": True
-        }
+        dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description
     columns = _to_columns(model_value.fields, _supports_constraints(model_type), True)
@@ -133,14 +118,14 @@ def _to_columns(fields: Dict[str, Field], supports_constraints: bool, supports_d
 
 def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool) -> dict:
     column = {}
-    dbt_type = _convert_type_to_snowflake(field.type)
+    dbt_type = convert_to_sql_type(field, "snowflake")
     if dbt_type is not None:
         if supports_datatype:
             column["data_type"] = dbt_type
         else:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {
-                    "column_type": dbt_type}})
+                {"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
+            )
     if field.description is not None:
         column["description"] = field.description
     if field.required:
@@ -162,7 +147,8 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
         if field.maxLength is not None:
             length_test["max_value"] = field.maxLength
         column.setdefault("tests", []).append(
-            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test})
+            {"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
+        )
     if field.pii is not None:
         column.setdefault("meta", {})["pii"] = field.pii
     if field.classification is not None:
@@ -172,71 +158,60 @@ def _to_column(field: Field, supports_constraints: bool, supports_datatype: bool
     if field.pattern is not None:
         # Beware, the data contract pattern is a regex, not a like pattern
        column.setdefault("tests", []).append(
-            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}})
-    if field.minimum is not None or field.maximum is not None and field.minimumExclusive is None and field.maximumExclusive is None:
+            {"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
+        )
+    if (
+        field.minimum is not None
+        or field.maximum is not None
+        and field.exclusiveMinimum is None
+        and field.exclusiveMaximum is None
+    ):
         range_test = {}
         if field.minimum is not None:
             range_test["min_value"] = field.minimum
         if field.maximum is not None:
             range_test["max_value"] = field.maximum
         column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
-    elif field.minimumExclusive is not None or field.maximumExclusive is not None and field.minimum is None and field.maximum is None:
+    elif (
+        field.exclusiveMinimum is not None
+        or field.exclusiveMaximum is not None
+        and field.minimum is None
+        and field.maximum is None
+    ):
         range_test = {}
-        if field.minimumExclusive is not None:
-            range_test["min_value"] = field.minimumExclusive
-        if field.maximumExclusive is not None:
-            range_test["max_value"] = field.maximumExclusive
+        if field.exclusiveMinimum is not None:
+            range_test["min_value"] = field.exclusiveMinimum
+        if field.exclusiveMaximum is not None:
+            range_test["max_value"] = field.exclusiveMaximum
         range_test["strictly"] = True
         column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
     else:
         if field.minimum is not None:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}})
+                {"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
+            )
         if field.maximum is not None:
             column.setdefault("tests", []).append(
-                {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}})
-        if field.minimumExclusive is not None:
-            column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": {
-                "min_value": field.minimumExclusive, "strictly": True}})
-        if field.maximumExclusive is not None:
-            column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": {
-                "max_value": field.maximumExclusive, "strictly": True}})
+                {"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
+            )
+        if field.exclusiveMinimum is not None:
+            column.setdefault("tests", []).append(
+                {
+                    "dbt_expectations.expect_column_values_to_be_between": {
+                        "min_value": field.exclusiveMinimum,
+                        "strictly": True,
+                    }
+                }
+            )
+        if field.exclusiveMaximum is not None:
+            column.setdefault("tests", []).append(
+                {
+                    "dbt_expectations.expect_column_values_to_be_between": {
+                        "max_value": field.exclusiveMaximum,
+                        "strictly": True,
+                    }
+                }
+            )
 
     # TODO: all constraints
     return column
-
-
-def _convert_type_to_snowflake(type) -> None | str:
-    # currently optimized for snowflake
-    # LEARNING: data contract has no direct support for CHAR,CHARACTER
-    # LEARNING: data contract has no support for "date-time", "datetime", "time"
-    # LEARNING: No precision and scale support in data contract
-    # LEARNING: no support for any
-    # GEOGRAPHY and GEOMETRY are not supported by the mapping
-    if type is None:
-        return None
-    if type.lower() in ["string", "varchar", "text"]:
-        return type.upper()  # STRING, TEXT, VARCHAR are all the same in snowflake
-    if type.lower() in ["timestamp", "timestamp_tz"]:
-        return "TIMESTAMP_TZ"
-    if type.lower() in ["timestamp_ntz"]:
-        return "TIMESTAMP_NTZ"
-    if type.lower() in ["date"]:
-        return "DATE"
-    if type.lower() in ["time"]:
-        return "TIME"
-    if type.lower() in ["number", "decimal", "numeric"]:
-        return "NUMBER"  # precision and scale not supported by data contract
-    if type.lower() in ["float", "double"]:
-        return "FLOAT"
-    if type.lower() in ["integer", "int", "long", "bigint"]:
-        return "NUMBER"  # always NUMBER(38,0)
-    if type.lower() in ["boolean"]:
-        return "BOOLEAN"
-    if type.lower() in ["object", "record", "struct"]:
-        return "OBJECT"
-    if type.lower() in ["bytes"]:
-        return "BINARY"
-    if type.lower() in ["array"]:
-        return "ARRAY"
-    return None
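
To see what the reworked range checks emit, here is a small sketch. It is not from the diff: the Field values are invented, _to_column is an internal helper, and the NUMBER data type assumes the new sql_type_converter keeps the old Snowflake integer mapping:

    from datacontract.export.dbt_converter import _to_column
    from datacontract.model.data_contract_specification import Field

    # A numeric field using the renamed exclusiveMinimum constraint
    # (formerly minimumExclusive).
    field = Field(type="integer", exclusiveMinimum=0)

    # Expected shape (illustrative): a data_type from convert_to_sql_type,
    # plus a strict range test:
    #   {"data_type": "NUMBER",
    #    "tests": [{"dbt_expectations.expect_column_values_to_be_between":
    #               {"min_value": 0, "strictly": True}}]}
    print(_to_column(field, supports_constraints=True, supports_datatype=True))
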
datacontract/export/great_expectations_converter.py
@@ -0,0 +1,141 @@
+import json
+from typing import Dict, List, Any
+
+import yaml
+
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification, Field, Quality
+
+
+def to_great_expectations(data_contract_spec: DataContractSpecification, model_key: str) -> str:
+    """
+    Convert each model in the contract to a Great Expectation suite
+    @param data_contract_spec: data contract to export to great expectations
+    @param model_key: model to great expectations to
+    @return: a dictionary of great expectation suites
+    """
+    expectations = []
+    model_value = data_contract_spec.models.get(model_key)
+    quality_checks = get_quality_checks(data_contract_spec.quality)
+    expectations.extend(model_to_expectations(model_value.fields))
+    expectations.extend(checks_to_expectations(quality_checks, model_key))
+    model_expectation_suite = to_suite(model_key, data_contract_spec.info.version, expectations)
+
+    return model_expectation_suite
+
+
+def to_suite(
+    model_key: str,
+    contract_version: str,
+    expectations: List[Dict[str, Any]],
+) -> str:
+    return json.dumps(
+        {
+            "data_asset_type": "null",
+            "expectation_suite_name": "user-defined.{model_key}.{contract_version}".format(
+                model_key=model_key, contract_version=contract_version
+            ),
+            "expectations": expectations,
+            "meta": {},
+        },
+        indent=2,
+    )
+
+
+def model_to_expectations(fields: Dict[str, Field]) -> List[Dict[str, Any]]:
+    """
+    Convert the model information to expectations
+    @param fields: model field
+    @return: list of expectations
+    """
+    expectations = []
+    add_column_order_exp(fields, expectations)
+    for field_name, field in fields.items():
+        add_field_expectations(field_name, field, expectations)
+    return expectations
+
+
+def add_field_expectations(field_name, field: Field, expectations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    if field.type is not None:
+        expectations.append(to_column_types_exp(field_name, field.type))
+    if field.unique is not None:
+        expectations.append(to_column_unique_exp(field_name))
+    if field.maxLength is not None or field.minLength is not None:
+        expectations.append(to_column_length_exp(field_name, field.minLength, field.maxLength))
+    if field.minimum is not None or field.maximum is not None:
+        expectations.append(to_column_min_max_exp(field_name, field.minimum, field.maximum))
+
+    # TODO: all constraints
+    return expectations
+
+
+def add_column_order_exp(fields: Dict[str, Field], expectations: List[Dict[str, Any]]):
+    expectations.append(
+        {
+            "expectation_type": "expect_table_columns_to_match_ordered_list",
+            "kwargs": {"column_list": list(fields.keys())},
+            "meta": {},
+        }
+    )
+
+
+def to_column_types_exp(field_name, field_type) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_values_to_be_of_type",
+        "kwargs": {"column": field_name, "type_": field_type},
+        "meta": {},
+    }
+
+
+def to_column_unique_exp(field_name) -> Dict[str, Any]:
+    return {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": field_name}, "meta": {}}
+
+
+def to_column_length_exp(field_name, min_length, max_length) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_value_lengths_to_be_between",
+        "kwargs": {"column": field_name, "min_value": min_length, "max_value": max_length},
+        "meta": {},
+    }
+
+
+def to_column_min_max_exp(field_name, minimum, maximum) -> Dict[str, Any]:
+    return {
+        "expectation_type": "expect_column_values_to_be_between",
+        "kwargs": {"column": field_name, "min_value": minimum, "max_value": maximum},
+        "meta": {},
+    }
+
+
+def get_quality_checks(quality: Quality) -> Dict[str, Any]:
+    if quality is None:
+        return {}
+    if quality.type is None:
+        return {}
+    if quality.type.lower() != "great-expectations":
+        return {}
+    if isinstance(quality.specification, str):
+        quality_specification = yaml.safe_load(quality.specification)
+    else:
+        quality_specification = quality.specification
+    return quality_specification
+
+
+def checks_to_expectations(quality_checks: Dict[str, Any], model_key: str) -> List[Dict[str, Any]]:
+    """
+    Get the quality definition for each model to the model expectation list
+    @param quality_checks: dictionary of quality checks by model
+    @param model_key: id of the model
+    @return: the list of expectations for that model
+    """
+    if quality_checks is None or model_key not in quality_checks:
+        return []
+
+    model_quality_checks = quality_checks[model_key]
+
+    if model_quality_checks is None:
+        return []
+
+    if isinstance(model_quality_checks, str):
+        expectation_list = json.loads(model_quality_checks)
+        return expectation_list
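
A sketch of the suite this converter produces. Again not part of the diff; the contract is made up and assumes pydantic-style construction from nested dicts:

    from datacontract.export.great_expectations_converter import to_great_expectations
    from datacontract.model.data_contract_specification import DataContractSpecification

    contract = DataContractSpecification(
        id="orders",
        info={"title": "Orders", "version": "1.0.0"},
        models={"orders": {"fields": {"order_id": {"type": "string", "unique": True}}}},
    )

    # Returns a JSON string for suite "user-defined.orders.1.0.0" containing
    # an ordered-column-list expectation plus, for order_id, a column type
    # expectation and a uniqueness expectation.
    print(to_great_expectations(contract, "orders"))
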
datacontract/export/jsonschema_converter.py
@@ -12,16 +12,18 @@ def to_jsonschemas(data_contract_spec: DataContractSpecification):
         jsonschmemas[model_key] = jsonschema
     return jsonschmemas
 
+
 def to_jsonschema_json(model_key, model_value: Model) -> str:
     jsonschema = to_jsonschema(model_key, model_value)
     return json.dumps(jsonschema, indent=2)
 
+
 def to_jsonschema(model_key, model_value: Model) -> dict:
     return {
         "$schema": "http://json-schema.org/draft-07/schema#",
         "type": "object",
         "properties": to_properties(model_value.fields),
-        "required": to_required(model_value.fields)
+        "required": to_required(model_value.fields),
     }
 
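
For reference, a sketch of the exporter's output after this change. Not part of the diff; the model is made up, and to_properties is defined outside this hunk, so the "properties" content shown is an assumption:

    from datacontract.export.jsonschema_converter import to_jsonschema_json
    from datacontract.model.data_contract_specification import Field, Model

    model = Model(fields={"order_id": Field(type="string", required=True)})

    # Expected shape (illustrative):
    # {
    #   "$schema": "http://json-schema.org/draft-07/schema#",
    #   "type": "object",
    #   "properties": {"order_id": {"type": "string"}},
    #   "required": ["order_id"]
    # }
    print(to_jsonschema_json("orders", model))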