datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/imports/avro_importer.py

@@ -1,11 +1,46 @@
+from typing import Dict, List
+
 import avro.schema

-from datacontract.
-
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException


+class AvroImporter(Importer):
+    """Class to import Avro Schema file"""
+
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        """
+        Import Avro schema from a source file.
+
+        Args:
+            data_contract_specification: The data contract specification to update.
+            source: The path to the Avro schema file.
+            import_args: Additional import arguments.
+
+        Returns:
+            The updated data contract specification.
+        """
+        return import_avro(data_contract_specification, source)
+
+
 def import_avro(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Import an Avro schema from a file and update the data contract specification.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: The path to the Avro schema file.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+
+    Raises:
+        DataContractException: If there's an error parsing the Avro schema.
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}

@@ -20,7 +55,6 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
             engine="datacontract",
             original_exception=e,
         )
-
     # type record is being used for both the table and the object types in data contract
     # -> CONSTRAINT: one table per .avsc input, all nested records are interpreted as objects
     fields = import_record_fields(avro_schema.fields)
@@ -38,35 +72,118 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification


-def
+def handle_config_avro_custom_properties(field: avro.schema.Field, imported_field: Field) -> None:
+    """
+    Handle custom Avro properties and add them to the imported field's config.
+
+    Args:
+        field: The Avro field.
+        imported_field: The imported field to update.
+    """
+    if field.get_prop("logicalType") is not None:
+        if imported_field.config is None:
+            imported_field.config = {}
+        imported_field.config["avroLogicalType"] = field.get_prop("logicalType")
+
+    if field.default is not None:
+        if imported_field.config is None:
+            imported_field.config = {}
+        imported_field.config["avroDefault"] = field.default
+
+
+LOGICAL_TYPE_MAPPING = {
+    "decimal": "decimal",
+    "date": "date",
+    "time-millis": "time",
+    "time-micros": "time",
+    "timestamp-millis": "timestamp_tz",
+    "timestamp-micros": "timestamp_tz",
+    "local-timestamp-micros": "timestamp_ntz",
+    "local-timestamp-millis": "timestamp_ntz",
+    "duration": "string",
+    "uuid": "string",
+}
+
+
+def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
+    """
+    Import Avro record fields and convert them to data contract fields.
+
+    Args:
+        record_fields: List of Avro record fields.
+
+    Returns:
+        A dictionary of imported fields.
+    """
     imported_fields = {}
     for field in record_fields:
-
-
-
-
-
+        imported_field = Field()
+        imported_field.required = True
+        imported_field.description = field.doc
+
+        handle_config_avro_custom_properties(field, imported_field)

+        # Determine field type and handle nested structures
         if field.type.type == "record":
-
-
-
+            imported_field.type = "object"
+            imported_field.description = field.type.doc
+            imported_field.fields = import_record_fields(field.type.fields)
         elif field.type.type == "union":
-
-
-
-if
-
+            imported_field.required = False
+            # Check for enum in union first, since it needs special handling
+            enum_schema = get_enum_from_union_field(field)
+            if enum_schema:
+                imported_field.type = "string"
+                imported_field.enum = enum_schema.symbols
+                imported_field.title = enum_schema.name
+                if not imported_field.config:
+                    imported_field.config = {}
+                imported_field.config["avroType"] = "enum"
+            else:
+                type = import_type_of_optional_field(field)
+                imported_field.type = type
+                if type == "record":
+                    imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
+                elif type == "array":
+                    imported_field.type = "array"
+                    imported_field.items = import_avro_array_items(get_array_from_union_field(field))
         elif field.type.type == "array":
-
-
-
-
+            imported_field.type = "array"
+            imported_field.items = import_avro_array_items(field.type)
+        elif field.type.type == "map":
+            imported_field.type = "map"
+            imported_field.values = import_avro_map_values(field.type)
+        elif field.type.type == "enum":
+            imported_field.type = "string"
+            imported_field.enum = field.type.symbols
+            imported_field.title = field.type.name
+            if not imported_field.config:
+                imported_field.config = {}
+            imported_field.config["avroType"] = "enum"
+        else:
+            logical_type = field.type.get_prop("logicalType")
+            if logical_type in LOGICAL_TYPE_MAPPING:
+                imported_field.type = LOGICAL_TYPE_MAPPING[logical_type]
+                if logical_type == "decimal":
+                    imported_field.precision = field.type.precision
+                    imported_field.scale = field.type.scale
+            else:
+                imported_field.type = map_type_from_avro(field.type.type)
+        imported_fields[field.name] = imported_field

     return imported_fields


-def import_avro_array_items(array_schema):
+def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> Field:
+    """
+    Import Avro array items and convert them to a data contract field.
+
+    Args:
+        array_schema: The Avro array schema.
+
+    Returns:
+        Field: The imported field representing the array items.
+    """
     items = Field()
     for prop in array_schema.other_props:
         items.__setattr__(prop, array_schema.other_props[prop])
@@ -83,10 +200,52 @@ def import_avro_array_items(array_schema):
     return items


-def
+def import_avro_map_values(map_schema: avro.schema.MapSchema) -> Field:
+    """
+    Import Avro map values and convert them to a data contract field.
+
+    Args:
+        map_schema: The Avro map schema.
+
+    Returns:
+        Field: The imported field representing the map values.
+    """
+    values = Field()
+    for prop in map_schema.other_props:
+        values.__setattr__(prop, map_schema.other_props[prop])
+
+    if map_schema.values.type == "record":
+        values.type = "object"
+        values.fields = import_record_fields(map_schema.values.fields)
+    elif map_schema.values.type == "array":
+        values.type = "array"
+        values.items = import_avro_array_items(map_schema.values)
+    else:  # primitive type
+        values.type = map_type_from_avro(map_schema.values.type)
+
+    return values
+
+
+def import_type_of_optional_field(field: avro.schema.Field) -> str:
+    """
+    Determine the type of optional field in an Avro union.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        str: The mapped type of the non-null field in the union.
+
+    Raises:
+        DataContractException: If no non-null type is found in the union.
+    """
     for field_type in field.type.schemas:
         if field_type.type != "null":
-
+            logical_type = field_type.get_prop("logicalType")
+            if logical_type and logical_type in LOGICAL_TYPE_MAPPING:
+                return LOGICAL_TYPE_MAPPING[logical_type]
+            else:
+                return map_type_from_avro(field_type.type)
     raise DataContractException(
         type="schema",
         result="failed",
@@ -96,14 +255,67 @@ def import_type_of_optional_field(field):
     )


-def get_record_from_union_field(field):
+def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordSchema | None:
+    """
+    Get the record schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The record schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "record":
             return field_type
     return None


-def
+def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySchema | None:
+    """
+    Get the array schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The array schema if found, None otherwise.
+    """
+    for field_type in field.type.schemas:
+        if field_type.type == "array":
+            return field_type
+    return None
+
+
+def get_enum_from_union_field(field: avro.schema.Field) -> avro.schema.EnumSchema | None:
+    """
+    Get the enum schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The enum schema if found, None otherwise.
+    """
+    for field_type in field.type.schemas:
+        if field_type.type == "enum":
+            return field_type
+    return None
+
+
+def map_type_from_avro(avro_type_str: str) -> str:
+    """
+    Map Avro type strings to data contract type strings.
+
+    Args:
+        avro_type_str (str): The Avro type string.
+
+    Returns:
+        str: The corresponding data contract type string.
+
+    Raises:
+        DataContractException: If the Avro type is unsupported.
+    """
     # TODO: ambiguous mapping in the export
     if avro_type_str == "null":
         return "null"
@@ -113,6 +325,8 @@ def map_type_from_avro(avro_type_str: str):
         return "binary"
     elif avro_type_str == "double":
         return "double"
+    elif avro_type_str == "float":
+        return "float"
     elif avro_type_str == "int":
         return "int"
     elif avro_type_str == "long":
@@ -121,6 +335,12 @@ def map_type_from_avro(avro_type_str: str):
         return "boolean"
     elif avro_type_str == "record":
         return "record"
+    elif avro_type_str == "array":
+        return "array"
+    elif avro_type_str == "map":
+        return "map"
+    elif avro_type_str == "enum":
+        return "string"
     else:
         raise DataContractException(
             type="schema",
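The new `AvroImporter` class delegates to the module-level `import_avro` function, which remains the programmatic entry point. As a minimal sketch of calling it directly (the `orders.avsc` path and the empty starting specification are illustrative assumptions, not taken from the diff):

```python
# Sketch only: drive the Avro importer added in 0.10.37 directly.
from datacontract.imports.avro_importer import import_avro
from datacontract.model.data_contract_specification import DataContractSpecification

# "orders.avsc" is a hypothetical Avro schema file path.
spec = import_avro(DataContractSpecification(), "orders.avsc")

# One model per top-level record; nested records, unions, arrays, maps, enums,
# and logical types are mapped as shown in the functions above.
for model_name, model in spec.models.items():
    print(model_name, list(model.fields))
```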
datacontract/imports/bigquery_importer.py

@@ -0,0 +1,221 @@
+import json
+import logging
+from typing import List
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from datacontract.model.exceptions import DataContractException
+
+
+class BigQueryImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        if source is not None:
+            data_contract_specification = import_bigquery_from_json(data_contract_specification, source)
+        else:
+            data_contract_specification = import_bigquery_from_api(
+                data_contract_specification,
+                import_args.get("bigquery_table"),
+                import_args.get("bigquery_project"),
+                import_args.get("bigquery_dataset"),
+            )
+        return data_contract_specification
+
+
+def import_bigquery_from_json(
+    data_contract_specification: DataContractSpecification, source: str
+) -> DataContractSpecification:
+    try:
+        with open(source, "r") as file:
+            bigquery_schema = json.loads(file.read())
+    except json.JSONDecodeError as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse bigquery schema",
+            reason=f"Failed to parse bigquery schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+    return convert_bigquery_schema(data_contract_specification, bigquery_schema)
+
+
+def import_bigquery_from_api(
+    data_contract_specification: DataContractSpecification,
+    bigquery_tables: List[str],
+    bigquery_project: str,
+    bigquery_dataset: str,
+) -> DataContractSpecification:
+    try:
+        from google.cloud import bigquery
+    except ImportError as e:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="bigquery extra missing",
+            reason="Install the extra datacontract-cli[bigquery] to use bigquery",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    client = bigquery.Client(project=bigquery_project)
+
+    if bigquery_tables is None:
+        bigquery_tables = fetch_table_names(client, bigquery_dataset)
+
+    for table in bigquery_tables:
+        try:
+            api_table = client.get_table("{}.{}.{}".format(bigquery_project, bigquery_dataset, table))
+
+        except ValueError as e:
+            raise DataContractException(
+                type="schema",
+                result="failed",
+                name="Invalid table name for bigquery API",
+                reason=f"Tablename {table} is invalid for the bigquery API",
+                original_exception=e,
+                engine="datacontract",
+            )
+
+        if api_table is None:
+            raise DataContractException(
+                type="request",
+                result="failed",
+                name="Query bigtable Schema from API",
+                reason=f"Table {table} bnot found on bigtable schema Project {bigquery_project}, dataset {bigquery_dataset}.",
+                engine="datacontract",
+            )
+
+        convert_bigquery_schema(data_contract_specification, api_table.to_api_repr())
+
+    return data_contract_specification
+
+
+def fetch_table_names(client, dataset: str) -> List[str]:
+    table_names = []
+    api_tables = client.list_tables(dataset)
+    for api_table in api_tables:
+        table_names.append(api_table.table_id)
+
+    return table_names
+
+
+def convert_bigquery_schema(
+    data_contract_specification: DataContractSpecification, bigquery_schema: dict
+) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    fields = import_table_fields(bigquery_schema.get("schema").get("fields"))
+
+    # Looking at actual export data, I guess this is always set and friendlyName isn't, though I couldn't say
+    # what exactly leads to friendlyName being set
+    table_id = bigquery_schema.get("tableReference").get("tableId")
+
+    data_contract_specification.models[table_id] = Model(
+        fields=fields, type=map_bigquery_type(bigquery_schema.get("type"))
+    )
+
+    # Copy the description, if it exists
+    if bigquery_schema.get("description") is not None:
+        data_contract_specification.models[table_id].description = bigquery_schema.get("description")
+
+    # Set the title from friendlyName if it exists
+    if bigquery_schema.get("friendlyName") is not None:
+        data_contract_specification.models[table_id].title = bigquery_schema.get("friendlyName")
+
+    return data_contract_specification
+
+
+def import_table_fields(table_fields):
+    imported_fields = {}
+    for field in table_fields:
+        field_name = field.get("name")
+        imported_fields[field_name] = Field()
+        imported_fields[field_name].required = field.get("mode") == "REQUIRED"
+        imported_fields[field_name].description = field.get("description")
+
+        if field.get("type") == "RECORD":
+            imported_fields[field_name].type = "object"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "STRUCT":
+            imported_fields[field_name].type = "struct"
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+        elif field.get("type") == "RANGE":
+            # This is a range of date/datetime/timestamp but multiple values
+            # So we map it to an array
+            imported_fields[field_name].type = "array"
+            imported_fields[field_name].items = Field(
+                type=map_type_from_bigquery(field["rangeElementType"].get("type"))
+            )
+        else:  # primitive type
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
+
+            if field.get("type") == "STRING":
+                # in bigquery both string and bytes have maxLength but in the datacontracts
+                # spec it is only valid for strings
+                if field.get("maxLength") is not None:
+                    imported_fields[field_name].maxLength = int(field.get("maxLength"))
+
+            if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC":
+                if field.get("precision") is not None:
+                    imported_fields[field_name].precision = int(field.get("precision"))
+
+                if field.get("scale") is not None:
+                    imported_fields[field_name].scale = int(field.get("scale"))
+
+    return imported_fields
+
+
+def map_type_from_bigquery(bigquery_type_str: str):
+    if bigquery_type_str == "STRING":
+        return "string"
+    elif bigquery_type_str == "BYTES":
+        return "bytes"
+    elif bigquery_type_str == "INTEGER":
+        return "int"
+    elif bigquery_type_str == "INT64":
+        return "bigint"
+    elif bigquery_type_str == "FLOAT":
+        return "float"
+    elif bigquery_type_str == "FLOAT64":
+        return "double"
+    elif bigquery_type_str == "BOOLEAN" or bigquery_type_str == "BOOL":
+        return "boolean"
+    elif bigquery_type_str == "TIMESTAMP":
+        return "timestamp"
+    elif bigquery_type_str == "DATE":
+        return "date"
+    elif bigquery_type_str == "TIME":
+        return "timestamp_ntz"
+    elif bigquery_type_str == "DATETIME":
+        return "timestamp"
+    elif bigquery_type_str == "NUMERIC":
+        return "numeric"
+    elif bigquery_type_str == "BIGNUMERIC":
+        return "double"
+    elif bigquery_type_str == "GEOGRAPHY":
+        return "object"
+    elif bigquery_type_str == "JSON":
+        return "object"
+    else:
+        raise DataContractException(
+            type="schema",
+            result="failed",
+            name="Map bigquery type to data contract type",
+            reason=f"Unsupported type {bigquery_type_str} in bigquery json definition.",
+            engine="datacontract",
+        )
+
+
+def map_bigquery_type(bigquery_type: str) -> str:
+    if bigquery_type == "TABLE" or bigquery_type == "EXTERNAL" or bigquery_type == "SNAPSHOT":
+        return "table"
+    elif bigquery_type == "VIEW" or bigquery_type == "MATERIALIZED_VIEW":
+        return "view"
+    else:
+        logger = logging.getLogger(__name__)
+        logger.info(
+            f"Can't properly map bigquery table type '{bigquery_type}' to datacontracts model types. Mapping it to table."
+        )
+        return "table"
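For orientation, a rough sketch of the table representation this importer consumes when reading from a JSON file rather than the live API. The keys mirror the BigQuery table resource as read by `convert_bigquery_schema` above; the concrete table and field values are invented for illustration:

```python
# Sketch only: feed a hand-written BigQuery table resource into the converter.
from datacontract.imports.bigquery_importer import convert_bigquery_schema
from datacontract.model.data_contract_specification import DataContractSpecification

bigquery_schema = {
    "tableReference": {"tableId": "orders"},  # becomes the model name
    "type": "TABLE",                          # mapped to "table" by map_bigquery_type
    "description": "Nightly orders export",   # copied to the model description
    "schema": {
        "fields": [
            {"name": "order_id", "type": "STRING", "mode": "REQUIRED", "maxLength": "36"},
            {"name": "amount", "type": "NUMERIC", "precision": "10", "scale": "2"},
        ]
    },
}

spec = convert_bigquery_schema(DataContractSpecification(), bigquery_schema)
print(spec.models["orders"].fields["order_id"].maxLength)  # 36
```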