datacontract-cli 0.10.8__py3-none-any.whl → 0.10.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datacontract/catalog/catalog.py +4 -2
- datacontract/cli.py +36 -18
- datacontract/data_contract.py +13 -53
- datacontract/engines/soda/check_soda_execute.py +10 -2
- datacontract/engines/soda/connections/duckdb.py +32 -12
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +1 -1
- datacontract/export/exporter.py +3 -2
- datacontract/export/exporter_factory.py +132 -39
- datacontract/export/jsonschema_converter.py +7 -7
- datacontract/export/sodacl_converter.py +17 -12
- datacontract/export/spark_converter.py +211 -0
- datacontract/export/sql_type_converter.py +28 -0
- datacontract/imports/avro_importer.py +149 -7
- datacontract/imports/bigquery_importer.py +17 -0
- datacontract/imports/dbt_importer.py +117 -0
- datacontract/imports/glue_importer.py +116 -33
- datacontract/imports/importer.py +34 -0
- datacontract/imports/importer_factory.py +90 -0
- datacontract/imports/jsonschema_importer.py +14 -3
- datacontract/imports/odcs_importer.py +8 -0
- datacontract/imports/spark_importer.py +134 -0
- datacontract/imports/sql_importer.py +8 -0
- datacontract/imports/unity_importer.py +23 -9
- datacontract/integration/publish_datamesh_manager.py +10 -5
- datacontract/lint/resolve.py +87 -21
- datacontract/lint/schema.py +24 -4
- datacontract/model/data_contract_specification.py +37 -4
- datacontract/templates/datacontract.html +18 -3
- datacontract/templates/index.html +1 -1
- datacontract/templates/partials/datacontract_information.html +20 -0
- datacontract/templates/partials/datacontract_terms.html +7 -0
- datacontract/templates/partials/definition.html +9 -1
- datacontract/templates/partials/model_field.html +23 -6
- datacontract/templates/partials/server.html +49 -16
- datacontract/templates/style/output.css +42 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/METADATA +310 -122
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/RECORD +42 -36
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.8.dist-info → datacontract_cli-0.10.10.dist-info}/top_level.txt +0 -0
datacontract/export/sodacl_converter.py

@@ -41,27 +41,32 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
         if field.unique:
             checks.append(check_field_unique(field_name, quote_field_name))
         if field.minLength is not None:
-            checks.append(check_field_min_length(field_name, field.minLength))
+            checks.append(check_field_min_length(field_name, field.minLength, quote_field_name))
         if field.maxLength is not None:
-            checks.append(check_field_max_length(field_name, field.maxLength))
+            checks.append(check_field_max_length(field_name, field.maxLength, quote_field_name))
         if field.minimum is not None:
-            checks.append(check_field_minimum(field_name, field.minimum))
+            checks.append(check_field_minimum(field_name, field.minimum, quote_field_name))
         if field.maximum is not None:
-            checks.append(check_field_maximum(field_name, field.maximum))
+            checks.append(check_field_maximum(field_name, field.maximum, quote_field_name))
         if field.exclusiveMinimum is not None:
-            checks.append(check_field_minimum(field_name, field.exclusiveMinimum))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMinimum))
+            checks.append(check_field_minimum(field_name, field.exclusiveMinimum, quote_field_name))
+            checks.append(check_field_not_equal(field_name, field.exclusiveMinimum, quote_field_name))
         if field.exclusiveMaximum is not None:
-            checks.append(check_field_maximum(field_name, field.exclusiveMaximum))
-            checks.append(check_field_not_equal(field_name, field.exclusiveMaximum))
+            checks.append(check_field_maximum(field_name, field.exclusiveMaximum, quote_field_name))
+            checks.append(check_field_not_equal(field_name, field.exclusiveMaximum, quote_field_name))
         if field.pattern is not None:
-            checks.append(check_field_regex(field_name, field.pattern))
+            checks.append(check_field_regex(field_name, field.pattern, quote_field_name))
         if field.enum is not None and len(field.enum) > 0:
-            checks.append(check_field_enum(field_name, field.enum))
+            checks.append(check_field_enum(field_name, field.enum, quote_field_name))
         # TODO references: str = None
         # TODO format
 
-
+    checks_for_model_key = f"checks for {model_key}"
+
+    if quote_field_name:
+        checks_for_model_key = f'checks for "{model_key}"'
+
+    return checks_for_model_key, checks
 
 
 def check_field_is_present(field_name):
@@ -126,7 +131,7 @@ def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
         field_name = f'"{field_name}"'
     return {
         f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a minimum of {
+            "name": f"Check that field {field_name} has a minimum of {minimum}",
            "valid min": minimum,
        }
    }
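Taken together, the sodacl_converter.py changes pass quote_field_name through every field-level check helper and quote the model-level section key as well. A minimal sketch of the effect, based only on the check_field_minimum hunk above (the import path follows the file list at the top; the field name, minimum, and model name are illustrative):

from datacontract.export.sodacl_converter import check_field_minimum

check_field_minimum("amount", 0, quote_field_name=True)
# With quoting enabled the field name is wrapped in double quotes:
# {'invalid_count("amount") = 0': {
#     'name': 'Check that field "amount" has a minimum of 0',
#     'valid min': 0,
# }}
# and to_checks returns the section key as 'checks for "my_model"' instead of 'checks for my_model'.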
datacontract/export/spark_converter.py

@@ -0,0 +1,211 @@
+from pyspark.sql import types
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Model,
+    Field,
+)
+from datacontract.export.exporter import Exporter
+
+
+class SparkExporter(Exporter):
+    """
+    Exporter class for exporting data contracts to Spark schemas.
+    """
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model,
+        server,
+        sql_server_type,
+        export_args,
+    ) -> dict[str, types.StructType]:
+        """
+        Export the given data contract to Spark schemas.
+
+        Args:
+            data_contract (DataContractSpecification): The data contract specification.
+            model: Not used in this implementation.
+            server: Not used in this implementation.
+            sql_server_type: Not used in this implementation.
+            export_args: Additional arguments for export.
+
+        Returns:
+            dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas.
+        """
+        return to_spark(data_contract)
+
+
+def to_spark(contract: DataContractSpecification) -> str:
+    """
+    Converts a DataContractSpecification into a Spark schema string.
+
+    Args:
+        contract (DataContractSpecification): The data contract specification containing models.
+
+    Returns:
+        str: A string representation of the Spark schema for each model in the contract.
+    """
+    return "\n\n".join(
+        f"{model_name} = {print_schema(to_spark_schema(model))}" for model_name, model in contract.models.items()
+    )
+
+
+def to_spark_dict(contract: DataContractSpecification) -> dict[str, types.StructType]:
+    """
+    Convert a data contract specification to Spark schemas.
+
+    Args:
+        contract (DataContractSpecification): The data contract specification.
+
+    Returns:
+        dict[str, types.StructType]: A dictionary mapping model names to their corresponding Spark schemas.
+    """
+    return {model_name: to_spark_schema(model) for model_name, model in contract.models.items()}
+
+
+def to_spark_schema(model: Model) -> types.StructType:
+    """
+    Convert a model to a Spark schema.
+
+    Args:
+        model (Model): The model to convert.
+
+    Returns:
+        types.StructType: The corresponding Spark schema.
+    """
+    return to_struct_type(model.fields)
+
+
+def to_struct_type(fields: dict[str, Field]) -> types.StructType:
+    """
+    Convert a dictionary of fields to a Spark StructType.
+
+    Args:
+        fields (dict[str, Field]): The fields to convert.
+
+    Returns:
+        types.StructType: The corresponding Spark StructType.
+    """
+    struct_fields = [to_struct_field(field, field_name) for field_name, field in fields.items()]
+    return types.StructType(struct_fields)
+
+
+def to_struct_field(field: Field, field_name: str) -> types.StructField:
+    """
+    Convert a field to a Spark StructField.
+
+    Args:
+        field (Field): The field to convert.
+        field_name (str): The name of the field.
+
+    Returns:
+        types.StructField: The corresponding Spark StructField.
+    """
+    data_type = to_data_type(field)
+    return types.StructField(name=field_name, dataType=data_type, nullable=not field.required)
+
+
+def to_data_type(field: Field) -> types.DataType:
+    """
+    Convert a field to a Spark DataType.
+
+    Args:
+        field (Field): The field to convert.
+
+    Returns:
+        types.DataType: The corresponding Spark DataType.
+    """
+    field_type = field.type
+    if field_type is None or field_type in ["null"]:
+        return types.NullType()
+    if field_type == "array":
+        return types.ArrayType(to_data_type(field.items))
+    if field_type in ["object", "record", "struct"]:
+        return types.StructType(to_struct_type(field.fields))
+    if field_type in ["string", "varchar", "text"]:
+        return types.StringType()
+    if field_type in ["number", "decimal", "numeric"]:
+        return types.DecimalType()
+    if field_type in ["integer", "int"]:
+        return types.IntegerType()
+    if field_type == "long":
+        return types.LongType()
+    if field_type == "float":
+        return types.FloatType()
+    if field_type == "double":
+        return types.DoubleType()
+    if field_type == "boolean":
+        return types.BooleanType()
+    if field_type in ["timestamp", "timestamp_tz"]:
+        return types.TimestampType()
+    if field_type == "timestamp_ntz":
+        return types.TimestampNTZType()
+    if field_type == "date":
+        return types.DateType()
+    if field_type == "bytes":
+        return types.BinaryType()
+    return types.BinaryType()
+
+
+def print_schema(dtype: types.DataType) -> str:
+    """
+    Converts a PySpark DataType schema to its equivalent code representation.
+
+    Args:
+        dtype (types.DataType): The PySpark DataType schema to be converted.
+
+    Returns:
+        str: The code representation of the PySpark DataType schema.
+    """
+
+    def indent(text: str, level: int) -> str:
+        """
+        Indents each line of the given text by a specified number of levels.
+
+        Args:
+            text (str): The text to be indented.
+            level (int): The number of indentation levels.
+
+        Returns:
+            str: The indented text.
+        """
+        return "\n".join([f'{" " * level}{line}' for line in text.split("\n")])
+
+    def repr_column(column: types.StructField) -> str:
+        """
+        Converts a PySpark StructField to its code representation.
+
+        Args:
+            column (types.StructField): The StructField to be converted.
+
+        Returns:
+            str: The code representation of the StructField.
+        """
+        name = f'"{column.name}"'
+        data_type = indent(print_schema(column.dataType), 1)
+        nullable = indent(f"{column.nullable}", 1)
+        return f"StructField({name},\n{data_type},\n{nullable}\n)"
+
+    def format_struct_type(struct_type: types.StructType) -> str:
+        """
+        Converts a PySpark StructType to its code representation.
+
+        Args:
+            struct_type (types.StructType): The StructType to be converted.
+
+        Returns:
+            str: The code representation of the StructType.
+        """
+        fields = ",\n".join([indent(repr_column(field), 1) for field in struct_type.fields])
+        return f"StructType([\n{fields}\n])"
+
+    if isinstance(dtype, types.StructType):
+        return format_struct_type(dtype)
+    elif isinstance(dtype, types.ArrayType):
+        return f"ArrayType({print_schema(dtype.elementType)})"
+    elif isinstance(dtype, types.DecimalType):
+        return f"DecimalType({dtype.precision}, {dtype.scale})"
+    else:
+        dtype_str = str(dtype)
+        return dtype_str if dtype_str.endswith("()") else f"{dtype_str}()"
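A usage sketch for the new exporter. It assumes the pydantic-style keyword constructors of DataContractSpecification, Model, and Field accept the arguments shown; the model name, field names, and types are illustrative:

from datacontract.export.spark_converter import to_spark, to_spark_dict
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Model,
)

contract = DataContractSpecification(
    models={
        "orders": Model(
            fields={
                "order_id": Field(type="string", required=True),
                "amount": Field(type="decimal"),
                "created_at": Field(type="timestamp"),
            }
        )
    }
)

schemas = to_spark_dict(contract)  # {"orders": StructType([...])}
print(to_spark(contract))          # prints generated code like:
                                   # orders = StructType([ StructField("order_id", StringType(), False), ... ])
                                   # (formatted across multiple lines by print_schema)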
datacontract/export/sql_type_converter.py

@@ -15,6 +15,8 @@ def convert_to_sql_type(field: Field, server_type: str) -> str:
         return convert_type_to_sqlserver(field)
     elif server_type == "bigquery":
         return convert_type_to_bigquery(field)
+    elif server_type == "trino":
+        return convert_type_to_trino(field)
     return field.type
 
 
@@ -249,3 +251,29 @@ def get_type_config(field: Field, config_attr: str) -> dict[str, str] | None:
     if not field.config:
         return None
     return field.config.get(config_attr, None)
+
+
+def convert_type_to_trino(field: Field) -> None | str:
+    """Convert from supported datacontract types to equivalent trino types"""
+    field_type = field.type
+
+    if field_type.lower() in ["string", "text", "varchar"]:
+        return "varchar"
+    # tinyint, smallint not supported by data contract
+    if field_type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "decimal"
+    if field_type.lower() in ["int", "integer"]:
+        return "integer"
+    if field_type.lower() in ["long", "bigint"]:
+        return "bigint"
+    if field_type.lower() in ["float"]:
+        return "real"
+    if field_type.lower() in ["timestamp", "timestamp_tz"]:
+        return "timestamp(3) with time zone"
+    if field_type.lower() in ["timestamp_ntz"]:
+        return "timestamp(3)"
+    if field_type.lower() in ["bytes"]:
+        return "varbinary"
+    if field_type.lower() in ["object", "record", "struct"]:
+        return "json"
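A short illustration of the new Trino mapping. The return values are taken directly from the convert_type_to_trino hunk above; constructing Field(type=...) directly is an assumption made for brevity:

from datacontract.export.sql_type_converter import convert_to_sql_type
from datacontract.model.data_contract_specification import Field

convert_to_sql_type(Field(type="timestamp_tz"), "trino")  # "timestamp(3) with time zone"
convert_to_sql_type(Field(type="long"), "trino")          # "bigint"
convert_to_sql_type(Field(type="bytes"), "trino")         # "varbinary"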
datacontract/imports/avro_importer.py

@@ -1,10 +1,46 @@
+from typing import Dict, List
+
 import avro.schema
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
 
+class AvroImporter(Importer):
+    """Class to import Avro Schema file"""
+
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        """
+        Import Avro schema from a source file.
+
+        Args:
+            data_contract_specification: The data contract specification to update.
+            source: The path to the Avro schema file.
+            import_args: Additional import arguments.
+
+        Returns:
+            The updated data contract specification.
+        """
+        return import_avro(data_contract_specification, source)
+
+
 def import_avro(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Import an Avro schema from a file and update the data contract specification.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: The path to the Avro schema file.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+
+    Raises:
+        DataContractException: If there's an error parsing the Avro schema.
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
@@ -37,7 +73,14 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
-def handle_config_avro_custom_properties(field, imported_field):
+def handle_config_avro_custom_properties(field: avro.schema.Field, imported_field: Field) -> None:
+    """
+    Handle custom Avro properties and add them to the imported field's config.
+
+    Args:
+        field: The Avro field.
+        imported_field: The imported field to update.
+    """
     if field.get_prop("logicalType") is not None:
         if imported_field.config is None:
             imported_field.config = {}
@@ -49,7 +92,16 @@ def handle_config_avro_custom_properties(field, imported_field):
         imported_field.config["avroDefault"] = field.default
 
 
-def import_record_fields(record_fields):
+def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
+    """
+    Import Avro record fields and convert them to data contract fields.
+
+    Args:
+        record_fields: List of Avro record fields.
+
+    Returns:
+        A dictionary of imported fields.
+    """
     imported_fields = {}
     for field in record_fields:
         imported_field = Field()
@@ -75,6 +127,15 @@ def import_record_fields(record_fields):
         elif field.type.type == "array":
             imported_field.type = "array"
             imported_field.items = import_avro_array_items(field.type)
+        elif field.type.type == "map":
+            imported_field.type = "map"
+            imported_field.values = import_avro_map_values(field.type)
+        elif field.type.type == "enum":
+            imported_field.type = "string"
+            imported_field.enum = field.type.symbols
+            if not imported_field.config:
+                imported_field.config = {}
+            imported_field.config["avroType"] = "enum"
         else:  # primitive type
             imported_field.type = map_type_from_avro(field.type.type)
 
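For illustration, the new map and enum branches mean that a hypothetical Avro enum field such as the one below is imported as a string field carrying the enum symbols and an avroType marker:

# Hypothetical Avro field declaration (shown as a Python dict for brevity):
avro_field = {
    "name": "status",
    "type": {"type": "enum", "name": "Status", "symbols": ["ACTIVE", "INACTIVE"]},
}
# Per the enum branch above, the imported data contract field ends up with:
#   imported_field.type == "string"
#   imported_field.enum == ["ACTIVE", "INACTIVE"]
#   imported_field.config["avroType"] == "enum"
# A map-typed field instead gets type "map" and its values imported via import_avro_map_values.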
@@ -83,7 +144,16 @@ def import_record_fields(record_fields):
     return imported_fields
 
 
-def import_avro_array_items(array_schema):
+def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> Field:
+    """
+    Import Avro array items and convert them to a data contract field.
+
+    Args:
+        array_schema: The Avro array schema.
+
+    Returns:
+        Field: The imported field representing the array items.
+    """
     items = Field()
     for prop in array_schema.other_props:
         items.__setattr__(prop, array_schema.other_props[prop])
@@ -100,7 +170,45 @@ def import_avro_array_items(array_schema):
     return items
 
 
-def
+def import_avro_map_values(map_schema: avro.schema.MapSchema) -> Field:
+    """
+    Import Avro map values and convert them to a data contract field.
+
+    Args:
+        map_schema: The Avro map schema.
+
+    Returns:
+        Field: The imported field representing the map values.
+    """
+    values = Field()
+    for prop in map_schema.other_props:
+        values.__setattr__(prop, map_schema.other_props[prop])
+
+    if map_schema.values.type == "record":
+        values.type = "object"
+        values.fields = import_record_fields(map_schema.values.fields)
+    elif map_schema.values.type == "array":
+        values.type = "array"
+        values.items = import_avro_array_items(map_schema.values)
+    else:  # primitive type
+        values.type = map_type_from_avro(map_schema.values.type)
+
+    return values
+
+
+def import_type_of_optional_field(field: avro.schema.Field) -> str:
+    """
+    Determine the type of optional field in an Avro union.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        str: The mapped type of the non-null field in the union.
+
+    Raises:
+        DataContractException: If no non-null type is found in the union.
+    """
     for field_type in field.type.schemas:
         if field_type.type != "null":
             return map_type_from_avro(field_type.type)
@@ -113,21 +221,51 @@ def import_type_of_optional_field(field):
     )
 
 
-def get_record_from_union_field(field):
+def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordSchema | None:
+    """
+    Get the record schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The record schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "record":
             return field_type
     return None
 
 
-def get_array_from_union_field(field):
+def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySchema | None:
+    """
+    Get the array schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The array schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "array":
             return field_type
     return None
 
 
-def map_type_from_avro(avro_type_str: str):
+def map_type_from_avro(avro_type_str: str) -> str:
+    """
+    Map Avro type strings to data contract type strings.
+
+    Args:
+        avro_type_str (str): The Avro type string.
+
+    Returns:
+        str: The corresponding data contract type string.
+
+    Raises:
+        DataContractException: If the Avro type is unsupported.
+    """
     # TODO: ambiguous mapping in the export
     if avro_type_str == "null":
         return "null"
@@ -147,6 +285,10 @@ def map_type_from_avro(avro_type_str: str):
         return "record"
     elif avro_type_str == "array":
         return "array"
+    elif avro_type_str == "map":
+        return "map"
+    elif avro_type_str == "enum":
+        return "string"
     else:
         raise DataContractException(
             type="schema",
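A minimal usage sketch of the Avro import entry point shown above; the file path is hypothetical, and a default-constructed DataContractSpecification is assumed to be a sufficient starting point:

from datacontract.imports.avro_importer import import_avro
from datacontract.model.data_contract_specification import DataContractSpecification

spec = import_avro(DataContractSpecification(), "orders.avsc")  # "orders.avsc" is a hypothetical path
print(spec.models)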
datacontract/imports/bigquery_importer.py

@@ -2,10 +2,27 @@ import json
 import logging
 from typing import List
 
+from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
 from datacontract.model.exceptions import DataContractException
 
 
+class BigQueryImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        if source is not None:
+            data_contract_specification = import_bigquery_from_json(data_contract_specification, source)
+        else:
+            data_contract_specification = import_bigquery_from_api(
+                data_contract_specification,
+                import_args.get("bigquery_tables"),
+                import_args.get("bigquery_project"),
+                import_args.get("bigquery_dataset"),
+            )
+        return data_contract_specification
+
+
 def import_bigquery_from_json(
     data_contract_specification: DataContractSpecification, source: str
 ) -> DataContractSpecification:
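A sketch of the JSON code path of the new BigQuery import; the file name is hypothetical and a default-constructed DataContractSpecification is assumed. The API path is only referenced in comments, using the import_args keys shown in the hunk above:

from datacontract.imports.bigquery_importer import import_bigquery_from_json
from datacontract.model.data_contract_specification import DataContractSpecification

# source given: parse an exported BigQuery table definition from JSON ("table.json" is hypothetical).
spec = import_bigquery_from_json(DataContractSpecification(), "table.json")

# source is None: BigQueryImporter.import_source instead calls import_bigquery_from_api with the
# values of the "bigquery_tables", "bigquery_project" and "bigquery_dataset" import_args keys.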
datacontract/imports/dbt_importer.py

@@ -0,0 +1,117 @@
+import json
+
+from typing import (
+    List,
+)
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+
+
+class DbtManifestImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        data = read_dbt_manifest(manifest_path=source)
+        return import_dbt_manifest(
+            data_contract_specification, manifest_dict=data, dbt_models=import_args.get("dbt_model")
+        )
+
+
+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification, manifest_dict: dict, dbt_models: List[str]
+):
+    data_contract_specification.info.title = manifest_dict.get("info").get("project_name")
+    data_contract_specification.info.dbt_version = manifest_dict.get("info").get("dbt_version")
+
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    for model in manifest_dict.get("models", []):
+        if dbt_models and model.name not in dbt_models:
+            continue
+
+        dc_model = Model(
+            description=model.description,
+            tags=model.tags,
+            fields=create_fields(model.columns),
+        )
+
+        data_contract_specification.models[model.name] = dc_model
+
+    return data_contract_specification
+
+
+def create_fields(columns: List):
+    fields = {}
+    for column in columns:
+        field = Field(
+            description=column.description, type=column.data_type if column.data_type else "", tags=column.tags
+        )
+        fields[column.name] = field
+
+    return fields
+
+
+def read_dbt_manifest(manifest_path: str):
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        manifest = json.load(f)
+        return {"info": manifest.get("metadata"), "models": create_manifest_models(manifest)}
+
+
+def create_manifest_models(manifest: dict) -> List:
+    models = []
+    nodes = manifest.get("nodes")
+
+    for node in nodes.values():
+        if node["resource_type"] != "model":
+            continue
+
+        models.append(DbtModel(node))
+    return models
+
+
+class DbtColumn:
+    name: str
+    description: str
+    data_type: str
+    meta: dict
+    tags: List
+
+    def __init__(self, node_column: dict):
+        self.name = node_column.get("name")
+        self.description = node_column.get("description")
+        self.data_type = node_column.get("data_type")
+        self.meta = node_column.get("meta", {})
+        self.tags = node_column.get("tags", [])
+
+    def __repr__(self) -> str:
+        return self.name
+
+
+class DbtModel:
+    name: str
+    database: str
+    schema: str
+    description: str
+    unique_id: str
+    tags: List
+
+    def __init__(self, node: dict):
+        self.name = node.get("name")
+        self.database = node.get("database")
+        self.schema = node.get("schema")
+        self.description = node.get("description")
+        self.display_name = node.get("display_name")
+        self.unique_id = node.get("unique_id")
+        self.columns = []
+        self.tags = node.get("tags")
+        if node.get("columns"):
+            self.add_columns(node.get("columns").values())
+
+    def add_columns(self, model_columns: List):
+        for column in model_columns:
+            self.columns.append(DbtColumn(column))
+
+    def __repr__(self) -> str:
+        return self.name