datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/imports/glue_importer.py
@@ -0,0 +1,288 @@
+import re
+from typing import Dict, Generator, List
+
+import boto3
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Field,
+    Model,
+    Server,
+)
+
+
+class GlueImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
+
+
+def get_glue_database(database_name: str):
+    """Get the details of a Glue database.
+
+    Args:
+        database_name (str): Glue database to request.
+
+    Returns:
+        tuple: catalog ID and location URI
+    """
+    glue = boto3.client("glue")
+    try:
+        response = glue.get_database(Name=database_name)
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Database {database_name} not found.")
+        return (None, None)
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return (None, None)
+
+    return (
+        response["Database"]["CatalogId"],
+        response["Database"].get("LocationUri"),
+    )
+
+
+def get_glue_tables(database_name: str) -> List[str]:
+    """Get the list of tables in a Glue database.
+
+    Args:
+        database_name (str): Glue database to request.
+
+    Returns:
+        List[str]: List of table names
+    """
+    glue = boto3.client("glue")
+
+    # Set the paginator
+    paginator = glue.get_paginator("get_tables")
+
+    # Initialize an empty list to store the table names
+    table_names = []
+    try:
+        # Paginate through the tables
+        for page in paginator.paginate(DatabaseName=database_name, PaginationConfig={"PageSize": 100}):
+            # Add the tables from the current page to the list
+            table_names.extend([table["Name"] for table in page["TableList"] if "Name" in table])
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Database {database_name} not found.")
+        return []
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return []
+
+    return table_names
+
+
+def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
+    """Get the schema of a Glue table.
+
+    Args:
+        database_name (str): Glue database name.
+        table_name (str): Glue table name.
+
+    Returns:
+        List[Dict]: Table schema
+    """
+
+    glue = boto3.client("glue")
+
+    # Get the table schema
+    try:
+        response = glue.get_table(DatabaseName=database_name, Name=table_name)
+    except glue.exceptions.EntityNotFoundException:
+        print(f"Table {table_name} not found in database {database_name}.")
+        return []
+    except Exception as e:
+        # todo catch all
+        print(f"Error: {e}")
+        return []
+
+    table_schema = response["Table"]["StorageDescriptor"]["Columns"]
+
+    # when using hive partition keys, the schema is stored in the PartitionKeys field
+    if response["Table"].get("PartitionKeys") is not None:
+        for pk in response["Table"]["PartitionKeys"]:
+            table_schema.append(
+                {
+                    "Name": pk["Name"],
+                    "Type": pk["Type"],
+                    "Hive": True,
+                    "Comment": pk.get("Comment"),
+                }
+            )
+    return table_schema
+
+
+def import_glue(
+    data_contract_specification: DataContractSpecification,
+    source: str,
+    table_names: List[str],
+) -> DataContractSpecification:
+    """Import the schema of a Glue database.
+
+    Args:
+        data_contract_specification (DataContractSpecification): The data contract specification to update.
+        source (str): The name of the Glue database.
+        table_names (List[str]): List of table names to import. If None, all tables in the database are imported.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+    """
+    catalogid, location_uri = get_glue_database(source)
+
+    # something went wrong
+    if catalogid is None:
+        return data_contract_specification
+
+    if table_names is None:
+        table_names = get_glue_tables(source)
+
+    server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+    if location_uri:
+        server_kwargs["location"] = location_uri
+
+    data_contract_specification.servers = {
+        "production": Server(**server_kwargs),
+    }
+
+    for table_name in table_names:
+        if data_contract_specification.models is None:
+            data_contract_specification.models = {}
+
+        table_schema = get_glue_table_schema(source, table_name)
+
+        fields = {}
+        for column in table_schema:
+            field = create_typed_field(column["Type"])
+
+            # hive partitions are required, but are not primary keys
+            if column.get("Hive"):
+                field.required = True
+
+            field.description = column.get("Comment")
+            fields[column["Name"]] = field
+
+        data_contract_specification.models[table_name] = Model(
+            type="table",
+            fields=fields,
+        )
+
+    return data_contract_specification
+
+
+def create_typed_field(dtype: str) -> Field:
+    """Create a typed field based on the given data type.
+
+    Args:
+        dtype (str): The data type of the field.
+
+    Returns:
+        Field: The created field with the appropriate type.
+    """
+    field = Field()
+    dtype = dtype.strip().lower().replace(" ", "")
+    # Example: array<string>
+    if dtype.startswith("array"):
+        field.type = "array"
+        field.items = create_typed_field(dtype[6:-1])
+    # Example: struct<field1:float,field2:string>
+    elif dtype.startswith("struct"):
+        field.type = "struct"
+        for f in split_struct(dtype[7:-1]):
+            field_name, field_key = f.split(":", 1)
+            field.fields[field_name] = create_typed_field(field_key)
+    # Example: map<string,int>
+    elif dtype.startswith("map"):
+        field.type = "map"
+        map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+        if map_match:
+            key_type = map_match.group(1)
+            value_type = map_match.group(2)
+            field.keys = create_typed_field(key_type)
+            field.values = create_typed_field(value_type)
+    # Example: decimal(38, 6) or decimal
+    elif dtype.startswith("decimal"):
+        field.type = "decimal"
+        decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+        if decimal_match:  # if precision specified
+            field.precision = int(decimal_match.group(1))
+            field.scale = int(decimal_match.group(2))
+    # Example: varchar(255) or varchar
+    elif dtype.startswith("varchar"):
+        field.type = "varchar"
+        if len(dtype) > 7:
+            field.maxLength = int(dtype[8:-1])
+    else:
+        field.type = map_type_from_sql(dtype)
+    return field
+
+
+def split_fields(s: str) -> Generator[str, None, None]:
+    """Split a string of fields considering nested structures.
+
+    Args:
+        s (str): The string to split.
+
+    Yields:
+        str: The next field in the string.
+    """
+    counter: int = 0
+    last: int = 0
+    for i, x in enumerate(s):
+        if x in ("<", "("):
+            counter += 1
+        elif x in (">", ")"):
+            counter -= 1
+        elif x == "," and counter == 0:
+            yield s[last:i]
+            last = i + 1
+    yield s[last:]
+
+
+def split_struct(s: str) -> List[str]:
+    """Split a struct string into individual fields.
+
+    Args:
+        s (str): The struct string to split.
+
+    Returns:
+        List[str]: List of individual fields in the struct.
+    """
+    return list(split_fields(s=s))
+
+
+def map_type_from_sql(sql_type: str) -> str:
+    """Map an SQL type to a corresponding field type.
+
+    Args:
+        sql_type (str): The SQL type to map.
+
+    Returns:
+        str: The corresponding field type.
+    """
+    if sql_type is None:
+        return None
+
+    sql_type = sql_type.lower()
+
+    type_mapping = {
+        "string": "string",
+        "int": "int",
+        "bigint": "bigint",
+        "float": "float",
+        "double": "double",
+        "boolean": "boolean",
+        "timestamp": "timestamp",
+        "date": "date",
+    }
+
+    for prefix, mapped_type in type_mapping.items():
+        if sql_type.startswith(prefix):
+            return mapped_type
+
+    return "unknown"
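
The bracket counting in split_fields is what keeps the recursion in create_typed_field correct: commas nested inside array<>, map<>, struct<>, or decimal() must not split the outer field list. A minimal sketch (not part of the diff; the type strings are illustrative) of how the pieces compose:

from datacontract.imports.glue_importer import create_typed_field, split_struct

# Bracket-aware splitting: the commas inside array<...> and decimal(...) do not
# split the outer field list.
assert split_struct("id:bigint,tags:array<string>,amount:decimal(38,6)") == [
    "id:bigint",
    "tags:array<string>",
    "amount:decimal(38,6)",
]

# Recursive parsing of a nested struct type.
field = create_typed_field("struct<id:bigint,amount:decimal(38, 6)>")
print(field.type)               # struct
print(field.fields["id"].type)  # bigint
print(field.fields["amount"].precision, field.fields["amount"].scale)  # 38 6
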
datacontract/imports/iceberg_importer.py
@@ -0,0 +1,172 @@
+from typing import Any, Dict
+
+from pydantic import ValidationError
+from pyiceberg import types as iceberg_types
+from pyiceberg.schema import Schema
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from datacontract.model.exceptions import DataContractException
+
+
+class IcebergImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        schema = load_and_validate_iceberg_schema(source)
+        return import_iceberg(
+            data_contract_specification,
+            schema,
+            import_args.get("iceberg_table"),
+        )
+
+
+def load_and_validate_iceberg_schema(source: str) -> Schema:
+    with open(source, "r") as file:
+        try:
+            return Schema.model_validate_json(file.read())
+        except ValidationError as e:
+            raise DataContractException(
+                type="schema",
+                name="Parse iceberg schema",
+                reason=f"Failed to validate iceberg schema from {source}: {e}",
+                engine="datacontract",
+            )
+
+
+def import_iceberg(
+    data_contract_specification: DataContractSpecification, schema: Schema, table_name: str
+) -> DataContractSpecification:
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    model = Model(type="table", title=table_name)
+
+    # Iceberg identifier_fields aren't technically primary keys since Iceberg doesn't support primary keys,
+    # but they are close enough that we can probably treat them as primary keys on the conversion.
+    # ref: https://iceberg.apache.org/spec/#identifier-field-ids
+    # this code WILL NOT support finding nested primary key fields.
+    identifier_fields_ids = schema.identifier_field_ids
+
+    for field in schema.fields:
+        model_field = _field_from_nested_field(field)
+
+        if field.field_id in identifier_fields_ids:
+            model_field.primaryKey = True
+
+        model.fields[field.name] = model_field
+
+    data_contract_specification.models[table_name] = model
+    return data_contract_specification
+
+
+def _field_from_nested_field(nested_field: iceberg_types.NestedField) -> Field:
+    """
+    Converts an Iceberg NestedField into a Field object for the data contract.
+
+    Args:
+        nested_field: The Iceberg NestedField to convert.
+
+    Returns:
+        Field: The generated Field object.
+    """
+    field = Field(
+        title=nested_field.name,
+        required=nested_field.required,
+        config=build_field_config(nested_field),
+    )
+
+    if nested_field.doc is not None:
+        field.description = nested_field.doc
+
+    return _type_from_iceberg_type(field, nested_field.field_type)
+
+
+def _type_from_iceberg_type(field: Field, iceberg_type: iceberg_types.IcebergType) -> Field:
+    """
+    Maps Iceberg data types to the Data Contract type system and updates the field.
+
+    Args:
+        field: The Field object to update.
+        iceberg_type: The Iceberg data type to map.
+
+    Returns:
+        Field: The updated Field object.
+    """
+    field.type = _data_type_from_iceberg(iceberg_type)
+
+    if field.type == "array":
+        field.items = _type_from_iceberg_type(Field(required=iceberg_type.element_required), iceberg_type.element_type)
+
+    elif field.type == "map":
+        field.keys = _type_from_iceberg_type(Field(required=True), iceberg_type.key_type)
+        field.values = _type_from_iceberg_type(Field(required=iceberg_type.value_required), iceberg_type.value_type)
+
+    elif field.type == "object":
+        field.fields = {nf.name: _field_from_nested_field(nf) for nf in iceberg_type.fields}
+
+    return field
+
+
+def build_field_config(iceberg_field: iceberg_types.NestedField) -> Dict[str, Any]:
+    config = {}
+
+    if iceberg_field.field_id > 0:
+        config["icebergFieldId"] = iceberg_field.field_id
+
+    if iceberg_field.initial_default is not None:
+        config["icebergInitialDefault"] = iceberg_field.initial_default
+
+    if iceberg_field.write_default is not None:
+        config["icebergWriteDefault"] = iceberg_field.write_default
+
+    return config
+
+
+def _data_type_from_iceberg(type: iceberg_types.IcebergType) -> str:
+    """
+    Convert an Iceberg field type to a datacontract field type
+
+    Args:
+        type: The Iceberg field type
+
+    Returns:
+        str: The datacontract field type
+    """
+    if isinstance(type, iceberg_types.BooleanType):
+        return "boolean"
+    if isinstance(type, iceberg_types.IntegerType):
+        return "integer"
+    if isinstance(type, iceberg_types.LongType):
+        return "long"
+    if isinstance(type, iceberg_types.FloatType):
+        return "float"
+    if isinstance(type, iceberg_types.DoubleType):
+        return "double"
+    if isinstance(type, iceberg_types.DecimalType):
+        return "decimal"
+    if isinstance(type, iceberg_types.DateType):
+        return "date"
+    if isinstance(type, iceberg_types.TimeType):
+        # there isn't a great mapping for the iceberg type "time", just map to string for now
+        return "string"
+    if isinstance(type, iceberg_types.TimestampType):
+        return "timestamp_ntz"
+    if isinstance(type, iceberg_types.TimestamptzType):
+        return "timestamp_tz"
+    if isinstance(type, iceberg_types.StringType):
+        return "string"
+    if isinstance(type, iceberg_types.UUIDType):
+        return "string"
+    if isinstance(type, iceberg_types.BinaryType):
+        return "bytes"
+    if isinstance(type, iceberg_types.FixedType):
+        return "bytes"
+    if isinstance(type, iceberg_types.MapType):
+        return "map"
+    if isinstance(type, iceberg_types.ListType):
+        return "array"
+    if isinstance(type, iceberg_types.StructType):
+        return "object"
+
+    raise ValueError(f"Unknown Iceberg type: {type}")
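
Identifier fields on the Iceberg schema come out as primaryKey on the converted model, and per-field required flags carry over. A minimal sketch (not part of the diff) that feeds an in-memory pyiceberg Schema through import_iceberg instead of loading JSON from disk; it assumes pyiceberg's documented NestedField and Schema constructors:

from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

from datacontract.imports.iceberg_importer import import_iceberg
from datacontract.model.data_contract_specification import DataContractSpecification

schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
    identifier_field_ids=[1],  # treated as the primary key during conversion
)

spec = import_iceberg(DataContractSpecification(), schema, table_name="orders")
model = spec.models["orders"]
print(model.fields["id"].type, model.fields["id"].primaryKey)  # long True
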
datacontract/imports/importer.py
@@ -0,0 +1,51 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+
+from datacontract_specification.model import DataContractSpecification
+from open_data_contract_standard.model import OpenDataContractStandard
+
+
+class Importer(ABC):
+    def __init__(self, import_format) -> None:
+        self.import_format = import_format
+
+    @abstractmethod
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification | OpenDataContractStandard,
+        source: str,
+        import_args: dict,
+    ) -> DataContractSpecification | OpenDataContractStandard:
+        pass
+
+
+class ImportFormat(str, Enum):
+    sql = "sql"
+    avro = "avro"
+    dbt = "dbt"
+    dbml = "dbml"
+    glue = "glue"
+    jsonschema = "jsonschema"
+    json = "json"
+    bigquery = "bigquery"
+    odcs = "odcs"
+    unity = "unity"
+    spark = "spark"
+    iceberg = "iceberg"
+    parquet = "parquet"
+    csv = "csv"
+    protobuf = "protobuf"
+    excel = "excel"
+
+    @classmethod
+    def get_supported_formats(cls):
+        return list(map(lambda c: c.value, cls))
+
+
+class Spec(str, Enum):
+    datacontract_specification = "datacontract_specification"
+    odcs = "odcs"
+
+    @classmethod
+    def get_supported_types(cls):
+        return list(map(lambda c: c.value, cls))
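
The abstract base keeps the contract small: a single import_source method that receives the specification, a source locator, and free-form import_args, and returns the updated specification. A minimal sketch (not part of the diff) of a hypothetical custom importer built on it:

from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import DataContractSpecification, Model


class CustomYamlImporter(Importer):
    # Hypothetical example; a real importer would parse the source file.
    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        model_name = import_args.get("model_name", "my_table")
        data_contract_specification.models[model_name] = Model(type="table")
        return data_contract_specification


spec = CustomYamlImporter("custom").import_source(DataContractSpecification(), "models.yaml", {})
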
datacontract/imports/importer_factory.py
@@ -0,0 +1,128 @@
+import importlib.util
+import sys
+
+from datacontract.imports.importer import Importer, ImportFormat
+
+
+class ImporterFactory:
+    def __init__(self):
+        self.dict_importer = {}
+        self.dict_lazy_importer = {}
+
+    def register_importer(self, name, importer: Importer):
+        self.dict_importer.update({name: importer})
+
+    def register_lazy_importer(self, name: str, module_path: str, class_name: str):
+        self.dict_lazy_importer.update({name: (module_path, class_name)})
+
+    def create(self, name) -> Importer:
+        importers = self.dict_importer.copy()
+        importers.update(self.dict_lazy_importer.copy())
+        if name not in importers.keys():
+            raise ValueError(f"The '{name}' format is not supported.")
+        importer_class = importers[name]
+        if type(importers[name]) is tuple:
+            importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
+        if not importer_class:
+            raise ValueError(f"Module {name} could not be loaded.")
+        return importer_class(name)
+
+
+def import_module(module_path):
+    if importlib.util.find_spec(module_path) is not None:
+        try:
+            module = importlib.import_module(module_path)
+        except ModuleNotFoundError:
+            return None
+        sys.modules[module_path] = module
+        return module
+
+
+def load_module_class(module_path, class_name):
+    module = import_module(module_path)
+    if not module:
+        return None
+    return getattr(module, class_name)
+
+
+importer_factory = ImporterFactory()
+importer_factory.register_lazy_importer(
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.bigquery,
+    module_path="datacontract.imports.bigquery_importer",
+    class_name="BigQueryImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.jsonschema,
+    module_path="datacontract.imports.jsonschema_importer",
+    class_name="JsonSchemaImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbml,
+    module_path="datacontract.imports.dbml_importer",
+    class_name="DBMLImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.iceberg,
+    module_path="datacontract.imports.iceberg_importer",
+    class_name="IcebergImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.parquet,
+    module_path="datacontract.imports.parquet_importer",
+    class_name="ParquetImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.csv,
+    module_path="datacontract.imports.csv_importer",
+    class_name="CsvImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.protobuf,
+    module_path="datacontract.imports.protobuf_importer",
+    class_name="ProtoBufImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.excel,
+    module_path="datacontract.imports.excel_importer",
+    class_name="ExcelImporter",
+)
+
+
+importer_factory.register_lazy_importer(
+    name=ImportFormat.json,
+    module_path="datacontract.imports.json_importer",
+    class_name="JsonImporter",
+)
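
Lazy registration stores only a (module_path, class_name) tuple per format, so heavyweight optional dependencies (boto3 for Glue, pyiceberg for Iceberg, and so on) are imported only when create() is called for that format. A minimal sketch (not part of the diff) of resolving an importer through the factory; the Glue call assumes AWS credentials are configured and the database name is hypothetical:

from datacontract.imports.importer import ImportFormat
from datacontract.imports.importer_factory import importer_factory
from datacontract.model.data_contract_specification import DataContractSpecification

importer = importer_factory.create(ImportFormat.glue)  # glue_importer is imported here
spec = importer.import_source(
    DataContractSpecification(),
    source="my_glue_database",
    import_args={"glue_table": None},  # None imports every table in the database
)
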