datacontract-cli 0.10.11__py3-none-any.whl → 0.10.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/cli.py +19 -3
- datacontract/data_contract.py +5 -10
- datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
- datacontract/engines/soda/check_soda_execute.py +2 -8
- datacontract/engines/soda/connections/duckdb.py +23 -24
- datacontract/engines/soda/connections/kafka.py +84 -25
- datacontract/export/avro_converter.py +12 -2
- datacontract/export/bigquery_converter.py +30 -23
- datacontract/export/data_caterer_converter.py +148 -0
- datacontract/export/dbml_converter.py +3 -2
- datacontract/export/exporter.py +2 -0
- datacontract/export/exporter_factory.py +12 -0
- datacontract/export/jsonschema_converter.py +13 -2
- datacontract/export/spark_converter.py +5 -1
- datacontract/export/sql_type_converter.py +65 -39
- datacontract/export/sqlalchemy_converter.py +169 -0
- datacontract/imports/avro_importer.py +1 -0
- datacontract/imports/bigquery_importer.py +2 -2
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +67 -91
- datacontract/imports/glue_importer.py +62 -58
- datacontract/imports/importer.py +2 -1
- datacontract/imports/importer_factory.py +5 -0
- datacontract/imports/odcs_importer.py +1 -1
- datacontract/imports/spark_importer.py +34 -11
- datacontract/imports/sql_importer.py +1 -1
- datacontract/imports/unity_importer.py +106 -85
- datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
- datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
- datacontract/lint/resolve.py +10 -1
- datacontract/lint/urls.py +27 -13
- datacontract/model/data_contract_specification.py +6 -2
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/METADATA +123 -32
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/RECORD +39 -37
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/WHEEL +1 -1
- datacontract/publish/publish.py +0 -32
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/top_level.txt +0 -0
datacontract/imports/dbt_importer.py
CHANGED

@@ -1,117 +1,93 @@
 import json
-from typing import (
-    List,
-)
+from typing import TypedDict

 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from dbt.artifacts.resources.v1.components import ColumnInfo
+from dbt.contracts.graph.manifest import Manifest
+
+
+class DBTImportArgs(TypedDict, total=False):
+    """
+    A dictionary representing arguments for importing DBT models.
+    Makes the DBT Importer more customizable by allowing for flexible filtering
+    of models and their properties, through wrapping or extending.
+
+    Attributes:
+        dbt_models: The keys of models to be used in contract. All as default.
+        resource_types: Nodes listed in resource_types are kept while importing. model as default.
+    """
+
+    dbt_nodes: list[str]
+    resource_types: list[str]


 class DbtManifestImporter(Importer):
     def import_source(
-        self,
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: DBTImportArgs,
+    ) -> DataContractSpecification:
+        manifest = read_dbt_manifest(manifest_path=source)
         return import_dbt_manifest(
-            data_contract_specification
+            data_contract_specification=data_contract_specification,
+            manifest=manifest,
+            dbt_nodes=import_args.get("dbt_nodes", []),
+            resource_types=import_args.get("resource_types", ["model"]),
         )


-def
-):
+def read_dbt_manifest(manifest_path: str) -> Manifest:
+    """Read a manifest from file."""
+    with open(file=manifest_path, mode="r", encoding="utf-8") as f:
+        manifest_dict: dict = json.load(f)
+        return Manifest.from_dict(manifest_dict)

-    if data_contract_specification.models is None:
-        data_contract_specification.models = {}

+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification,
+    manifest: Manifest,
+    dbt_nodes: list[str],
+    resource_types: list[str],
+) -> DataContractSpecification:
+    """
+    Extracts all relevant information from the manifest,
+    and puts it in a data contract specification.
+    """
+    data_contract_specification.info.title = manifest.metadata.project_name
+    data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
+
+    data_contract_specification.models = data_contract_specification.models or {}
+    for model_contents in manifest.nodes.values():
+        # Only intressted in processing models.
+        if model_contents.resource_type not in resource_types:
+            continue
+
+        # To allow args stored in dbt_models to filter relevant models.
+        # If dbt_models is empty, use all models.
+        if dbt_nodes and model_contents.name not in dbt_nodes:
             continue

         dc_model = Model(
-            description=
-            tags=
-            fields=create_fields(
+            description=model_contents.description,
+            tags=model_contents.tags,
+            fields=create_fields(columns=model_contents.columns),
         )

-        data_contract_specification.models[
+        data_contract_specification.models[model_contents.name] = dc_model

     return data_contract_specification


-def create_fields(columns:
-    fields = {
+def create_fields(columns: dict[str, ColumnInfo]) -> dict[str, Field]:
+    fields = {
+        column.name: Field(
+            description=column.description,
+            type=column.data_type if column.data_type else "",
+            tags=column.tags,
         )
+        for column in columns.values()
+    }

     return fields
-
-
-def read_dbt_manifest(manifest_path: str):
-    with open(manifest_path, "r", encoding="utf-8") as f:
-        manifest = json.load(f)
-        return {"info": manifest.get("metadata"), "models": create_manifest_models(manifest)}
-
-
-def create_manifest_models(manifest: dict) -> List:
-    models = []
-    nodes = manifest.get("nodes")
-
-    for node in nodes.values():
-        if node["resource_type"] != "model":
-            continue
-
-        models.append(DbtModel(node))
-    return models
-
-
-class DbtColumn:
-    name: str
-    description: str
-    data_type: str
-    meta: dict
-    tags: List
-
-    def __init__(self, node_column: dict):
-        self.name = node_column.get("name")
-        self.description = node_column.get("description")
-        self.data_type = node_column.get("data_type")
-        self.meta = node_column.get("meta", {})
-        self.tags = node_column.get("tags", [])
-
-    def __repr__(self) -> str:
-        return self.name
-
-
-class DbtModel:
-    name: str
-    database: str
-    schema: str
-    description: str
-    unique_id: str
-    tags: List
-
-    def __init__(self, node: dict):
-        self.name = node.get("name")
-        self.database = node.get("database")
-        self.schema = node.get("schema")
-        self.description = node.get("description")
-        self.display_name = node.get("display_name")
-        self.unique_id = node.get("unique_id")
-        self.columns = []
-        self.tags = node.get("tags")
-        if node.get("columns"):
-            self.add_columns(node.get("columns").values())
-
-    def add_columns(self, model_columns: List):
-        for column in model_columns:
-            self.columns.append(DbtColumn(column))
-
-    def __repr__(self) -> str:
-        return self.name
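The rewrite above replaces the hand-rolled DbtModel/DbtColumn wrappers with the typed dbt artifacts (Manifest, ColumnInfo). Below is a minimal, illustrative sketch of how the new entry points could be exercised directly; it is not part of the package. It assumes dbt-core is installed, that a `target/manifest.json` was produced by a prior dbt run, and that a default-constructed DataContractSpecification is accepted; the model name and prints are examples only.

```python
# Illustrative sketch (not package code): exercising the new typed dbt import path.
from datacontract.imports.dbt_importer import import_dbt_manifest, read_dbt_manifest
from datacontract.model.data_contract_specification import DataContractSpecification

# Parse the compiled manifest into a typed dbt Manifest object.
manifest = read_dbt_manifest(manifest_path="target/manifest.json")

spec = import_dbt_manifest(
    data_contract_specification=DataContractSpecification(),
    manifest=manifest,
    dbt_nodes=["my_model"],     # keep only this model; [] keeps all nodes
    resource_types=["model"],   # skip seeds, snapshots, tests, ...
)

for name, model in spec.models.items():
    # Each dbt column becomes a Field with description, type, and tags.
    print(name, list(model.fields))
```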
datacontract/imports/glue_importer.py
CHANGED

@@ -1,6 +1,6 @@
 import boto3
-from typing import List
-
+from typing import List, Dict, Generator
+import re
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
@@ -13,7 +13,7 @@ from datacontract.model.data_contract_specification import (
 class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_glue(data_contract_specification, source, import_args.get("glue_table"))


@@ -39,7 +39,7 @@ def get_glue_database(database_name: str):

     return (
         response["Database"]["CatalogId"],
-        response["Database"].get("LocationUri"
+        response["Database"].get("LocationUri"),
     )


@@ -75,7 +75,7 @@ def get_glue_tables(database_name: str) -> List[str]:
     return table_names


-def get_glue_table_schema(database_name: str, table_name: str):
+def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
     """Get the schema of a Glue table.

     Args:
@@ -93,11 +93,11 @@ def get_glue_table_schema(database_name: str, table_name: str):
         response = glue.get_table(DatabaseName=database_name, Name=table_name)
     except glue.exceptions.EntityNotFoundException:
         print(f"Table {table_name} not found in database {database_name}.")
-        return
+        return []
     except Exception as e:
         # todo catch all
         print(f"Error: {e}")
-        return
+        return []

     table_schema = response["Table"]["StorageDescriptor"]["Columns"]

@@ -109,10 +109,9 @@ def get_glue_table_schema(database_name: str, table_name: str):
                 "Name": pk["Name"],
                 "Type": pk["Type"],
                 "Hive": True,
-                "Comment": "
+                "Comment": pk.get("Comment"),
             }
         )
-
     return table_schema


@@ -120,7 +119,7 @@ def import_glue(
     data_contract_specification: DataContractSpecification,
     source: str,
     table_names: List[str],
-):
+) -> DataContractSpecification:
     """Import the schema of a Glue database.

     Args:
@@ -140,8 +139,13 @@ def import_glue(
     if table_names is None:
         table_names = get_glue_tables(source)

+    server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+    if location_uri:
+        server_kwargs["location"] = location_uri
+
     data_contract_specification.servers = {
-        "production": Server(
+        "production": Server(**server_kwargs),
     }

     for table_name in table_names:
@@ -161,12 +165,6 @@ def import_glue(
             field.description = column.get("Comment")
             fields[column["Name"]] = field

-            if "decimal" in column["Type"]:
-                # Extract precision and scale from the string
-                perc_scale = column["Type"][8:-1].split(",")
-                field.precision = int(perc_scale[0])
-                field.scale = int(perc_scale[1])
-
         data_contract_specification.models[table_name] = Model(
             type="table",
             fields=fields,
@@ -186,27 +184,43 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
+    # Example: array<string>
+    if dtype.startswith("array"):
+        field.type = "array"
+        field.items = create_typed_field(dtype[6:-1])
+    # Example: struct<field1:float,field2:string>
+    elif dtype.startswith("struct"):
+        field.type = "struct"
+        for f in split_struct(dtype[7:-1]):
+            field_name, field_key = f.split(":", 1)
+            field.fields[field_name] = create_typed_field(field_key)
+    # Example: map<string,int>
+    elif dtype.startswith("map"):
+        field.type = "map"
+        map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+        if map_match:
+            key_type = map_match.group(1)
+            value_type = map_match.group(2)
             field.keys = create_typed_field(key_type)
             field.values = create_typed_field(value_type)
+    # Example: decimal(38, 6) or decimal
+    elif dtype.startswith("decimal"):
+        field.type = "decimal"
+        decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+        if decimal_match:  # if precision specified
+            field.precision = int(decimal_match.group(1))
+            field.scale = int(decimal_match.group(2))
+    # Example: varchar(255) or varchar
+    elif dtype.startswith("varchar"):
+        field.type = "varchar"
+        if len(dtype) > 7:
+            field.maxLength = int(dtype[8:-1])
     else:
         field.type = map_type_from_sql(dtype)
     return field


-def split_fields(s: str):
+def split_fields(s: str) -> Generator[str, None, None]:
     """Split a string of fields considering nested structures.

     Args:
@@ -253,30 +267,20 @@ def map_type_from_sql(sql_type: str) -> str:
         return None

     sql_type = sql_type.lower()
-        return "double"
-    if sql_type.startswith("boolean"):
-        return "boolean"
-    if sql_type.startswith("timestamp"):
-        return "timestamp"
-    if sql_type.startswith("date"):
-        return "date"
-    if sql_type.startswith("decimal"):
-        return "decimal"
-    return "variant"
+
+    type_mapping = {
+        "string": "string",
+        "int": "int",
+        "bigint": "bigint",
+        "float": "float",
+        "double": "double",
+        "boolean": "boolean",
+        "timestamp": "timestamp",
+        "date": "date",
+    }
+
+    for prefix, mapped_type in type_mapping.items():
+        if sql_type.startswith(prefix):
+            return mapped_type
+
+    return "unknown"
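With these changes, the Glue importer parses parameterized and nested Hive type strings instead of only extracting decimal precision. The sketch below is illustrative (not package code) and simply calls `create_typed_field` the way the importer does internally; the printed attributes mirror the Field properties set in the diff above, and the expected outputs in the comments follow from that logic.

```python
# Illustrative sketch (not package code): how the reworked create_typed_field
# handles parameterized and nested Glue/Hive types.
from datacontract.imports.glue_importer import create_typed_field

dec = create_typed_field("decimal(38, 6)")
print(dec.type, dec.precision, dec.scale)      # decimal 38 6

vc = create_typed_field("varchar(255)")
print(vc.type, vc.maxLength)                   # varchar 255

arr = create_typed_field("array<string>")
print(arr.type, arr.items.type)                # array string

m = create_typed_field("map<string,int>")
print(m.type, m.keys.type, m.values.type)      # map string int
```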
datacontract/imports/importer.py
CHANGED
@@ -14,7 +14,7 @@ class Importer(ABC):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) ->
+    ) -> DataContractSpecification:
         pass


@@ -22,6 +22,7 @@ class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
     dbt = "dbt"
+    dbml = "dbml"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
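The abstract `import_source` method is now fully annotated to return a DataContractSpecification. As an illustration only, a third-party importer conforming to that signature could look like the sketch below; the CsvHeaderImporter class and its behaviour are invented for this example and are not part of the package.

```python
# Illustrative sketch (not package code): a custom importer that satisfies the
# now fully annotated Importer.import_source signature.
from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model


class CsvHeaderImporter(Importer):  # hypothetical example importer
    def import_source(
        self,
        data_contract_specification: DataContractSpecification,
        source: str,
        import_args: dict,
    ) -> DataContractSpecification:
        # Read the header line of a CSV file and map every column to a string field.
        with open(source, "r", encoding="utf-8") as f:
            header = f.readline().strip().split(",")
        data_contract_specification.models["csv_table"] = Model(
            type="table",
            fields={name: Field(type="string") for name in header},
        )
        return data_contract_specification
```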
datacontract/imports/importer_factory.py
CHANGED

@@ -88,3 +88,8 @@ importer_factory.register_lazy_importer(
 importer_factory.register_lazy_importer(
     name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbml,
+    module_path="datacontract.imports.dbml_importer",
+    class_name="DBMLImporter",
+)
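The new dbml importer is registered lazily, so its module is only imported when that format is actually requested. The snippet below is a generic sketch of how such lazy registration is commonly implemented with importlib; it is not the package's own factory code, and the class and method bodies are illustrative assumptions.

```python
# Generic sketch of lazy importer registration (illustrative, not the package's
# factory implementation): the target module is imported only on first use.
import importlib
from typing import Callable, Dict


class LazyImporterFactory:
    def __init__(self) -> None:
        self._factories: Dict[str, Callable[[], object]] = {}

    def register_lazy_importer(self, name: str, module_path: str, class_name: str) -> None:
        def load() -> object:
            module = importlib.import_module(module_path)  # deferred import
            return getattr(module, class_name)()
        self._factories[name] = load

    def create(self, name: str) -> object:
        return self._factories[name]()


factory = LazyImporterFactory()
factory.register_lazy_importer(
    name="dbml",
    module_path="datacontract.imports.dbml_importer",
    class_name="DBMLImporter",
)
# The dbml importer module would only be imported at this point:
# importer = factory.create("dbml")
```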
datacontract/imports/odcs_importer.py
CHANGED

@@ -46,7 +46,7 @@ DATACONTRACT_TYPES = [
 class OdcsImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_odcs(data_contract_specification, source)


datacontract/imports/spark_importer.py
CHANGED

@@ -14,7 +14,7 @@ class SparkImporter(Importer):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) ->
+    ) -> DataContractSpecification:
         """
         Imports data from a Spark source into the data contract specification.

@@ -63,12 +63,12 @@ def import_from_spark_df(df: DataFrame) -> Model:
     schema = df.schema

     for field in schema:
-        model.fields[field.name] =
+        model.fields[field.name] = _field_from_struct_type(field)

     return model


-def
+def _field_from_struct_type(spark_field: types.StructField) -> Field:
     """
     Converts a Spark StructField into a Field object for the data contract.

@@ -76,18 +76,37 @@ def _field_from_spark(spark_field: types.StructField) -> Field:
         spark_field: The Spark StructField to convert.

     Returns:
-        Field: The
+        Field: The generated Field object.
     """
-    field_type = _data_type_from_spark(spark_field.dataType)
     field = Field()
-    field.type = field_type
     field.required = not spark_field.nullable
+    field.description = spark_field.metadata.get("comment")

-        field.items = _field_from_spark(spark_field.dataType.elementType)
+    return _type_from_data_type(field, spark_field.dataType)

+
+def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+    """
+    Maps Spark data types to the Data Contract type system and updates the field.
+
+    Args:
+        field: The Field object to update.
+        spark_type: The Spark data type to map.
+
+    Returns:
+        Field: The updated Field object.
+    """
+    field.type = _data_type_from_spark(spark_type)
+
+    if field.type == "array":
+        field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+    elif field.type == "map":
+        field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+        field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+    elif field.type == "struct":
+        field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}

     return field

@@ -104,7 +123,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
     """
     if isinstance(spark_type, types.StringType):
         return "string"
-    elif isinstance(spark_type, types.IntegerType):
+    elif isinstance(spark_type, (types.IntegerType, types.ShortType)):
         return "integer"
     elif isinstance(spark_type, types.LongType):
         return "long"
@@ -116,6 +135,8 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "struct"
     elif isinstance(spark_type, types.ArrayType):
         return "array"
+    elif isinstance(spark_type, types.MapType):
+        return "map"
     elif isinstance(spark_type, types.TimestampType):
         return "timestamp"
     elif isinstance(spark_type, types.TimestampNTZType):
@@ -130,5 +151,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "decimal"
     elif isinstance(spark_type, types.NullType):
         return "null"
+    elif isinstance(spark_type, types.VarcharType):
+        return "varchar"
     else:
         raise ValueError(f"Unsupported Spark type: {spark_type}")
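The Spark importer now recurses into array, map, and struct element types and carries nullability through to the nested fields. The sketch below is illustrative only (not package code); it calls the private `_field_from_struct_type` directly for demonstration and needs nothing beyond pyspark's type objects, so no SparkSession is required. The expected outputs in the comments follow from the conversion logic in the diff above.

```python
# Illustrative sketch (not package code): feeding a nested Spark schema
# through the recursive field conversion.
from pyspark.sql import types
from datacontract.imports.spark_importer import _field_from_struct_type

spark_field = types.StructField(
    "order",
    types.StructType([
        types.StructField("id", types.StringType(), nullable=False),
        types.StructField("tags", types.ArrayType(types.StringType(), containsNull=True)),
        types.StructField("attributes", types.MapType(types.StringType(), types.IntegerType())),
    ]),
    nullable=True,
)

field = _field_from_struct_type(spark_field)
print(field.type)                               # struct
print(field.fields["id"].required)              # True (nullable=False)
print(field.fields["tags"].items.type)          # string
print(field.fields["attributes"].values.type)   # integer
```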
datacontract/imports/sql_importer.py
CHANGED

@@ -7,7 +7,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 class SqlImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) ->
+    ) -> DataContractSpecification:
         return import_sql(data_contract_specification, self.import_format, source)

