datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0

datacontract/imports/sql_importer.py
@@ -1,37 +1,76 @@
-
+import logging
+import os

-
-
+import sqlglot
+from sqlglot.dialects.dialect import Dialects

+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
+from datacontract.model.exceptions import DataContractException
+from datacontract.model.run import ResultEnum

-
-
-
+
+class SqlImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_sql(data_contract_specification, self.import_format, source, import_args)
+
+
+def import_sql(
+    data_contract_specification: DataContractSpecification, format: str, source: str, import_args: dict = None
+) -> DataContractSpecification:
+    sql = read_file(source)
+
+    dialect = to_dialect(import_args)
+
+    try:
+        parsed = sqlglot.parse_one(sql=sql, read=dialect)
+    except Exception as e:
+        logging.error(f"Error parsing SQL: {str(e)}")
+        raise DataContractException(
+            type="import",
+            name=f"Reading source from {source}",
+            reason=f"Error parsing SQL: {str(e)}",
+            engine="datacontract",
+            result=ResultEnum.error,
+        )
+
+    server_type: str | None = to_server_type(source, dialect)
+    if server_type is not None:
+        data_contract_specification.servers[server_type] = Server(type=server_type)
+
+    tables = parsed.find_all(sqlglot.expressions.Table)

     for table in tables:
         if data_contract_specification.models is None:
             data_contract_specification.models = {}

-        table_name = table
+        table_name = table.this.name

         fields = {}
-        for column in
+        for column in parsed.find_all(sqlglot.exp.ColumnDef):
+            if column.parent.this.name != table_name:
+                continue
+
             field = Field()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            col_name = column.this.name
+            col_type = to_col_type(column, dialect)
+            field.type = map_type_from_sql(col_type)
+            col_description = get_description(column)
+            field.description = col_description
+            field.maxLength = get_max_length(column)
+            precision, scale = get_precision_scale(column)
+            field.precision = precision
+            field.scale = scale
+            field.primaryKey = get_primary_key(column)
+            field.required = column.find(sqlglot.exp.NotNullColumnConstraint) is not None or None
+            physical_type_key = to_physical_type_key(dialect)
+            field.config = {
+                physical_type_key: col_type,
+            }
+
+            fields[col_name] = field

         data_contract_specification.models[table_name] = Model(
             type="table",
@@ -41,23 +80,223 @@ def import_sql(data_contract_specification: DataContractSpecification, format: s
     return data_contract_specification


-def
+def get_primary_key(column) -> bool | None:
+    if column.find(sqlglot.exp.PrimaryKeyColumnConstraint) is not None:
+        return True
+    if column.find(sqlglot.exp.PrimaryKey) is not None:
+        return True
+    return None
+
+
+def to_dialect(import_args: dict) -> Dialects | None:
+    if import_args is None:
+        return None
+    if "dialect" not in import_args:
+        return None
+    dialect = import_args.get("dialect")
+    if dialect is None:
+        return None
+    if dialect == "sqlserver":
+        return Dialects.TSQL
+    if dialect.upper() in Dialects.__members__:
+        return Dialects[dialect.upper()]
+    if dialect == "sqlserver":
+        return Dialects.TSQL
+    return None
+
+
+def to_physical_type_key(dialect: Dialects | str | None) -> str:
+    dialect_map = {
+        Dialects.TSQL: "sqlserverType",
+        Dialects.POSTGRES: "postgresType",
+        Dialects.BIGQUERY: "bigqueryType",
+        Dialects.SNOWFLAKE: "snowflakeType",
+        Dialects.REDSHIFT: "redshiftType",
+        Dialects.ORACLE: "oracleType",
+        Dialects.MYSQL: "mysqlType",
+        Dialects.DATABRICKS: "databricksType",
+    }
+    if isinstance(dialect, str):
+        dialect = Dialects[dialect.upper()] if dialect.upper() in Dialects.__members__ else None
+    return dialect_map.get(dialect, "physicalType")
+
+
+def to_server_type(source, dialect: Dialects | None) -> str | None:
+    if dialect is None:
+        return None
+    dialect_map = {
+        Dialects.TSQL: "sqlserver",
+        Dialects.POSTGRES: "postgres",
+        Dialects.BIGQUERY: "bigquery",
+        Dialects.SNOWFLAKE: "snowflake",
+        Dialects.REDSHIFT: "redshift",
+        Dialects.ORACLE: "oracle",
+        Dialects.MYSQL: "mysql",
+        Dialects.DATABRICKS: "databricks",
+    }
+    return dialect_map.get(dialect, None)
+
+
+def to_col_type(column, dialect):
+    col_type_kind = column.args["kind"]
+    if col_type_kind is None:
+        return None
+
+    return col_type_kind.sql(dialect)
+
+
+def to_col_type_normalized(column):
+    col_type = column.args["kind"].this.name
+    if col_type is None:
+        return None
+    return col_type.lower()
+
+
+def get_description(column: sqlglot.expressions.ColumnDef) -> str | None:
+    if column.comments is None:
+        return None
+    return " ".join(comment.strip() for comment in column.comments)
+
+
+def get_max_length(column: sqlglot.expressions.ColumnDef) -> int | None:
+    col_type = to_col_type_normalized(column)
+    if col_type is None:
+        return None
+    if col_type not in ["varchar", "char", "nvarchar", "nchar"]:
+        return None
+    col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
+    max_length_str = None
+    if len(col_params) == 0:
+        return None
+    if len(col_params) == 1:
+        max_length_str = col_params[0].name
+    if len(col_params) == 2:
+        max_length_str = col_params[1].name
+    if max_length_str is not None:
+        return int(max_length_str) if max_length_str.isdigit() else None
+
+
+def get_precision_scale(column):
+    col_type = to_col_type_normalized(column)
+    if col_type is None:
+        return None, None
+    if col_type not in ["decimal", "numeric", "float", "number"]:
+        return None, None
+    col_params = list(column.args["kind"].find_all(sqlglot.expressions.DataTypeParam))
+    if len(col_params) == 0:
+        return None, None
+    if len(col_params) == 1:
+        if not col_params[0].name.isdigit():
+            return None, None
+        precision = int(col_params[0].name)
+        scale = 0
+        return precision, scale
+    if len(col_params) == 2:
+        if not col_params[0].name.isdigit() or not col_params[1].name.isdigit():
+            return None, None
+        precision = int(col_params[0].name)
+        scale = int(col_params[1].name)
+        return precision, scale
+    return None, None
+
+
+def map_type_from_sql(sql_type: str) -> str | None:
     if sql_type is None:
         return None

-
-
-    if
+    sql_type_normed = sql_type.lower().strip()
+
+    if sql_type_normed.startswith("varchar"):
         return "string"
-
-        return "
-    elif
-        return "
-    elif
+    elif sql_type_normed.startswith("char"):
+        return "string"
+    elif sql_type_normed.startswith("string"):
+        return "string"
+    elif sql_type_normed.startswith("nchar"):
+        return "string"
+    elif sql_type_normed.startswith("text"):
+        return "string"
+    elif sql_type_normed.startswith("nvarchar"):
+        return "string"
+    elif sql_type_normed.startswith("ntext"):
+        return "string"
+    elif sql_type_normed.startswith("int") and not sql_type_normed.startswith("interval"):
+        return "int"
+    elif sql_type_normed.startswith("bigint"):
+        return "long"
+    elif sql_type_normed.startswith("tinyint"):
+        return "int"
+    elif sql_type_normed.startswith("smallint"):
+        return "int"
+    elif sql_type_normed.startswith("float"):
         return "float"
-    elif
+    elif sql_type_normed.startswith("double"):
+        return "double"
+    elif sql_type_normed.startswith("decimal"):
+        return "decimal"
+    elif sql_type_normed.startswith("numeric"):
+        return "decimal"
+    elif sql_type_normed.startswith("bool"):
+        return "boolean"
+    elif sql_type_normed.startswith("bit"):
         return "boolean"
-    elif
-        return "
+    elif sql_type_normed.startswith("binary"):
+        return "bytes"
+    elif sql_type_normed.startswith("varbinary"):
+        return "bytes"
+    elif sql_type_normed.startswith("raw"):
+        return "bytes"
+    elif sql_type_normed == "blob" or sql_type_normed == "bfile":
+        return "bytes"
+    elif sql_type_normed == "date":
+        return "date"
+    elif sql_type_normed == "time":
+        return "string"
+    elif sql_type_normed.startswith("timestamp"):
+        return map_timestamp(sql_type_normed)
+    elif sql_type_normed == "datetime" or sql_type_normed == "datetime2":
+        return "timestamp_ntz"
+    elif sql_type_normed == "smalldatetime":
+        return "timestamp_ntz"
+    elif sql_type_normed == "datetimeoffset":
+        return "timestamp_tz"
+    elif sql_type_normed == "uniqueidentifier":  # tsql
+        return "string"
+    elif sql_type_normed == "json":
+        return "string"
+    elif sql_type_normed == "xml":  # tsql
+        return "string"
+    elif sql_type_normed.startswith("number"):
+        return "number"
+    elif sql_type_normed == "clob" or sql_type_normed == "nclob":
+        return "text"
     else:
         return "variant"
+
+
+def map_timestamp(timestamp_type: str) -> str:
+    match timestamp_type:
+        case "timestamp" | "timestampntz" | "timestamp_ntz":
+            return "timestamp_ntz"
+        case "timestamptz" | "timestamp_tz" | "timestamp with time zone":
+            return "timestamp_tz"
+        case localTimezone if localTimezone.startswith("timestampltz"):
+            return "timestamp_tz"
+        case timezoneWrittenOut if timezoneWrittenOut.endswith("time zone"):
+            return "timestamp_tz"
+        case _:
+            return "timestamp"
+
+
+def read_file(path):
+    if not os.path.exists(path):
+        raise DataContractException(
+            type="import",
+            name=f"Reading source from {path}",
+            reason=f"The file '{path}' does not exist.",
+            engine="datacontract",
+            result=ResultEnum.error,
+        )
+    with open(path, "r") as file:
+        file_content = file.read()
+    return file_content
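
The rewritten importer above parses CREATE TABLE statements with sqlglot, keyed by an optional `dialect` import argument, and maps each column's physical SQL type onto a logical data contract type. Below is a minimal sketch of how this could be exercised, assuming a locally written DDL file; the file name, table definition, and commented output values are illustrative, while `import_sql` and `map_type_from_sql` come from the diff above.

```python
from datacontract.imports.sql_importer import import_sql, map_type_from_sql
from datacontract.model.data_contract_specification import DataContractSpecification

# Hypothetical DDL file used only for this sketch.
with open("orders.sql", "w") as f:
    f.write("""
    CREATE TABLE orders (
        order_id INT PRIMARY KEY,
        order_total DECIMAL(10, 2) NOT NULL,
        note NVARCHAR(255)
    );
    """)

# "tsql" is resolved via to_dialect(); to_server_type() additionally
# registers a "sqlserver" server entry on the specification.
spec = import_sql(DataContractSpecification(), "sql", "orders.sql", {"dialect": "tsql"})
print(spec.models["orders"].fields["order_total"].type)  # expected: "decimal"

# The type mapping is also usable on its own:
print(map_type_from_sql("NVARCHAR(255)"))   # "string"
print(map_type_from_sql("datetimeoffset"))  # "timestamp_tz"
```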

datacontract/imports/unity_importer.py
@@ -0,0 +1,219 @@
+import json
+import os
+from typing import List
+
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.catalog import ColumnInfo, TableInfo
+from open_data_contract_standard.model import OpenDataContractStandard
+
+from datacontract.imports.importer import Importer
+from datacontract.imports.sql_importer import map_type_from_sql, to_physical_type_key
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Server
+from datacontract.model.exceptions import DataContractException
+
+
+class UnityImporter(Importer):
+    """
+    UnityImporter class for importing data contract specifications from Unity Catalog.
+    """
+
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification | OpenDataContractStandard,
+        source: str,
+        import_args: dict,
+    ) -> DataContractSpecification | OpenDataContractStandard:
+        """
+        Import data contract specification from a source.
+
+        :param data_contract_specification: The data contract specification to be imported.
+        :type data_contract_specification: DataContractSpecification
+        :param source: The source from which to import the data contract specification.
+        :type source: str
+        :param import_args: Additional arguments for the import process.
+        :type import_args: dict
+        :return: The imported data contract specification.
+        :rtype: DataContractSpecification
+        """
+        if source is not None:
+            data_contract_specification = import_unity_from_json(data_contract_specification, source)
+        else:
+            unity_table_full_name_list = import_args.get("unity_table_full_name")
+            data_contract_specification = import_unity_from_api(data_contract_specification, unity_table_full_name_list)
+        return data_contract_specification
+
+
+def import_unity_from_json(
+    data_contract_specification: DataContractSpecification | OpenDataContractStandard, source: str
+) -> DataContractSpecification | OpenDataContractStandard:
+    """
+    Import data contract specification from a JSON file.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param source: The path to the JSON file.
+    :type source: str
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error parsing the JSON file.
+    """
+    try:
+        with open(source, "r") as file:
+            json_contents = json.loads(file.read())
+        unity_schema = TableInfo.from_dict(json_contents)
+    except json.JSONDecodeError as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse unity schema",
+            reason=f"Failed to parse unity schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+    return convert_unity_schema(data_contract_specification, unity_schema)
+
+
+def import_unity_from_api(
+    data_contract_specification: DataContractSpecification, unity_table_full_name_list: List[str] = None
+) -> DataContractSpecification:
+    """
+    Import data contract specification from Unity Catalog API.
+
+    :param data_contract_specification: The data contract specification to be imported.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_table_full_name_list: The full name of the Unity table.
+    :type unity_table_full_name_list: list[str]
+    :return: The imported data contract specification.
+    :rtype: DataContractSpecification
+    :raises DataContractException: If there is an error retrieving the schema from the API.
+    """
+    try:
+        # print(f"Retrieving Unity Catalog schema for table: {unity_table_full_name}")
+        profile = os.getenv("DATACONTRACT_DATABRICKS_PROFILE")
+        host, token = os.getenv("DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"), os.getenv("DATACONTRACT_DATABRICKS_TOKEN")
+        # print(f"Databricks host: {host}, token: {'***' if token else 'not set'}")
+        exception = DataContractException(
+            type="configuration",
+            name="Databricks configuration",
+            reason="",
+            engine="datacontract",
+        )
+        if not profile and not host and not token:
+            reason = "Either DATACONTRACT_DATABRICKS_PROFILE or both DATACONTRACT_DATABRICKS_SERVER_HOSTNAME and DATACONTRACT_DATABRICKS_TOKEN environment variables must be set"
+            exception.reason = reason
+            raise exception
+        if token and not host:
+            reason = "DATACONTRACT_DATABRICKS_SERVER_HOSTNAME environment variable is not set"
+            exception.reason = reason
+            raise exception
+        if host and not token:
+            reason = "DATACONTRACT_DATABRICKS_TOKEN environment variable is not set"
+            exception.reason = reason
+            raise exception
+        workspace_client = WorkspaceClient(profile=profile) if profile else WorkspaceClient(host=host, token=token)
+    except Exception as e:
+        raise DataContractException(
+            type="schema",
+            name="Retrieve unity catalog schema",
+            reason="Failed to connect to unity catalog schema",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    for unity_table_full_name in unity_table_full_name_list:
+        try:
+            unity_schema: TableInfo = workspace_client.tables.get(unity_table_full_name)
+        except Exception as e:
+            raise DataContractException(
+                type="schema",
+                name="Retrieve unity catalog schema",
+                reason=f"Unity table {unity_table_full_name} not found",
+                engine="datacontract",
+                original_exception=e,
+            )
+        data_contract_specification = convert_unity_schema(data_contract_specification, unity_schema)
+
+    return data_contract_specification
+
+
+def convert_unity_schema(
+    data_contract_specification: DataContractSpecification | OpenDataContractStandard, unity_schema: TableInfo
+) -> DataContractSpecification | OpenDataContractStandard:
+    """
+    Convert Unity schema to data contract specification.
+
+    :param data_contract_specification: The data contract specification to be converted.
+    :type data_contract_specification: DataContractSpecification
+    :param unity_schema: The Unity schema to be converted.
+    :type unity_schema: TableInfo
+    :return: The converted data contract specification.
+    :rtype: DataContractSpecification
+    """
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    if data_contract_specification.servers is None:
+        data_contract_specification.servers = {}
+
+    # Configure databricks server with catalog and schema from Unity table info
+    schema_name = unity_schema.schema_name
+    catalog_name = unity_schema.catalog_name
+    if catalog_name and schema_name:
+        server_name = "myserver"  # Default server name
+
+        data_contract_specification.servers[server_name] = Server(
+            type="databricks",
+            catalog=catalog_name,
+            schema=schema_name,
+        )
+
+    fields = import_table_fields(unity_schema.columns)
+
+    table_id = unity_schema.name or unity_schema.table_id
+
+    data_contract_specification.models[table_id] = Model(fields=fields, type="table")
+
+    if unity_schema.name:
+        data_contract_specification.models[table_id].title = unity_schema.name
+
+    if unity_schema.comment:
+        data_contract_specification.models[table_id].description = unity_schema.comment
+
+    return data_contract_specification
+
+
+def import_table_fields(columns: List[ColumnInfo]) -> dict[str, Field]:
+    """
+    Import table fields from Unity schema columns.
+
+    Here we are first converting the `ColumnInfo.type_json` to a Spark StructField object
+    so we can leave the complexity of the Spark field types to the Spark JSON schema parser,
+    then re-use the logic in `datacontract.imports.spark_importer` to convert the StructField
+    into a Field object.
+
+    :param columns: The list of Unity schema columns.
+    :type columns: List[ColumnInfo]
+    :return: A dictionary of imported fields.
+    :rtype: dict[str, Field]
+    """
+    imported_fields = {}
+
+    for column in columns:
+        imported_fields[column.name] = _to_field(column)
+
+    return imported_fields
+
+
+def _to_field(column: ColumnInfo) -> Field:
+    field = Field()
+    # The second condition evaluates for complex types (e.g. variant)
+    if column.type_name is not None or (column.type_name is None and column.type_text is not None):
+        sql_type = str(column.type_text)
+        field.type = map_type_from_sql(sql_type)
+        physical_type_key = to_physical_type_key("databricks")
+        field.config = {
+            physical_type_key: sql_type,
+        }
+    field.required = column.nullable is None or not column.nullable
+    field.description = column.comment if column.comment else None
+
+    return field
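
The new `UnityImporter` resolves credentials from environment variables before calling the Databricks workspace API, one table at a time. Below is a sketch of the API path, assuming the `databricks-sdk` package is installed; the profile name and table name are placeholders, while the function and environment variables come from the diff above.

```python
import os

from datacontract.imports.unity_importer import import_unity_from_api
from datacontract.model.data_contract_specification import DataContractSpecification

# Authenticate via a Databricks CLI profile (placeholder name) ...
os.environ["DATACONTRACT_DATABRICKS_PROFILE"] = "my-workspace"
# ... or alternatively via host and token:
# os.environ["DATACONTRACT_DATABRICKS_SERVER_HOSTNAME"] = "..."
# os.environ["DATACONTRACT_DATABRICKS_TOKEN"] = "..."

spec = import_unity_from_api(
    DataContractSpecification(),
    unity_table_full_name_list=["main.sales.orders"],  # placeholder catalog.schema.table
)
print(spec.servers["myserver"].type)  # "databricks", per convert_unity_schema()
print(list(spec.models))              # e.g. ["orders"]
```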

datacontract/init/init_template.py
@@ -0,0 +1,20 @@
+import importlib.resources as resources
+import logging
+
+import requests
+
+DEFAULT_DATA_CONTRACT_INIT_TEMPLATE = "datacontract-1.2.1.init.yaml"
+
+
+def get_init_template(location: str = None) -> str:
+    if location is None:
+        logging.info("Use default bundled template " + DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
+        schemas = resources.files("datacontract")
+        template = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_INIT_TEMPLATE)
+        with template.open("r") as file:
+            return file.read()
+    elif location.startswith("http://") or location.startswith("https://"):
+        return requests.get(location).text
+    else:
+        with open(location, "r") as file:
+            return file.read()
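
`get_init_template()` resolves the template for a new data contract from one of three places: the bundled default, an HTTP(S) URL, or a local file. A short sketch of all three paths; the URL and file names are illustrative.

```python
from datacontract.init.init_template import get_init_template

bundled_yaml = get_init_template()  # bundled datacontract-1.2.1.init.yaml
remote_yaml = get_init_template("https://example.com/custom.init.yaml")  # placeholder URL
local_yaml = get_init_template("custom.init.yaml")  # placeholder local file
```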

datacontract/integration/datamesh_manager.py
@@ -0,0 +1,86 @@
+import os
+
+import requests
+
+from datacontract.model.run import Run
+
+# used to retrieve the HTML location of the published data contract or test results
+RESPONSE_HEADER_LOCATION_HTML = "location-html"
+
+
+def publish_test_results_to_datamesh_manager(run: Run, publish_url: str, ssl_verification: bool):
+    try:
+        if publish_url is None:
+            # this url supports Data Mesh Manager and Data Contract Manager
+            url = "https://api.datamesh-manager.com/api/test-results"
+        else:
+            url = publish_url
+
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
+            )
+
+        if run.dataContractId is None:
+            raise Exception("Cannot publish run results for unknown data contract ID")
+
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
+        request_body = run.model_dump_json()
+        # print("Request Body:", request_body)
+        response = requests.post(
+            url,
+            data=request_body,
+            headers=headers,
+            verify=ssl_verification,
+        )
+        # print("Status Code:", response.status_code)
+        # print("Response Body:", response.text)
+        if response.status_code != 200:
+            run.log_error(f"Error publishing test results to Data Mesh Manager: {response.text}")
+            return
+        run.log_info("Published test results successfully")
+
+        location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
+        if location_html is not None and len(location_html) > 0:
+            print(f"🚀 Open {location_html}")
+
+    except Exception as e:
+        run.log_error(f"Failed publishing test results. Error: {str(e)}")
+
+
+def publish_data_contract_to_datamesh_manager(data_contract_dict: dict, ssl_verification: bool):
+    try:
+        api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+        host = "https://api.datamesh-manager.com"
+        if os.getenv("DATAMESH_MANAGER_HOST") is not None:
+            host = os.getenv("DATAMESH_MANAGER_HOST")
+        if api_key is None:
+            api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+        if api_key is None:
+            raise Exception(
+                "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
+            )
+        headers = {"Content-Type": "application/json", "x-api-key": api_key}
+        id = data_contract_dict["id"]
+        url = f"{host}/api/datacontracts/{id}"
+        response = requests.put(
+            url=url,
+            json=data_contract_dict,
+            headers=headers,
+            verify=ssl_verification,
+        )
+        if response.status_code != 200:
+            print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+            exit(1)
+
+        print("✅ Published data contract successfully")
+
+        location_html = response.headers.get(RESPONSE_HEADER_LOCATION_HTML)
+        if location_html is not None and len(location_html) > 0:
+            print(f"🚀 Open {location_html}")
+
+    except Exception as e:
+        print(f"Failed publishing data contract. Error: {str(e)}")
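
Both publishers authenticate with an `x-api-key` header resolved from `DATAMESH_MANAGER_API_KEY`, falling back to `DATACONTRACT_MANAGER_API_KEY`. A minimal sketch of publishing a contract, assuming a valid key; the key value and contract dict are placeholders, while the function and environment variable come from the diff above.

```python
import os

from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager

os.environ["DATAMESH_MANAGER_API_KEY"] = "<your-api-key>"  # placeholder

data_contract = {
    "id": "orders-latest",  # used to build PUT {host}/api/datacontracts/{id}
    "dataContractSpecification": "1.1.0",
    "info": {"title": "Orders", "version": "1.0.0"},
}
publish_data_contract_to_datamesh_manager(data_contract, ssl_verification=True)
```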