datacontract-cli 0.10.23__py3-none-any.whl → 0.10.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +12 -5
- datacontract/catalog/catalog.py +5 -3
- datacontract/cli.py +119 -13
- datacontract/data_contract.py +145 -67
- datacontract/engines/data_contract_checks.py +366 -60
- datacontract/engines/data_contract_test.py +50 -4
- datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
- datacontract/engines/soda/check_soda_execute.py +27 -3
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/duckdb_connection.py +65 -6
- datacontract/engines/soda/connections/kafka.py +4 -2
- datacontract/engines/soda/connections/oracle.py +50 -0
- datacontract/export/avro_converter.py +20 -3
- datacontract/export/bigquery_converter.py +1 -1
- datacontract/export/dbt_converter.py +36 -7
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +3 -0
- datacontract/export/exporter_factory.py +17 -1
- datacontract/export/great_expectations_converter.py +55 -5
- datacontract/export/{html_export.py → html_exporter.py} +31 -20
- datacontract/export/markdown_converter.py +134 -5
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +193 -149
- datacontract/export/protobuf_converter.py +163 -69
- datacontract/export/rdf_converter.py +2 -2
- datacontract/export/sodacl_converter.py +9 -1
- datacontract/export/spark_converter.py +31 -4
- datacontract/export/sql_converter.py +6 -2
- datacontract/export/sql_type_converter.py +124 -8
- datacontract/imports/avro_importer.py +63 -12
- datacontract/imports/csv_importer.py +111 -57
- datacontract/imports/excel_importer.py +1112 -0
- datacontract/imports/importer.py +16 -3
- datacontract/imports/importer_factory.py +17 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/odcs_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +367 -151
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +117 -13
- datacontract/imports/sql_importer.py +32 -16
- datacontract/imports/unity_importer.py +84 -38
- datacontract/init/init_template.py +1 -1
- datacontract/integration/entropy_data.py +126 -0
- datacontract/lint/resolve.py +112 -23
- datacontract/lint/schema.py +24 -15
- datacontract/lint/urls.py +17 -3
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/odcs.py +13 -0
- datacontract/model/run.py +3 -0
- datacontract/output/junit_test_results.py +3 -3
- datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/schemas/odcs-3.1.0.schema.json +2809 -0
- datacontract/templates/datacontract.html +54 -3
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +5 -2
- datacontract/templates/partials/server.html +2 -0
- datacontract/templates/style/output.css +319 -145
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/METADATA +711 -433
- datacontract_cli-0.10.40.dist-info/RECORD +121 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info/licenses}/LICENSE +1 -1
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/integration/datamesh_manager.py +0 -72
- datacontract/lint/lint.py +0 -142
- datacontract/lint/linters/description_linter.py +0 -35
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -48
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -100
- datacontract/model/data_contract_specification.py +0 -327
- datacontract_cli-0.10.23.dist-info/RECORD +0 -113
- /datacontract/{lint/linters → output}/__init__.py +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.40.dist-info}/top_level.txt +0 -0
datacontract/imports/avro_importer.py

```diff
@@ -55,7 +55,6 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
             engine="datacontract",
             original_exception=e,
         )
-
     # type record is being used for both the table and the object types in data contract
     # -> CONSTRAINT: one table per .avsc input, all nested records are interpreted as objects
     fields = import_record_fields(avro_schema.fields)
```
```diff
@@ -92,6 +91,20 @@ def handle_config_avro_custom_properties(field: avro.schema.Field, imported_fiel
         imported_field.config["avroDefault"] = field.default
 
 
+LOGICAL_TYPE_MAPPING = {
+    "decimal": "decimal",
+    "date": "date",
+    "time-millis": "time",
+    "time-micros": "time",
+    "timestamp-millis": "timestamp_tz",
+    "timestamp-micros": "timestamp_tz",
+    "local-timestamp-micros": "timestamp_ntz",
+    "local-timestamp-millis": "timestamp_ntz",
+    "duration": "string",
+    "uuid": "string",
+}
+
+
 def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
     """
     Import Avro record fields and convert them to data contract fields.
```
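The new `LOGICAL_TYPE_MAPPING` means logical types annotated on Avro primitives now survive the import instead of collapsing to their base type. A minimal sketch of the effect, using a made-up schema (the `Order` record and its field are illustrative, not from the diff):

```python
import json

import avro.schema

# Excerpt of the mapping added above.
LOGICAL_TYPE_MAPPING = {"date": "date", "timestamp-millis": "timestamp_tz"}

# Hypothetical schema: a long annotated with a timestamp logical type.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "Order",
            "fields": [{"name": "ordered_at", "type": {"type": "long", "logicalType": "timestamp-millis"}}],
        }
    )
)

field = schema.fields[0]
logical_type = field.type.get_prop("logicalType")
# With the mapping: "timestamp_tz"; before this change the field imported as plain "long".
print(LOGICAL_TYPE_MAPPING.get(logical_type, field.type.type))
```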
```diff
@@ -117,13 +130,23 @@ def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Fi
             imported_field.fields = import_record_fields(field.type.fields)
         elif field.type.type == "union":
             imported_field.required = False
-            type = import_type_of_optional_field(field)
-            imported_field.type = type
-            if type == "record":
-                imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
-            elif type == "array":
-                imported_field.type = "array"
-                imported_field.items = import_avro_array_items(get_array_from_union_field(field))
+            # Check for enum in union first, since it needs special handling
+            enum_schema = get_enum_from_union_field(field)
+            if enum_schema:
+                imported_field.type = "string"
+                imported_field.enum = enum_schema.symbols
+                imported_field.title = enum_schema.name
+                if not imported_field.config:
+                    imported_field.config = {}
+                imported_field.config["avroType"] = "enum"
+            else:
+                type = import_type_of_optional_field(field)
+                imported_field.type = type
+                if type == "record":
+                    imported_field.fields = import_record_fields(get_record_from_union_field(field).fields)
+                elif type == "array":
+                    imported_field.type = "array"
+                    imported_field.items = import_avro_array_items(get_array_from_union_field(field))
         elif field.type.type == "array":
             imported_field.type = "array"
             imported_field.items = import_avro_array_items(field.type)
```
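An optional enum, i.e. a `["null", enum]` union, previously fell through the generic union handling; it now becomes a string field that keeps the enum's name and symbols. A sketch with an invented schema:

```python
import json

import avro.schema

# Hypothetical nullable enum field.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "Shipment",
            "fields": [
                {"name": "status", "type": ["null", {"type": "enum", "name": "Status", "symbols": ["OPEN", "SHIPPED"]}]}
            ],
        }
    )
)

field = schema.fields[0]
# Mirrors get_enum_from_union_field: scan the union branches for an enum schema.
enum_schema = next((s for s in field.type.schemas if s.type == "enum"), None)
if enum_schema:
    # Imported as: type "string", enum symbols preserved, required False.
    print(enum_schema.name, enum_schema.symbols)  # Status ['OPEN', 'SHIPPED']
```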
```diff
@@ -137,9 +160,15 @@ def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Fi
             if not imported_field.config:
                 imported_field.config = {}
             imported_field.config["avroType"] = "enum"
-        else:
-            imported_field.type = map_type_from_avro(field.type.type)
-
+        else:
+            logical_type = field.type.get_prop("logicalType")
+            if logical_type in LOGICAL_TYPE_MAPPING:
+                imported_field.type = LOGICAL_TYPE_MAPPING[logical_type]
+                if logical_type == "decimal":
+                    imported_field.precision = field.type.precision
+                    imported_field.scale = field.type.scale
+            else:
+                imported_field.type = map_type_from_avro(field.type.type)
         imported_fields[field.name] = imported_field
 
     return imported_fields
```
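For `decimal`, the importer now also copies precision and scale onto the field. A short sketch (the `Invoice` schema is illustrative):

```python
import json

import avro.schema

# Hypothetical bytes field with a decimal logical type.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "Invoice",
            "fields": [
                {"name": "amount", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}}
            ],
        }
    )
)

field = schema.fields[0]
# These two attributes are what the importer copies onto the data contract field.
print(field.type.precision, field.type.scale)  # 10 2
```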
```diff
@@ -212,7 +241,11 @@ def import_type_of_optional_field(field: avro.schema.Field) -> str:
     """
     for field_type in field.type.schemas:
         if field_type.type != "null":
-            return map_type_from_avro(field_type.type)
+            logical_type = field_type.get_prop("logicalType")
+            if logical_type and logical_type in LOGICAL_TYPE_MAPPING:
+                return LOGICAL_TYPE_MAPPING[logical_type]
+            else:
+                return map_type_from_avro(field_type.type)
     raise DataContractException(
         type="schema",
         result="failed",
```
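`import_type_of_optional_field` applies the same mapping to the non-null branch of a union, so an optional timestamp no longer degrades to `long`. Sketch (schema invented for illustration):

```python
import json

import avro.schema

LOGICAL_TYPE_MAPPING = {"timestamp-micros": "timestamp_tz"}  # excerpt

# Hypothetical optional field: union of null and a logical-typed long.
schema = avro.schema.parse(
    json.dumps(
        {
            "type": "record",
            "name": "Event",
            "fields": [
                {"name": "processed_at", "type": ["null", {"type": "long", "logicalType": "timestamp-micros"}]}
            ],
        }
    )
)

field = schema.fields[0]
for branch in field.type.schemas:  # walk the union branches, as the function does
    if branch.type != "null":
        logical_type = branch.get_prop("logicalType")
        print(LOGICAL_TYPE_MAPPING.get(logical_type, branch.type))  # timestamp_tz
```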
```diff
@@ -254,6 +287,22 @@ def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySch
     return None
 
 
+def get_enum_from_union_field(field: avro.schema.Field) -> avro.schema.EnumSchema | None:
+    """
+    Get the enum schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The enum schema if found, None otherwise.
+    """
+    for field_type in field.type.schemas:
+        if field_type.type == "enum":
+            return field_type
+    return None
+
+
 def map_type_from_avro(avro_type_str: str) -> str:
     """
     Map Avro type strings to data contract type strings.
```
```diff
@@ -276,6 +325,8 @@ def map_type_from_avro(avro_type_str: str) -> str:
         return "binary"
     elif avro_type_str == "double":
         return "double"
+    elif avro_type_str == "float":
+        return "float"
     elif avro_type_str == "int":
         return "int"
     elif avro_type_str == "long":
```
datacontract/imports/csv_importer.py

```diff
@@ -1,89 +1,143 @@
 import os
+from typing import Any, Dict, List
 
-import clevercsv
+import duckdb
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Example, Model, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server
 
 
 class CsvImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> DataContractSpecification:
-        return import_csv(data_contract_specification,
+        return import_csv(data_contract_specification, source)
 
 
-def import_csv(
-    data_contract_specification: DataContractSpecification, source: str, include_example: bool = False
-) -> DataContractSpecification:
-    # detect encoding and dialect
-    encoding = clevercsv.encoding.get_encoding(source)
-    with open(source, "r", newline="") as fp:
-        dialect = clevercsv.Sniffer().sniff(fp.read(10000))
-
-    # using auto detecting of the format and encoding
-    df = clevercsv.read_dataframe(source)
-
-    if data_contract_specification.models is None:
-        data_contract_specification.models = {}
-
+def import_csv(
+    data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False
+) -> DataContractSpecification:
     # use the file name as table name
     table_name = os.path.splitext(os.path.basename(source))[0]
 
+    # use duckdb to auto detect format, columns, etc.
+    con = duckdb.connect(database=":memory:")
+    con.sql(
+        f"""CREATE VIEW "{table_name}" AS SELECT * FROM read_csv_auto('{source}', hive_partitioning=1, auto_type_candidates = ['BOOLEAN', 'INTEGER', 'BIGINT', 'DOUBLE', 'VARCHAR']);"""
+    )
+    dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy()
+    tbl = con.table(table_name)
+
     if data_contract_specification.servers is None:
         data_contract_specification.servers = {}
 
+    delimiter = None if dialect is None else dialect["Delimiter"][0]
+
+    if dialect is not None:
+        dc_types = [map_type_from_duckdb(x["type"]) for x in dialect["Columns"][0]]
+    else:
+        dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes]
+
     data_contract_specification.servers["production"] = Server(
-        type="local", path=source, format="csv", delimiter=
+        type="local", path=source, format="csv", delimiter=delimiter
     )
 
+    rowcount = tbl.shape[0]
+
+    tallies = dict()
+    for row in tbl.describe().fetchall():
+        if row[0] not in ["count", "max", "min"]:
+            continue
+        for i in range(tbl.shape[1]):
+            tallies[(row[0], tbl.columns[i])] = row[i + 1] if row[0] != "count" else int(row[i + 1])
+
+    samples: Dict[str, List] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] not in ["BOOLEAN", "BLOB"]:
+            sql = f"""SELECT DISTINCT "{field_name}" FROM "{table_name}" WHERE "{field_name}" IS NOT NULL USING SAMPLE 5 ROWS;"""
+            samples[field_name] = [x[0] for x in con.sql(sql).fetchall()]
+
+    formats: Dict[str, str] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] == "VARCHAR":
+            sql = f"""SELECT
+                count_if("{field_name}" IS NOT NULL) as count,
+                count_if(regexp_matches("{field_name}", '^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{{2,4}}$')) as email,
+                count_if(regexp_matches("{field_name}", '^[[a-z0-9]{{8}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{12}}]')) as uuid
+                FROM "{table_name}";
+            """
+            res = con.sql(sql).fetchone()
+            if res[1] == res[0]:
+                formats[field_name] = "email"
+            elif res[2] == res[0]:
+                formats[field_name] = "uuid"
+
     fields = {}
-    for
-
-
-
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        dc_type = dc_types[i]
+
+        ## specifying "integer" rather than "bigint" looks nicer
+        if (
+            dc_type == "bigint"
+            and tallies[("max", field_name)] <= 2147483647
+            and tallies[("min", field_name)] >= -2147483648
+        ):
+            dc_type = "integer"
+
+        field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)}
+
+        if tallies[("count", field_name)] == rowcount:
+            field["required"] = True
+        if dc_type not in ["boolean", "bytes"]:
+            distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0]  # type: ignore
+            if distinct_values > 0 and distinct_values == tallies[("count", field_name)]:
+                field["unique"] = True
+        s = samples.get(field_name, None)
+        if s is not None:
+            field["examples"] = s
+        if dc_type in ["integer", "bigint", "float", "double"]:
+            field["minimum"] = tallies[("min", field_name)]
+            field["maximum"] = tallies[("max", field_name)]
+
+        fields[field_name] = field
+
+    model_examples = None
+    if include_examples:
+        model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall()
 
     data_contract_specification.models[table_name] = Model(
-        type="table",
-        description=f"Csv file with encoding {encoding}",
-        fields=fields,
+        type="table", description="Generated model of " + source, fields=fields, examples=model_examples
     )
 
-    # multiline data is not correctly handled by yaml dump
-    if include_example:
-        if data_contract_specification.examples is None:
-            data_contract_specification.examples = []
-
-        # read first 10 lines with the detected encoding
-        with open(source, "r", encoding=encoding) as csvfile:
-            lines = csvfile.readlines()[:10]
-
-        data_contract_specification.examples.append(Example(type="csv", model=table_name, data="".join(lines)))
-
     return data_contract_specification
 
 
-
+_duck_db_types = {
+    "BOOLEAN": "boolean",
+    "BLOB": "bytes",
+    "TINYINT": "integer",
+    "SMALLINT": "integer",
+    "INTEGER": "integer",
+    "BIGINT": "bigint",
+    "UTINYINT": "integer",
+    "USMALLINT": "integer",
+    "UINTEGER": "integer",
+    "UBIGINT": "bigint",
+    "FLOAT": "float",
+    "DOUBLE": "double",
+    "VARCHAR": "string",
+    "TIMESTAMP": "timestamp",
+    "DATE": "date",
+    # TODO: Add support for NULL
+}
+
+
+def map_type_from_duckdb(sql_type: None | str):
     if sql_type is None:
         return None
 
-    sql_type_normed = sql_type.lower().strip()
-
-    if sql_type_normed == "object":
-        return "string"
-    elif sql_type_normed.startswith("str"):
-        return "string"
-    elif sql_type_normed.startswith("int"):
-        return "integer"
-    elif sql_type_normed.startswith("float"):
-        return "float"
-    elif sql_type_normed.startswith("bool"):
-        return "boolean"
-    elif sql_type_normed.startswith("timestamp"):
-        return "timestamp"
-    elif sql_type_normed == "datetime64":
-        return "date"
-    elif sql_type_normed == "timedelta[ns]":
-        return "timestamp_ntz"
-    else:
-        return "variant"
+    sql_type_normed = sql_type.upper().strip()
+    return _duck_db_types.get(sql_type_normed, "string")
```