datacontract-cli 0.10.0__py3-none-any.whl → 0.10.37__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- datacontract/__init__.py +13 -0
- datacontract/api.py +260 -0
- datacontract/breaking/breaking.py +242 -12
- datacontract/breaking/breaking_rules.py +37 -1
- datacontract/catalog/catalog.py +80 -0
- datacontract/cli.py +387 -117
- datacontract/data_contract.py +216 -353
- datacontract/engines/data_contract_checks.py +1041 -0
- datacontract/engines/data_contract_test.py +113 -0
- datacontract/engines/datacontract/check_that_datacontract_contains_valid_servers_configuration.py +2 -3
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +176 -42
- datacontract/engines/fastjsonschema/s3/s3_read_files.py +16 -1
- datacontract/engines/soda/check_soda_execute.py +100 -56
- datacontract/engines/soda/connections/athena.py +79 -0
- datacontract/engines/soda/connections/bigquery.py +8 -1
- datacontract/engines/soda/connections/databricks.py +12 -3
- datacontract/engines/soda/connections/duckdb_connection.py +241 -0
- datacontract/engines/soda/connections/kafka.py +206 -113
- datacontract/engines/soda/connections/snowflake.py +8 -5
- datacontract/engines/soda/connections/sqlserver.py +43 -0
- datacontract/engines/soda/connections/trino.py +26 -0
- datacontract/export/avro_converter.py +72 -8
- datacontract/export/avro_idl_converter.py +31 -25
- datacontract/export/bigquery_converter.py +130 -0
- datacontract/export/custom_converter.py +40 -0
- datacontract/export/data_caterer_converter.py +161 -0
- datacontract/export/dbml_converter.py +148 -0
- datacontract/export/dbt_converter.py +141 -54
- datacontract/export/dcs_exporter.py +6 -0
- datacontract/export/dqx_converter.py +126 -0
- datacontract/export/duckdb_type_converter.py +57 -0
- datacontract/export/excel_exporter.py +923 -0
- datacontract/export/exporter.py +100 -0
- datacontract/export/exporter_factory.py +216 -0
- datacontract/export/go_converter.py +105 -0
- datacontract/export/great_expectations_converter.py +257 -36
- datacontract/export/html_exporter.py +86 -0
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +71 -16
- datacontract/export/markdown_converter.py +337 -0
- datacontract/export/mermaid_exporter.py +110 -0
- datacontract/export/odcs_v3_exporter.py +375 -0
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +168 -68
- datacontract/export/pydantic_converter.py +6 -0
- datacontract/export/rdf_converter.py +13 -6
- datacontract/export/sodacl_converter.py +36 -188
- datacontract/export/spark_converter.py +245 -0
- datacontract/export/sql_converter.py +37 -3
- datacontract/export/sql_type_converter.py +269 -8
- datacontract/export/sqlalchemy_converter.py +170 -0
- datacontract/export/terraform_converter.py +7 -2
- datacontract/imports/avro_importer.py +246 -26
- datacontract/imports/bigquery_importer.py +221 -0
- datacontract/imports/csv_importer.py +143 -0
- datacontract/imports/dbml_importer.py +112 -0
- datacontract/imports/dbt_importer.py +240 -0
- datacontract/imports/excel_importer.py +1111 -0
- datacontract/imports/glue_importer.py +288 -0
- datacontract/imports/iceberg_importer.py +172 -0
- datacontract/imports/importer.py +51 -0
- datacontract/imports/importer_factory.py +128 -0
- datacontract/imports/json_importer.py +325 -0
- datacontract/imports/jsonschema_importer.py +146 -0
- datacontract/imports/odcs_importer.py +60 -0
- datacontract/imports/odcs_v3_importer.py +516 -0
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/protobuf_importer.py +264 -0
- datacontract/imports/spark_importer.py +262 -0
- datacontract/imports/sql_importer.py +274 -35
- datacontract/imports/unity_importer.py +219 -0
- datacontract/init/init_template.py +20 -0
- datacontract/integration/datamesh_manager.py +86 -0
- datacontract/lint/resolve.py +271 -49
- datacontract/lint/resources.py +21 -0
- datacontract/lint/schema.py +53 -17
- datacontract/lint/urls.py +32 -12
- datacontract/model/data_contract_specification/__init__.py +1 -0
- datacontract/model/exceptions.py +4 -1
- datacontract/model/odcs.py +24 -0
- datacontract/model/run.py +49 -29
- datacontract/output/__init__.py +0 -0
- datacontract/output/junit_test_results.py +135 -0
- datacontract/output/output_format.py +10 -0
- datacontract/output/test_results_writer.py +79 -0
- datacontract/py.typed +0 -0
- datacontract/schemas/datacontract-1.1.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.1.0.schema.json +1975 -0
- datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
- datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
- datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
- datacontract/schemas/odcs-3.0.1.schema.json +2634 -0
- datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
- datacontract/templates/datacontract.html +139 -294
- datacontract/templates/datacontract_odcs.html +685 -0
- datacontract/templates/index.html +236 -0
- datacontract/templates/partials/datacontract_information.html +86 -0
- datacontract/templates/partials/datacontract_servicelevels.html +253 -0
- datacontract/templates/partials/datacontract_terms.html +51 -0
- datacontract/templates/partials/definition.html +25 -0
- datacontract/templates/partials/example.html +27 -0
- datacontract/templates/partials/model_field.html +144 -0
- datacontract/templates/partials/quality.html +49 -0
- datacontract/templates/partials/server.html +211 -0
- datacontract/templates/style/output.css +491 -72
- datacontract_cli-0.10.37.dist-info/METADATA +2235 -0
- datacontract_cli-0.10.37.dist-info/RECORD +119 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
- datacontract/engines/datacontract/check_that_datacontract_str_is_valid.py +0 -48
- datacontract/engines/soda/connections/dask.py +0 -28
- datacontract/engines/soda/connections/duckdb.py +0 -76
- datacontract/export/csv_type_converter.py +0 -36
- datacontract/export/html_export.py +0 -66
- datacontract/export/odcs_converter.py +0 -102
- datacontract/init/download_datacontract_file.py +0 -17
- datacontract/integration/publish_datamesh_manager.py +0 -33
- datacontract/integration/publish_opentelemetry.py +0 -107
- datacontract/lint/lint.py +0 -141
- datacontract/lint/linters/description_linter.py +0 -34
- datacontract/lint/linters/example_model_linter.py +0 -91
- datacontract/lint/linters/field_pattern_linter.py +0 -34
- datacontract/lint/linters/field_reference_linter.py +0 -38
- datacontract/lint/linters/notice_period_linter.py +0 -55
- datacontract/lint/linters/quality_schema_linter.py +0 -52
- datacontract/lint/linters/valid_constraints_linter.py +0 -99
- datacontract/model/data_contract_specification.py +0 -141
- datacontract/web.py +0 -14
- datacontract_cli-0.10.0.dist-info/METADATA +0 -951
- datacontract_cli-0.10.0.dist-info/RECORD +0 -66
- /datacontract/{model → breaking}/breaking_change.py +0 -0
- /datacontract/{lint/linters → export}/__init__.py +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.0.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/imports/csv_importer.py (new file)
@@ -0,0 +1,143 @@
+import os
+from typing import Any, Dict, List
+
+import duckdb
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Model, Server
+
+
+class CsvImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_csv(data_contract_specification, source)
+
+
+def import_csv(
+    data_contract_specification: DataContractSpecification, source: str, include_examples: bool = False
+) -> DataContractSpecification:
+    # use the file name as table name
+    table_name = os.path.splitext(os.path.basename(source))[0]
+
+    # use duckdb to auto detect format, columns, etc.
+    con = duckdb.connect(database=":memory:")
+    con.sql(
+        f"""CREATE VIEW "{table_name}" AS SELECT * FROM read_csv_auto('{source}', hive_partitioning=1, auto_type_candidates = ['BOOLEAN', 'INTEGER', 'BIGINT', 'DOUBLE', 'VARCHAR']);"""
+    )
+    dialect = con.sql(f"SELECT * FROM sniff_csv('{source}', sample_size = 1000);").fetchnumpy()
+    tbl = con.table(table_name)
+
+    if data_contract_specification.servers is None:
+        data_contract_specification.servers = {}
+
+    delimiter = None if dialect is None else dialect["Delimiter"][0]
+
+    if dialect is not None:
+        dc_types = [map_type_from_duckdb(x["type"]) for x in dialect["Columns"][0]]
+    else:
+        dc_types = [map_type_from_duckdb(str(x)) for x in tbl.dtypes]
+
+    data_contract_specification.servers["production"] = Server(
+        type="local", path=source, format="csv", delimiter=delimiter
+    )
+
+    rowcount = tbl.shape[0]
+
+    tallies = dict()
+    for row in tbl.describe().fetchall():
+        if row[0] not in ["count", "max", "min"]:
+            continue
+        for i in range(tbl.shape[1]):
+            tallies[(row[0], tbl.columns[i])] = row[i + 1] if row[0] != "count" else int(row[i + 1])
+
+    samples: Dict[str, List] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] not in ["BOOLEAN", "BLOB"]:
+            sql = f"""SELECT DISTINCT "{field_name}" FROM "{table_name}" WHERE "{field_name}" IS NOT NULL USING SAMPLE 5 ROWS;"""
+            samples[field_name] = [x[0] for x in con.sql(sql).fetchall()]
+
+    formats: Dict[str, str] = dict()
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        if tallies[("count", field_name)] > 0 and tbl.dtypes[i] == "VARCHAR":
+            sql = f"""SELECT
+                count_if("{field_name}" IS NOT NULL) as count,
+                count_if(regexp_matches("{field_name}", '^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{{2,4}}$')) as email,
+                count_if(regexp_matches("{field_name}", '^[[a-z0-9]{{8}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{4}}-?[a-z0-9]{{12}}]')) as uuid
+                FROM "{table_name}";
+            """
+            res = con.sql(sql).fetchone()
+            if res[1] == res[0]:
+                formats[field_name] = "email"
+            elif res[2] == res[0]:
+                formats[field_name] = "uuid"
+
+    fields = {}
+    for i in range(tbl.shape[1]):
+        field_name = tbl.columns[i]
+        dc_type = dc_types[i]
+
+        ## specifying "integer" rather than "bigint" looks nicer
+        if (
+            dc_type == "bigint"
+            and tallies[("max", field_name)] <= 2147483647
+            and tallies[("min", field_name)] >= -2147483648
+        ):
+            dc_type = "integer"
+
+        field: Dict[str, Any] = {"type": dc_type, "format": formats.get(field_name, None)}
+
+        if tallies[("count", field_name)] == rowcount:
+            field["required"] = True
+        if dc_type not in ["boolean", "bytes"]:
+            distinct_values = tbl.count(f'DISTINCT "{field_name}"').fetchone()[0]  # type: ignore
+            if distinct_values > 0 and distinct_values == tallies[("count", field_name)]:
+                field["unique"] = True
+        s = samples.get(field_name, None)
+        if s is not None:
+            field["examples"] = s
+        if dc_type in ["integer", "bigint", "float", "double"]:
+            field["minimum"] = tallies[("min", field_name)]
+            field["maximum"] = tallies[("max", field_name)]
+
+        fields[field_name] = field
+
+    model_examples = None
+    if include_examples:
+        model_examples = con.sql(f"""SELECT DISTINCT * FROM "{table_name}" USING SAMPLE 5 ROWS;""").fetchall()
+
+    data_contract_specification.models[table_name] = Model(
+        type="table", description="Generated model of " + source, fields=fields, examples=model_examples
+    )
+
+    return data_contract_specification
+
+
+_duck_db_types = {
+    "BOOLEAN": "boolean",
+    "BLOB": "bytes",
+    "TINYINT": "integer",
+    "SMALLINT": "integer",
+    "INTEGER": "integer",
+    "BIGINT": "bigint",
+    "UTINYINT": "integer",
+    "USMALLINT": "integer",
+    "UINTEGER": "integer",
+    "UBIGINT": "bigint",
+    "FLOAT": "float",
+    "DOUBLE": "double",
+    "VARCHAR": "string",
+    "TIMESTAMP": "timestamp",
+    "DATE": "date",
+    # TODO: Add support for NULL
+}
+
+
+def map_type_from_duckdb(sql_type: None | str):
+    if sql_type is None:
+        return None
+
+    sql_type_normed = sql_type.upper().strip()
+    return _duck_db_types.get(sql_type_normed, "string")
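For orientation, a minimal sketch of driving this importer from Python (not part of the diff; it assumes a hypothetical local orders.csv and that DataContractSpecification() constructs with defaults — the file name becomes the model name):

from datacontract.imports.csv_importer import import_csv
from datacontract.model.data_contract_specification import DataContractSpecification

spec = DataContractSpecification()
spec = import_csv(spec, "orders.csv", include_examples=True)

# import_csv registers a local "production" server and one model named after the file
print(spec.servers["production"].format)    # "csv"
print(spec.models["orders"].description)    # "Generated model of orders.csv"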
datacontract/imports/dbml_importer.py (new file)
@@ -0,0 +1,112 @@
+from typing import List
+
+from pydbml import Database, PyDBML
+from pyparsing import ParseException
+
+from datacontract.imports.importer import Importer
+from datacontract.imports.sql_importer import map_type_from_sql
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from datacontract.model.exceptions import DataContractException
+
+
+class DBMLImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        data_contract_specification = import_dbml_from_source(
+            data_contract_specification,
+            source,
+            import_args.get("dbml_schema"),
+            import_args.get("dbml_table"),
+        )
+        return data_contract_specification
+
+
+def import_dbml_from_source(
+    data_contract_specification: DataContractSpecification,
+    source: str,
+    import_schemas: List[str],
+    import_tables: List[str],
+) -> DataContractSpecification:
+    try:
+        with open(source, "r") as file:
+            dbml_schema = PyDBML(file)
+    except ParseException as e:
+        raise DataContractException(
+            type="schema",
+            name="Parse DBML schema",
+            reason=f"Failed to parse DBML schema from {source}",
+            engine="datacontract",
+            original_exception=e,
+        )
+
+    return convert_dbml(data_contract_specification, dbml_schema, import_schemas, import_tables)
+
+
+def convert_dbml(
+    data_contract_specification: DataContractSpecification,
+    dbml_schema: Database,
+    import_schemas: List[str],
+    import_tables: List[str],
+) -> DataContractSpecification:
+    if dbml_schema.project is not None:
+        data_contract_specification.info.title = dbml_schema.project.name
+
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    for table in dbml_schema.tables:
+        schema_name = table.schema
+        table_name = table.name
+
+        # Skip the current table if schema or table filters are defined
+        # and the table doesn't match; empty filters mean no filtering.
+        if import_schemas and schema_name not in import_schemas:
+            continue
+
+        if import_tables and table_name not in import_tables:
+            continue
+
+        fields = import_table_fields(table, dbml_schema.refs)
+
+        data_contract_specification.models[table_name] = Model(
+            fields=fields, namespace=schema_name, description=table.note.text
+        )
+
+    return data_contract_specification
+
+
+def import_table_fields(table, references) -> dict[str, Field]:
+    imported_fields = {}
+    for field in table.columns:
+        field_name = field.name
+        imported_fields[field_name] = Field()
+        imported_fields[field_name].required = field.not_null
+        imported_fields[field_name].description = field.note.text
+        imported_fields[field_name].primaryKey = field.pk
+        imported_fields[field_name].unique = field.unique
+        # This is an assumption that these might be valid SQL types, since
+        # DBML doesn't really enforce anything other than 'no spaces' in column types.
+        imported_fields[field_name].type = map_type_from_sql(field.type)
+
+        ref = get_reference(field, references)
+        if ref is not None:
+            imported_fields[field_name].references = ref
+
+    return imported_fields
+
+
+def get_reference(field, references):
+    result = None
+    for ref in references:
+        ref_table_name = ref.col1[0].table.name
+        ref_col_name = ref.col1[0].name
+        field_table_name = field.table.name
+        field_name = field.name
+
+        if ref_table_name == field_table_name and ref_col_name == field_name:
+            result = f"{ref.col2[0].table.name}.{ref.col2[0].name}"
+            return result
+
+    return result
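A corresponding sketch for the DBML importer (assumptions: a hypothetical schema.dbml defines an "orders" table; passing None for a filter disables it, since the code only filters when the argument is truthy):

from datacontract.imports.dbml_importer import import_dbml_from_source
from datacontract.model.data_contract_specification import DataContractSpecification

spec = import_dbml_from_source(
    DataContractSpecification(),
    "schema.dbml",
    import_schemas=None,        # no schema filtering
    import_tables=["orders"],   # keep only the table named "orders"
)
print(list(spec.models))        # ["orders"], if the file defines that table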
datacontract/imports/dbt_importer.py (new file)
@@ -0,0 +1,240 @@
+import json
+from typing import TypedDict
+
+from dbt.artifacts.resources.v1.components import ColumnInfo
+from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import GenericTestNode, ManifestNode, ModelNode
+from dbt_common.contracts.constraints import ConstraintType
+
+from datacontract.imports.bigquery_importer import map_type_from_bigquery
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+
+
+class DBTImportArgs(TypedDict, total=False):
+    """
+    A dictionary representing arguments for importing dbt models.
+    Makes the dbt importer more customizable by allowing flexible filtering
+    of models and their properties, through wrapping or extending.
+
+    Attributes:
+        dbt_nodes: The names of the models to include in the contract. Defaults to all.
+        resource_types: Nodes whose resource type is listed here are kept while importing. Defaults to ["model"].
+    """
+
+    dbt_nodes: list[str]
+    resource_types: list[str]
+
+
+class DbtManifestImporter(Importer):
+    def import_source(
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: DBTImportArgs,
+    ) -> DataContractSpecification:
+        manifest = read_dbt_manifest(manifest_path=source)
+        return import_dbt_manifest(
+            data_contract_specification=data_contract_specification,
+            manifest=manifest,
+            dbt_nodes=import_args.get("dbt_model", []),
+            resource_types=import_args.get("resource_types", ["model"]),
+        )
+
+
+def read_dbt_manifest(manifest_path: str) -> Manifest:
+    """Read a manifest from file."""
+    with open(file=manifest_path, mode="r", encoding="utf-8") as f:
+        manifest_dict: dict = json.load(f)
+    manifest = Manifest.from_dict(manifest_dict)
+    manifest.build_parent_and_child_maps()
+    return manifest
+
+
+def _get_primary_keys(manifest: Manifest, node: ManifestNode) -> list[str]:
+    node_unique_id = node.unique_id
+    if isinstance(node, ModelNode):
+        test_nodes = []
+        for node_id in manifest.child_map.get(node_unique_id, []):
+            test_node = manifest.nodes.get(node_id)
+            if not test_node or test_node.resource_type != "test":
+                continue
+            if not isinstance(test_node, GenericTestNode):
+                continue
+            if test_node.config.where is not None:
+                continue
+            test_nodes.append(test_node)
+        return node.infer_primary_key(test_nodes)
+    return []
+
+
+def _get_references(manifest: Manifest, node: ManifestNode) -> dict[str, str]:
+    node_unique_id = node.unique_id
+    references = {}
+    for node_id in manifest.child_map.get(node_unique_id, []):
+        test_node = manifest.nodes.get(node_id)
+        if not test_node or test_node.resource_type != "test":
+            continue
+        if not isinstance(test_node, GenericTestNode):
+            continue
+        if test_node.test_metadata.name != "relationships":
+            continue
+        if test_node.config.where is not None:
+            continue
+        if test_node.attached_node != node_unique_id:
+            continue
+        relationship_target_node_id = [n for n in test_node.depends_on.nodes if n != node_unique_id][0]
+        relationship_target_node = manifest.nodes.get(relationship_target_node_id)
+        references[f"{node.name}.{test_node.column_name}"] = (
+            f"""{relationship_target_node.name}.{test_node.test_metadata.kwargs["field"]}"""
+        )
+    return references
+
+
+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification,
+    manifest: Manifest,
+    dbt_nodes: list[str],
+    resource_types: list[str],
+) -> DataContractSpecification:
+    """
+    Extracts all relevant information from the manifest,
+    and puts it in a data contract specification.
+    """
+    data_contract_specification.info.title = manifest.metadata.project_name
+    data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
+    adapter_type = manifest.metadata.adapter_type
+    data_contract_specification.models = data_contract_specification.models or {}
+    for node in manifest.nodes.values():
+        # Only interested in processing the configured resource types (models by default).
+        if node.resource_type not in resource_types:
+            continue
+
+        # Allow the names in dbt_nodes to filter the relevant models.
+        # If dbt_nodes is empty, use all models.
+        if dbt_nodes and node.name not in dbt_nodes:
+            continue
+
+        model_unique_id = node.unique_id
+        primary_keys = _get_primary_keys(manifest, node)
+        references = _get_references(manifest, node)
+
+        primary_key = None
+        if len(primary_keys) == 1:
+            primary_key = primary_keys[0]
+
+        dc_model = Model(
+            description=node.description,
+            tags=node.tags,
+            fields=create_fields(
+                manifest,
+                model_unique_id=model_unique_id,
+                columns=node.columns,
+                primary_key_name=primary_key,
+                references=references,
+                adapter_type=adapter_type,
+            ),
+        )
+        if len(primary_keys) > 1:
+            dc_model.primaryKey = primary_keys
+
+        data_contract_specification.models[node.name] = dc_model
+
+    return data_contract_specification
+
+
+def convert_data_type_by_adapter_type(data_type: str, adapter_type: str) -> str:
+    if adapter_type == "bigquery":
+        return map_type_from_bigquery(data_type)
+    return data_type
+
+
+def create_fields(
+    manifest: Manifest,
+    model_unique_id: str,
+    columns: dict[str, ColumnInfo],
+    primary_key_name: str,
+    references: dict[str, str],
+    adapter_type: str,
+) -> dict[str, Field]:
+    fields = {
+        column.name: create_field(manifest, model_unique_id, column, primary_key_name, references, adapter_type)
+        for column in columns.values()
+    }
+    return fields
+
+
+def get_column_tests(manifest: Manifest, model_name: str, column_name: str) -> list[dict[str, str]]:
+    column_tests = []
+    model_node = manifest.nodes.get(model_name)
+    if not model_node:
+        raise ValueError(f"Model {model_name} not found in manifest.")
+
+    model_unique_id = model_node.unique_id
+    test_ids = manifest.child_map.get(model_unique_id, [])
+
+    for test_id in test_ids:
+        test_node = manifest.nodes.get(test_id)
+        if not test_node or test_node.resource_type != "test":
+            continue
+
+        if not isinstance(test_node, GenericTestNode):
+            continue
+
+        if test_node.column_name != column_name:
+            continue
+
+        if test_node.config.where is not None:
+            continue
+
+        column_tests.append(
+            {
+                "test_name": test_node.name,
+                "test_type": test_node.test_metadata.name,
+                "column": test_node.column_name,
+            }
+        )
+    return column_tests
+
+
+def create_field(
+    manifest: Manifest,
+    model_unique_id: str,
+    column: ColumnInfo,
+    primary_key_name: str,
+    references: dict[str, str],
+    adapter_type: str,
+) -> Field:
+    column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else ""
+    field = Field(
+        description=column.description,
+        type=column_type,
+        tags=column.tags,
+    )
+
+    all_tests = get_column_tests(manifest, model_unique_id, column.name)
+
+    required = False
+    if any(constraint.type == ConstraintType.not_null for constraint in column.constraints):
+        required = True
+    if [test for test in all_tests if test["test_type"] == "not_null"]:
+        required = True
+    if required:
+        field.required = required
+
+    unique = False
+    if any(constraint.type == ConstraintType.unique for constraint in column.constraints):
+        unique = True
+    if [test for test in all_tests if test["test_type"] == "unique"]:
+        unique = True
+    if unique:
+        field.unique = unique
+
+    if column.name == primary_key_name:
+        field.primaryKey = True
+
+    references_key = f"{manifest.nodes[model_unique_id].name}.{column.name}"
+    if references_key in references:
+        field.references = references[references_key]
+
+    return field
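And a sketch for the dbt importer, reading a manifest from dbt's conventional target/manifest.json output path (a hypothetical location here; any manifest path works):

from datacontract.imports.dbt_importer import import_dbt_manifest, read_dbt_manifest
from datacontract.model.data_contract_specification import DataContractSpecification

manifest = read_dbt_manifest("target/manifest.json")
spec = import_dbt_manifest(
    DataContractSpecification(),
    manifest,
    dbt_nodes=[],               # empty = keep every node of the kept resource types
    resource_types=["model"],   # the importer's default
)
for name, model in spec.models.items():
    print(name, sorted(model.fields))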