datacontract-cli 0.10.14__py3-none-any.whl → 0.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of datacontract-cli has been flagged as a potentially problematic release.
- datacontract/breaking/breaking.py +229 -11
- datacontract/breaking/breaking_rules.py +24 -0
- datacontract/catalog/catalog.py +1 -1
- datacontract/cli.py +100 -33
- datacontract/data_contract.py +26 -4
- datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
- datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
- datacontract/engines/soda/check_soda_execute.py +7 -5
- datacontract/engines/soda/connections/duckdb.py +1 -0
- datacontract/engines/soda/connections/kafka.py +12 -12
- datacontract/export/avro_idl_converter.py +1 -2
- datacontract/export/bigquery_converter.py +4 -3
- datacontract/export/data_caterer_converter.py +1 -1
- datacontract/export/dbml_converter.py +2 -4
- datacontract/export/dbt_converter.py +45 -39
- datacontract/export/exporter.py +2 -1
- datacontract/export/exporter_factory.py +7 -2
- datacontract/export/go_converter.py +3 -2
- datacontract/export/great_expectations_converter.py +202 -40
- datacontract/export/html_export.py +1 -1
- datacontract/export/iceberg_converter.py +188 -0
- datacontract/export/jsonschema_converter.py +3 -2
- datacontract/export/odcs_v2_exporter.py +1 -1
- datacontract/export/odcs_v3_exporter.py +44 -30
- datacontract/export/pandas_type_converter.py +40 -0
- datacontract/export/protobuf_converter.py +1 -1
- datacontract/export/rdf_converter.py +4 -5
- datacontract/export/sodacl_converter.py +9 -4
- datacontract/export/spark_converter.py +7 -6
- datacontract/export/sql_converter.py +1 -2
- datacontract/export/sqlalchemy_converter.py +1 -2
- datacontract/export/terraform_converter.py +1 -1
- datacontract/imports/avro_importer.py +1 -1
- datacontract/imports/bigquery_importer.py +1 -1
- datacontract/imports/dbml_importer.py +2 -2
- datacontract/imports/dbt_importer.py +80 -15
- datacontract/imports/glue_importer.py +5 -3
- datacontract/imports/iceberg_importer.py +17 -7
- datacontract/imports/importer.py +1 -0
- datacontract/imports/importer_factory.py +7 -1
- datacontract/imports/jsonschema_importer.py +3 -2
- datacontract/imports/odcs_v2_importer.py +2 -2
- datacontract/imports/odcs_v3_importer.py +7 -2
- datacontract/imports/parquet_importer.py +81 -0
- datacontract/imports/spark_importer.py +2 -1
- datacontract/imports/sql_importer.py +1 -1
- datacontract/imports/unity_importer.py +3 -3
- datacontract/integration/opentelemetry.py +0 -1
- datacontract/lint/lint.py +2 -1
- datacontract/lint/linters/description_linter.py +1 -0
- datacontract/lint/linters/example_model_linter.py +1 -0
- datacontract/lint/linters/field_pattern_linter.py +1 -0
- datacontract/lint/linters/field_reference_linter.py +1 -0
- datacontract/lint/linters/notice_period_linter.py +1 -0
- datacontract/lint/linters/quality_schema_linter.py +1 -0
- datacontract/lint/linters/valid_constraints_linter.py +1 -0
- datacontract/lint/resolve.py +7 -3
- datacontract/lint/schema.py +1 -1
- datacontract/model/data_contract_specification.py +13 -6
- datacontract/model/run.py +21 -12
- datacontract/templates/index.html +6 -6
- datacontract/web.py +2 -3
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/METADATA +163 -60
- datacontract_cli-0.10.16.dist-info/RECORD +106 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/WHEEL +1 -1
- datacontract_cli-0.10.14.dist-info/RECORD +0 -103
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/LICENSE +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/entry_points.txt +0 -0
- {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/top_level.txt +0 -0
datacontract/export/pandas_type_converter.py ADDED

@@ -0,0 +1,40 @@
+"""
+Module for converting data contract field types to corresponding pandas data types.
+"""
+
+from datacontract.model.data_contract_specification import Field
+
+
+def convert_to_pandas_type(field: Field) -> str:
+    """
+    Convert a data contract field type to the equivalent pandas data type.
+
+    Parameters:
+    ----------
+    field : Field
+        A Field object containing metadata about the data type of the field.
+
+    Returns:
+    -------
+    str
+        The corresponding pandas data type as a string.
+    """
+    field_type = field.type
+
+    if field_type in ["string", "varchar", "text"]:
+        return "str"
+    if field_type in ["integer", "int"]:
+        return "int32"
+    if field_type == "long":
+        return "int64"
+    if field_type == "float":
+        return "float32"
+    if field_type in ["number", "decimal", "numeric", "double"]:
+        return "float64"
+    if field_type == "boolean":
+        return "bool"
+    if field_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
+        return "datetime64[ns]"
+    if field_type == "bytes":
+        return "object"
+    return "object"
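A minimal sketch of how the new converter can be used to build a pandas dtype mapping; the field names and types below are illustrative, not taken from the package:

from datacontract.model.data_contract_specification import Field
from datacontract.export.pandas_type_converter import convert_to_pandas_type

# Illustrative field definitions (hypothetical names).
fields = {
    "order_id": Field(type="long"),
    "amount": Field(type="decimal"),
    "created_at": Field(type="timestamp"),
}

dtypes = {name: convert_to_pandas_type(field) for name, field in fields.items()}
# {'order_id': 'int64', 'amount': 'float64', 'created_at': 'datetime64[ns]'}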
datacontract/export/rdf_converter.py CHANGED

@@ -1,9 +1,8 @@
 from pydantic import BaseModel
-from rdflib import
-
-from datacontract.model.data_contract_specification import DataContractSpecification
+from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef
 
 from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification
 
 
 class RdfExporter(Exporter):

@@ -58,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
     else:
         g = Graph(base=Namespace(""))
 
-    dc = Namespace("https://datacontract.com/DataContractSpecification/
-    dcx = Namespace("https://datacontract.com/DataContractSpecification/
+    dc = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/")
+    dcx = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/Extension/")
 
    g.bind("dc", dc)
    g.bind("dcx", dcx)
datacontract/export/sodacl_converter.py CHANGED

@@ -62,12 +62,16 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
         if field.enum is not None and len(field.enum) > 0:
             checks.append(check_field_enum(field_name, field.enum, quote_field_name))
         if field.quality is not None and len(field.quality) > 0:
-
+            quality_list = check_quality_list(model_key, field_name, field.quality)
+            if (quality_list is not None) and len(quality_list) > 0:
+                checks.append(quality_list)
         # TODO references: str = None
         # TODO format
 
     if model_value.quality is not None and len(model_value.quality) > 0:
-
+        quality_list = check_quality_list(model_key, None, model_value.quality)
+        if (quality_list is not None) and len(quality_list) > 0:
+            checks.append(quality_list)
 
     checks_for_model_key = f"checks for {model_key}"
 

@@ -196,9 +200,9 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]):
     for quality in quality_list:
         if quality.type == "sql":
             if field_name is None:
-                metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
-            else:
                 metric_name = f"{model_name}_quality_sql_{count}"
+            else:
+                metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
             threshold = to_sodacl_threshold(quality)
             query = prepare_query(quality, model_name, field_name)
             if query is None:

@@ -261,6 +265,7 @@ def to_sodacl_threshold(quality: Quality) -> str | None:
     return None
 
 
+# These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
 def add_quality_checks(sodacl, data_contract_spec):
     if data_contract_spec.quality is None:
         return
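The second hunk swaps two branches that were inverted: the field name is now embedded in the metric name only for field-level quality checks. A small standalone sketch of the corrected naming logic (model and field names are made up):

def metric_name_for(model_name, field_name, count):
    # Mirrors the fixed branch in check_quality_list.
    if field_name is None:
        return f"{model_name}_quality_sql_{count}"
    return f"{model_name}_{field_name}_quality_sql_{count}"

assert metric_name_for("orders", None, 1) == "orders_quality_sql_1"
assert metric_name_for("orders", "amount", 1) == "orders_amount_quality_sql_1"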
datacontract/export/spark_converter.py CHANGED

@@ -1,10 +1,11 @@
 from pyspark.sql import types
+
+from datacontract.export.exporter import Exporter
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
-    Model,
     Field,
+    Model,
 )
-from datacontract.export.exporter import Exporter
 
 
 class SparkExporter(Exporter):

@@ -102,11 +103,11 @@ def to_struct_field(field: Field, field_name: str) -> types.StructField:
     Returns:
         types.StructField: The corresponding Spark StructField.
     """
-    data_type =
+    data_type = to_spark_data_type(field)
     return types.StructField(name=field_name, dataType=data_type, nullable=not field.required)
 
 
-def
+def to_spark_data_type(field: Field) -> types.DataType:
     """
     Convert a field to a Spark DataType.
 

@@ -120,11 +121,11 @@ def to_data_type(field: Field) -> types.DataType:
     if field_type is None or field_type in ["null"]:
         return types.NullType()
     if field_type == "array":
-        return types.ArrayType(
+        return types.ArrayType(to_spark_data_type(field.items))
     if field_type in ["object", "record", "struct"]:
         return types.StructType(to_struct_type(field.fields))
     if field_type == "map":
-        return types.MapType(
+        return types.MapType(to_spark_data_type(field.keys), to_spark_data_type(field.values))
     if field_type in ["string", "varchar", "text"]:
         return types.StringType()
     if field_type in ["number", "decimal", "numeric"]:
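With the rename to to_spark_data_type, nested fields resolve recursively through the same helper. A minimal sketch, assuming the Field model accepts items as a keyword argument (its attribute is read in the hunk above) and that pyspark is installed:

from datacontract.model.data_contract_specification import Field
from datacontract.export.spark_converter import to_spark_data_type

# A hypothetical array-of-strings field; items is resolved recursively.
tags = Field(type="array", items=Field(type="string"))
print(to_spark_data_type(tags))  # ArrayType(StringType(), True)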
datacontract/export/sql_converter.py CHANGED

@@ -1,8 +1,7 @@
+from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
 from datacontract.export.sql_type_converter import convert_to_sql_type
 from datacontract.model.data_contract_specification import DataContractSpecification, Model
 
-from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
-
 
 class SqlExporter(Exporter):
     def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
datacontract/export/sqlalchemy_converter.py CHANGED

@@ -2,8 +2,7 @@ import ast
 import typing
 
 import datacontract.model.data_contract_specification as spec
-from datacontract.export.exporter import Exporter
-from datacontract.export.exporter import _determine_sql_server_type
+from datacontract.export.exporter import Exporter, _determine_sql_server_type
 
 
 class SQLAlchemyExporter(Exporter):
datacontract/export/terraform_converter.py CHANGED

@@ -1,7 +1,7 @@
 import re
 
-from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.export.exporter import Exporter
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 
 
 class TerraformExporter(Exporter):
datacontract/imports/avro_importer.py CHANGED

@@ -3,7 +3,7 @@ from typing import Dict, List
 import avro.schema
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 
datacontract/imports/bigquery_importer.py CHANGED

@@ -3,7 +3,7 @@ import logging
 from typing import List
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 
datacontract/imports/dbml_importer.py CHANGED

@@ -1,11 +1,11 @@
-from pydbml import PyDBML, Database
 from typing import List
 
+from pydbml import Database, PyDBML
 from pyparsing import ParseException
 
 from datacontract.imports.importer import Importer
 from datacontract.imports.sql_importer import map_type_from_sql
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 
datacontract/imports/dbt_importer.py CHANGED

@@ -1,10 +1,14 @@
 import json
 from typing import TypedDict
 
-from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from dbt.artifacts.resources.v1.components import ColumnInfo
 from dbt.contracts.graph.manifest import Manifest
+from dbt.contracts.graph.nodes import GenericTestNode
+from dbt_common.contracts.constraints import ConstraintType
+
+from datacontract.imports.bigquery_importer import map_type_from_bigquery
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 
 
 class DBTImportArgs(TypedDict, total=False):

@@ -33,7 +37,7 @@ class DbtManifestImporter(Importer):
         return import_dbt_manifest(
             data_contract_specification=data_contract_specification,
             manifest=manifest,
-            dbt_nodes=import_args.get("
+            dbt_nodes=import_args.get("dbt_model", []),
             resource_types=import_args.get("resource_types", ["model"]),
         )
 

@@ -42,7 +46,9 @@ def read_dbt_manifest(manifest_path: str) -> Manifest:
     """Read a manifest from file."""
     with open(file=manifest_path, mode="r", encoding="utf-8") as f:
         manifest_dict: dict = json.load(f)
-
+    manifest = Manifest.from_dict(manifest_dict)
+    manifest.build_parent_and_child_maps()
+    return manifest
 
 
 def import_dbt_manifest(

@@ -57,7 +63,7 @@ def import_dbt_manifest(
     """
     data_contract_specification.info.title = manifest.metadata.project_name
     data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
-
+    adapter_type = manifest.metadata.adapter_type
     data_contract_specification.models = data_contract_specification.models or {}
     for model_contents in manifest.nodes.values():
         # Only intressted in processing models.

@@ -72,7 +78,12 @@ def import_dbt_manifest(
         dc_model = Model(
             description=model_contents.description,
             tags=model_contents.tags,
-            fields=create_fields(
+            fields=create_fields(
+                manifest,
+                model_unique_id=model_contents.unique_id,
+                columns=model_contents.columns,
+                adapter_type=adapter_type,
+            ),
         )
 
         data_contract_specification.models[model_contents.name] = dc_model

@@ -80,14 +91,68 @@ def import_dbt_manifest(
     return data_contract_specification
 
 
-def
-
-
-
-            type=column.data_type if column.data_type else "",
-            tags=column.tags,
-        )
-        for column in columns.values()
-    }
+def convert_data_type_by_adapter_type(data_type: str, adapter_type: str) -> str:
+    if adapter_type == "bigquery":
+        return map_type_from_bigquery(data_type)
+    return data_type
 
+
+def create_fields(
+    manifest: Manifest, model_unique_id: str, columns: dict[str, ColumnInfo], adapter_type: str
+) -> dict[str, Field]:
+    fields = {column.name: create_field(manifest, model_unique_id, column, adapter_type) for column in columns.values()}
     return fields
+
+
+def get_column_tests(manifest: Manifest, model_name: str, column_name: str) -> list[dict[str, str]]:
+    column_tests = []
+    model_node = manifest.nodes.get(model_name)
+    if not model_node:
+        raise ValueError(f"Model {model_name} not found in manifest.")
+
+    model_unique_id = model_node.unique_id
+    test_ids = manifest.child_map.get(model_unique_id, [])
+
+    for test_id in test_ids:
+        test_node = manifest.nodes.get(test_id)
+        if not test_node or test_node.resource_type != "test":
+            continue
+
+        if not isinstance(test_node, GenericTestNode):
+            continue
+
+        if test_node.column_name != column_name:
+            continue
+
+        if test_node.config.where is not None:
+            continue
+
+        column_tests.append(
+            {
+                "test_name": test_node.name,
+                "test_type": test_node.test_metadata.name,
+                "column": test_node.column_name,
+            }
+        )
+    return column_tests
+
+
+def create_field(manifest: Manifest, model_unique_id: str, column: ColumnInfo, adapter_type: str) -> Field:
+    column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else ""
+    field = Field(
+        description=column.description,
+        type=column_type,
+        tags=column.tags,
+    )
+
+    all_tests = get_column_tests(manifest, model_unique_id, column.name)
+
+    required = False
+    if any(constraint.type == ConstraintType.not_null for constraint in column.constraints):
+        required = True
+    if [test for test in all_tests if test["test_type"] == "not_null"]:
+        required = True
+    if required:
+        field.required = required
+
+    return field
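A quick check of the new adapter-aware conversion: for any adapter other than BigQuery the dbt column type passes through unchanged, while BigQuery types are routed through map_type_from_bigquery (the adapter and type names below are illustrative):

from datacontract.imports.dbt_importer import convert_data_type_by_adapter_type

# Non-BigQuery adapters keep the manifest's column type as-is.
assert convert_data_type_by_adapter_type("varchar", "snowflake") == "varchar"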
datacontract/imports/glue_importer.py CHANGED

@@ -1,11 +1,13 @@
-import boto3
-from typing import List, Dict, Generator
 import re
+from typing import Dict, Generator, List
+
+import boto3
+
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
-    Model,
     Field,
+    Model,
     Server,
 )
 
datacontract/imports/iceberg_importer.py CHANGED

@@ -1,12 +1,11 @@
-from typing import
+from typing import Any, Dict
 
-from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
-
-from pyiceberg.schema import Schema
-from pyiceberg import types as iceberg_types
 from pydantic import ValidationError
+from pyiceberg import types as iceberg_types
+from pyiceberg.schema import Schema
 
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 

@@ -43,8 +42,19 @@ def import_iceberg(
 
     model = Model(type="table", title=table_name)
 
+    # Iceberg identifier_fields aren't technically primary keys since Iceberg doesn't support primary keys,
+    # but they are close enough that we can probably treat them as primary keys on the conversion.
+    # ref: https://iceberg.apache.org/spec/#identifier-field-ids
+    # this code WILL NOT support finding nested primary key fields.
+    identifier_fields_ids = schema.identifier_field_ids
+
     for field in schema.fields:
-
+        model_field = _field_from_nested_field(field)
+
+        if field.field_id in identifier_fields_ids:
+            model_field.primaryKey = True
+
+        model.fields[field.name] = model_field
 
     data_contract_specification.models[table_name] = model
     return data_contract_specification
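A minimal sketch of a pyiceberg schema that exercises the new identifier-field handling; with a schema like this, the importer would mark order_id with primaryKey: true (the field names are made up):

from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

schema = Schema(
    NestedField(field_id=1, name="order_id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="status", field_type=StringType(), required=False),
    identifier_field_ids=[1],  # order_id becomes primaryKey on import
)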
datacontract/imports/importer_factory.py CHANGED

@@ -1,6 +1,7 @@
 import importlib.util
 import sys
-
+
+from datacontract.imports.importer import Importer, ImportFormat
 
 
 class ImporterFactory:

@@ -98,3 +99,8 @@ importer_factory.register_lazy_importer(
     module_path="datacontract.imports.iceberg_importer",
     class_name="IcebergImporter",
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.parquet,
+    module_path="datacontract.imports.parquet_importer",
+    class_name="ParquetImporter",
+)
datacontract/imports/jsonschema_importer.py CHANGED

@@ -3,7 +3,7 @@ import json
 import fastjsonschema
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 

@@ -111,7 +111,8 @@ def schema_to_args(property_schema, is_required: bool = None) -> dict:
     nested_properties = property_schema.get("properties")
     if nested_properties is not None:
         # recursive call for complex nested properties
-
+        required = property_schema.get("required", [])
+        field_kwargs["fields"] = jsonschema_to_args(nested_properties, required)
 
     return field_kwargs
 
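The fix reads the nested object's own required list and passes it into the recursive call, so required markers are preserved below the top level. A hypothetical nested schema where this matters:

# With the fix, "city" is marked required from the nested schema's own
# "required" list when "address" is expanded recursively.
schema = {
    "type": "object",
    "properties": {
        "address": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        }
    },
}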
datacontract/imports/odcs_v2_importer.py CHANGED

@@ -6,16 +6,16 @@ import yaml
 
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
+    DATACONTRACT_TYPES,
     Availability,
     Contact,
     DataContractSpecification,
+    Field,
     Info,
     Model,
-    Field,
     Retention,
     ServiceLevel,
     Terms,
-    DATACONTRACT_TYPES,
 )
 from datacontract.model.exceptions import DataContractException
 

datacontract/imports/odcs_v3_importer.py CHANGED

@@ -8,16 +8,17 @@ import yaml
 from datacontract.imports.importer import Importer
 from datacontract.lint.resources import read_resource
 from datacontract.model.data_contract_specification import (
+    DATACONTRACT_TYPES,
     Availability,
     DataContractSpecification,
+    Field,
     Info,
     Model,
-
+    Quality,
     Retention,
     Server,
     ServiceLevel,
     Terms,
-    DATACONTRACT_TYPES,
 )
 from datacontract.model.exceptions import DataContractException
 

@@ -193,6 +194,10 @@ def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
         model.fields = import_fields(
             odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
         )
+        if odcs_schema.get("quality") is not None:
+            # convert dict to pydantic model
+
+            model.quality = [Quality.model_validate(q) for q in odcs_schema.get("quality")]
         model.title = schema_name
         if odcs_schema.get("dataGranularityDescription") is not None:
             model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
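Schema-level quality entries from an ODCS document are now carried over into the model. A sketch of the validation step in isolation, with an illustrative quality entry (the attribute names follow the specification's quality object and are assumptions here, not taken from this diff):

from datacontract.model.data_contract_specification import Quality

# Hypothetical ODCS schema-level quality entry.
odcs_quality = [
    {"type": "sql", "query": "SELECT COUNT(*) FROM orders WHERE amount < 0", "mustBe": 0}
]
quality = [Quality.model_validate(q) for q in odcs_quality]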
datacontract/imports/parquet_importer.py ADDED

@@ -0,0 +1,81 @@
+import os.path
+
+import pyarrow
+from pyarrow import parquet
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import (
+    DataContractSpecification,
+    Field,
+    Model,
+)
+from datacontract.model.exceptions import DataContractException
+
+
+class ParquetImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> DataContractSpecification:
+        return import_parquet(data_contract_specification, source)
+
+
+def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
+    schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")
+
+    fields: dict[str, Field] = {}
+
+    arrow_schema = parquet.read_schema(source)
+    for field_name in arrow_schema.names:
+        parquet_field = arrow_schema.field(field_name)
+
+        field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")
+
+        if not parquet_field.nullable:
+            field.required = True
+
+        fields[field_name] = field
+
+    data_contract_specification.models[schema_name] = Model(fields=fields)
+
+    return data_contract_specification
+
+
+def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
+    if pyarrow.types.is_boolean(pyarrow_field.type):
+        return Field(type="boolean")
+    if pyarrow.types.is_int32(pyarrow_field.type):
+        return Field(type="int")
+    if pyarrow.types.is_int64(pyarrow_field.type):
+        return Field(type="long")
+    if pyarrow.types.is_integer(pyarrow_field.type):
+        return Field(type="number")
+    if pyarrow.types.is_float32(pyarrow_field.type):
+        return Field(type="float")
+    if pyarrow.types.is_float64(pyarrow_field.type):
+        return Field(type="double")
+    if pyarrow.types.is_decimal(pyarrow_field.type):
+        return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
+    if pyarrow.types.is_timestamp(pyarrow_field.type):
+        return Field(type="timestamp")
+    if pyarrow.types.is_date(pyarrow_field.type):
+        return Field(type="date")
+    if pyarrow.types.is_null(pyarrow_field.type):
+        return Field(type="null")
+    if pyarrow.types.is_binary(pyarrow_field.type):
+        return Field(type="bytes")
+    if pyarrow.types.is_string(pyarrow_field.type):
+        return Field(type="string")
+    if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
+        return Field(type="map")
+    if pyarrow.types.is_struct(pyarrow_field.type):
+        return Field(type="struct")
+    if pyarrow.types.is_list(pyarrow_field.type):
+        return Field(type="array")
+
+    raise DataContractException(
+        type="schema",
+        name=f"Parse {file_format} schema",
+        reason=f"{pyarrow_field.type} currently not supported.",
+        engine="datacontract",
+    )
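End to end, the new importer reads only the Parquet footer schema. A minimal sketch, assuming a default-constructible DataContractSpecification (the CLI normally seeds one from its init template) and an illustrative file name:

import pyarrow as pa
from pyarrow import parquet

from datacontract.imports.parquet_importer import import_parquet
from datacontract.model.data_contract_specification import DataContractSpecification

# Write a small illustrative file; the schema name is derived from the file name.
table = pa.table({
    "order_id": pa.array([1, 2, 3], type=pa.int64()),
    "status": pa.array(["open", "shipped", None], type=pa.string()),
})
parquet.write_table(table, "orders.parquet")

spec = import_parquet(DataContractSpecification(), "orders.parquet")
print(spec.models["orders"].fields["order_id"].type)  # long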
datacontract/imports/sql_importer.py CHANGED

@@ -1,7 +1,7 @@
 from simple_ddl_parser import parse_from_file
 
 from datacontract.imports.importer import Importer
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 
 
 class SqlImporter(Importer):
datacontract/imports/unity_importer.py CHANGED

@@ -2,13 +2,13 @@ import json
 import os
 from typing import List, Optional
 
-from pyspark.sql import types
 from databricks.sdk import WorkspaceClient
-from databricks.sdk.service.catalog import
+from databricks.sdk.service.catalog import ColumnInfo, TableInfo
+from pyspark.sql import types
 
 from datacontract.imports.importer import Importer
 from datacontract.imports.spark_importer import _field_from_struct_type
-from datacontract.model.data_contract_specification import DataContractSpecification,
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
 from datacontract.model.exceptions import DataContractException
 
 
datacontract/lint/lint.py CHANGED

@@ -1,9 +1,10 @@
 import abc
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import
+from typing import Any, Sequence, cast
 
 from datacontract.model.run import Check
+
 from ..model.data_contract_specification import DataContractSpecification
 
 """This module contains linter definitions for linting a data contract.
|