datacontract-cli 0.10.14__py3-none-any.whl → 0.10.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacontract-cli might be problematic.
Files changed (69)
  1. datacontract/breaking/breaking.py +229 -11
  2. datacontract/breaking/breaking_rules.py +24 -0
  3. datacontract/catalog/catalog.py +1 -1
  4. datacontract/cli.py +100 -33
  5. datacontract/data_contract.py +26 -4
  6. datacontract/engines/datacontract/check_that_datacontract_file_exists.py +1 -1
  7. datacontract/engines/fastjsonschema/check_jsonschema.py +114 -22
  8. datacontract/engines/soda/check_soda_execute.py +7 -5
  9. datacontract/engines/soda/connections/duckdb.py +1 -0
  10. datacontract/engines/soda/connections/kafka.py +12 -12
  11. datacontract/export/avro_idl_converter.py +1 -2
  12. datacontract/export/bigquery_converter.py +4 -3
  13. datacontract/export/data_caterer_converter.py +1 -1
  14. datacontract/export/dbml_converter.py +2 -4
  15. datacontract/export/dbt_converter.py +45 -39
  16. datacontract/export/exporter.py +2 -1
  17. datacontract/export/exporter_factory.py +7 -2
  18. datacontract/export/go_converter.py +3 -2
  19. datacontract/export/great_expectations_converter.py +202 -40
  20. datacontract/export/html_export.py +1 -1
  21. datacontract/export/iceberg_converter.py +188 -0
  22. datacontract/export/jsonschema_converter.py +3 -2
  23. datacontract/export/odcs_v2_exporter.py +1 -1
  24. datacontract/export/odcs_v3_exporter.py +44 -30
  25. datacontract/export/pandas_type_converter.py +40 -0
  26. datacontract/export/protobuf_converter.py +1 -1
  27. datacontract/export/rdf_converter.py +4 -5
  28. datacontract/export/sodacl_converter.py +9 -4
  29. datacontract/export/spark_converter.py +7 -6
  30. datacontract/export/sql_converter.py +1 -2
  31. datacontract/export/sqlalchemy_converter.py +1 -2
  32. datacontract/export/terraform_converter.py +1 -1
  33. datacontract/imports/avro_importer.py +1 -1
  34. datacontract/imports/bigquery_importer.py +1 -1
  35. datacontract/imports/dbml_importer.py +2 -2
  36. datacontract/imports/dbt_importer.py +80 -15
  37. datacontract/imports/glue_importer.py +5 -3
  38. datacontract/imports/iceberg_importer.py +17 -7
  39. datacontract/imports/importer.py +1 -0
  40. datacontract/imports/importer_factory.py +7 -1
  41. datacontract/imports/jsonschema_importer.py +3 -2
  42. datacontract/imports/odcs_v2_importer.py +2 -2
  43. datacontract/imports/odcs_v3_importer.py +7 -2
  44. datacontract/imports/parquet_importer.py +81 -0
  45. datacontract/imports/spark_importer.py +2 -1
  46. datacontract/imports/sql_importer.py +1 -1
  47. datacontract/imports/unity_importer.py +3 -3
  48. datacontract/integration/opentelemetry.py +0 -1
  49. datacontract/lint/lint.py +2 -1
  50. datacontract/lint/linters/description_linter.py +1 -0
  51. datacontract/lint/linters/example_model_linter.py +1 -0
  52. datacontract/lint/linters/field_pattern_linter.py +1 -0
  53. datacontract/lint/linters/field_reference_linter.py +1 -0
  54. datacontract/lint/linters/notice_period_linter.py +1 -0
  55. datacontract/lint/linters/quality_schema_linter.py +1 -0
  56. datacontract/lint/linters/valid_constraints_linter.py +1 -0
  57. datacontract/lint/resolve.py +7 -3
  58. datacontract/lint/schema.py +1 -1
  59. datacontract/model/data_contract_specification.py +13 -6
  60. datacontract/model/run.py +21 -12
  61. datacontract/templates/index.html +6 -6
  62. datacontract/web.py +2 -3
  63. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/METADATA +163 -60
  64. datacontract_cli-0.10.16.dist-info/RECORD +106 -0
  65. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/WHEEL +1 -1
  66. datacontract_cli-0.10.14.dist-info/RECORD +0 -103
  67. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/LICENSE +0 -0
  68. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/entry_points.txt +0 -0
  69. {datacontract_cli-0.10.14.dist-info → datacontract_cli-0.10.16.dist-info}/top_level.txt +0 -0
datacontract/export/pandas_type_converter.py ADDED
@@ -0,0 +1,40 @@
+ """
+ Module for converting data contract field types to corresponding pandas data types.
+ """
+
+ from datacontract.model.data_contract_specification import Field
+
+
+ def convert_to_pandas_type(field: Field) -> str:
+     """
+     Convert a data contract field type to the equivalent pandas data type.
+
+     Parameters:
+     ----------
+     field : Field
+         A Field object containing metadata about the data type of the field.
+
+     Returns:
+     -------
+     str
+         The corresponding pandas data type as a string.
+     """
+     field_type = field.type
+
+     if field_type in ["string", "varchar", "text"]:
+         return "str"
+     if field_type in ["integer", "int"]:
+         return "int32"
+     if field_type == "long":
+         return "int64"
+     if field_type == "float":
+         return "float32"
+     if field_type in ["number", "decimal", "numeric", "double"]:
+         return "float64"
+     if field_type == "boolean":
+         return "bool"
+     if field_type in ["timestamp", "timestamp_tz", "timestamp_ntz", "date"]:
+         return "datetime64[ns]"
+     if field_type == "bytes":
+         return "object"
+     return "object"
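The new pandas mapping is a plain function, so it can be exercised on its own. A minimal sketch of how it might be called, assuming the Field model accepts its type as a keyword argument (types not in the table above fall through to "object"):

```python
# Minimal usage sketch (assumed Field construction), not part of the diff.
from datacontract.export.pandas_type_converter import convert_to_pandas_type
from datacontract.model.data_contract_specification import Field

for contract_type in ["varchar", "long", "decimal", "timestamp_tz", "uuid"]:
    # "uuid" is not in the mapping and falls back to "object"
    print(contract_type, "->", convert_to_pandas_type(Field(type=contract_type)))
```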
datacontract/export/protobuf_converter.py CHANGED
@@ -1,5 +1,5 @@
- from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.export.exporter import Exporter
+ from datacontract.model.data_contract_specification import DataContractSpecification


  class ProtoBufExporter(Exporter):
datacontract/export/rdf_converter.py CHANGED
@@ -1,9 +1,8 @@
  from pydantic import BaseModel
- from rdflib import Graph, Literal, BNode, RDF, URIRef, Namespace
-
- from datacontract.model.data_contract_specification import DataContractSpecification
+ from rdflib import RDF, BNode, Graph, Literal, Namespace, URIRef

  from datacontract.export.exporter import Exporter
+ from datacontract.model.data_contract_specification import DataContractSpecification


  class RdfExporter(Exporter):
@@ -58,8 +57,8 @@ def to_rdf(data_contract_spec: DataContractSpecification, base) -> Graph:
      else:
          g = Graph(base=Namespace(""))

-     dc = Namespace("https://datacontract.com/DataContractSpecification/0.9.2/")
-     dcx = Namespace("https://datacontract.com/DataContractSpecification/0.9.2/Extension/")
+     dc = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/")
+     dcx = Namespace("https://datacontract.com/DataContractSpecification/1.1.0/Extension/")

      g.bind("dc", dc)
      g.bind("dcx", dcx)
datacontract/export/sodacl_converter.py CHANGED
@@ -62,12 +62,16 @@ def to_checks(model_key, model_value, server_type: str, check_types: bool):
          if field.enum is not None and len(field.enum) > 0:
              checks.append(check_field_enum(field_name, field.enum, quote_field_name))
          if field.quality is not None and len(field.quality) > 0:
-             checks.append(check_quality_list(model_key, field_name, field.quality))
+             quality_list = check_quality_list(model_key, field_name, field.quality)
+             if (quality_list is not None) and len(quality_list) > 0:
+                 checks.append(quality_list)
          # TODO references: str = None
          # TODO format

      if model_value.quality is not None and len(model_value.quality) > 0:
-         checks.append(check_quality_list(model_key, None, model_value.quality))
+         quality_list = check_quality_list(model_key, None, model_value.quality)
+         if (quality_list is not None) and len(quality_list) > 0:
+             checks.append(quality_list)

      checks_for_model_key = f"checks for {model_key}"

@@ -196,9 +200,9 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]):
      for quality in quality_list:
          if quality.type == "sql":
              if field_name is None:
-                 metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
-             else:
                  metric_name = f"{model_name}_quality_sql_{count}"
+             else:
+                 metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
              threshold = to_sodacl_threshold(quality)
              query = prepare_query(quality, model_name, field_name)
              if query is None:
@@ -261,6 +265,7 @@ def to_sodacl_threshold(quality: Quality) -> str | None:
      return None


+ # These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
  def add_quality_checks(sodacl, data_contract_spec):
      if data_contract_spec.quality is None:
          return
datacontract/export/spark_converter.py CHANGED
@@ -1,10 +1,11 @@
  from pyspark.sql import types
+
+ from datacontract.export.exporter import Exporter
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
-     Model,
      Field,
+     Model,
  )
- from datacontract.export.exporter import Exporter


  class SparkExporter(Exporter):
@@ -102,11 +103,11 @@ def to_struct_field(field: Field, field_name: str) -> types.StructField:
      Returns:
          types.StructField: The corresponding Spark StructField.
      """
-     data_type = to_data_type(field)
+     data_type = to_spark_data_type(field)
      return types.StructField(name=field_name, dataType=data_type, nullable=not field.required)


- def to_data_type(field: Field) -> types.DataType:
+ def to_spark_data_type(field: Field) -> types.DataType:
      """
      Convert a field to a Spark DataType.

@@ -120,11 +121,11 @@ def to_data_type(field: Field) -> types.DataType:
      if field_type is None or field_type in ["null"]:
          return types.NullType()
      if field_type == "array":
-         return types.ArrayType(to_data_type(field.items))
+         return types.ArrayType(to_spark_data_type(field.items))
      if field_type in ["object", "record", "struct"]:
          return types.StructType(to_struct_type(field.fields))
      if field_type == "map":
-         return types.MapType(to_data_type(field.keys), to_data_type(field.values))
+         return types.MapType(to_spark_data_type(field.keys), to_spark_data_type(field.values))
      if field_type in ["string", "varchar", "text"]:
          return types.StringType()
      if field_type in ["number", "decimal", "numeric"]:
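With the rename, the recursive calls for array and map items now go through to_spark_data_type. A small sketch of the intended behavior, assuming the Field model accepts nested items the same way as elsewhere in the specification:

```python
# Sketch (assumed Field construction), not part of the diff.
from pyspark.sql import types

from datacontract.export.spark_converter import to_spark_data_type
from datacontract.model.data_contract_specification import Field

tags = Field(type="array", items=Field(type="string"))
# an array of strings maps to a Spark ArrayType of StringType
assert to_spark_data_type(tags) == types.ArrayType(types.StringType())
```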
datacontract/export/sql_converter.py CHANGED
@@ -1,8 +1,7 @@
+ from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
  from datacontract.export.sql_type_converter import convert_to_sql_type
  from datacontract.model.data_contract_specification import DataContractSpecification, Model

- from datacontract.export.exporter import Exporter, _check_models_for_export, _determine_sql_server_type
-

  class SqlExporter(Exporter):
      def export(self, data_contract, model, server, sql_server_type, export_args) -> dict:
datacontract/export/sqlalchemy_converter.py CHANGED
@@ -2,8 +2,7 @@ import ast
  import typing

  import datacontract.model.data_contract_specification as spec
- from datacontract.export.exporter import Exporter
- from datacontract.export.exporter import _determine_sql_server_type
+ from datacontract.export.exporter import Exporter, _determine_sql_server_type


  class SQLAlchemyExporter(Exporter):
datacontract/export/terraform_converter.py CHANGED
@@ -1,7 +1,7 @@
  import re

- from datacontract.model.data_contract_specification import DataContractSpecification, Server
  from datacontract.export.exporter import Exporter
+ from datacontract.model.data_contract_specification import DataContractSpecification, Server


  class TerraformExporter(Exporter):
datacontract/imports/avro_importer.py CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List
  import avro.schema

  from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from datacontract.model.exceptions import DataContractException


datacontract/imports/bigquery_importer.py CHANGED
@@ -3,7 +3,7 @@ import logging
  from typing import List

  from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from datacontract.model.exceptions import DataContractException


datacontract/imports/dbml_importer.py CHANGED
@@ -1,11 +1,11 @@
- from pydbml import PyDBML, Database
  from typing import List

+ from pydbml import Database, PyDBML
  from pyparsing import ParseException

  from datacontract.imports.importer import Importer
  from datacontract.imports.sql_importer import map_type_from_sql
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from datacontract.model.exceptions import DataContractException


datacontract/imports/dbt_importer.py CHANGED
@@ -1,10 +1,14 @@
  import json
  from typing import TypedDict

- from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from dbt.artifacts.resources.v1.components import ColumnInfo
  from dbt.contracts.graph.manifest import Manifest
+ from dbt.contracts.graph.nodes import GenericTestNode
+ from dbt_common.contracts.constraints import ConstraintType
+
+ from datacontract.imports.bigquery_importer import map_type_from_bigquery
+ from datacontract.imports.importer import Importer
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model


  class DBTImportArgs(TypedDict, total=False):
@@ -33,7 +37,7 @@ class DbtManifestImporter(Importer):
          return import_dbt_manifest(
              data_contract_specification=data_contract_specification,
              manifest=manifest,
-             dbt_nodes=import_args.get("dbt_nodes", []),
+             dbt_nodes=import_args.get("dbt_model", []),
              resource_types=import_args.get("resource_types", ["model"]),
          )

@@ -42,7 +46,9 @@ def read_dbt_manifest(manifest_path: str) -> Manifest:
      """Read a manifest from file."""
      with open(file=manifest_path, mode="r", encoding="utf-8") as f:
          manifest_dict: dict = json.load(f)
-     return Manifest.from_dict(manifest_dict)
+     manifest = Manifest.from_dict(manifest_dict)
+     manifest.build_parent_and_child_maps()
+     return manifest


  def import_dbt_manifest(
@@ -57,7 +63,7 @@ def import_dbt_manifest(
      """
      data_contract_specification.info.title = manifest.metadata.project_name
      data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
-
+     adapter_type = manifest.metadata.adapter_type
      data_contract_specification.models = data_contract_specification.models or {}
      for model_contents in manifest.nodes.values():
          # Only intressted in processing models.
@@ -72,7 +78,12 @@ def import_dbt_manifest(
          dc_model = Model(
              description=model_contents.description,
              tags=model_contents.tags,
-             fields=create_fields(columns=model_contents.columns),
+             fields=create_fields(
+                 manifest,
+                 model_unique_id=model_contents.unique_id,
+                 columns=model_contents.columns,
+                 adapter_type=adapter_type,
+             ),
          )

          data_contract_specification.models[model_contents.name] = dc_model
@@ -80,14 +91,68 @@ def import_dbt_manifest(
      return data_contract_specification


- def create_fields(columns: dict[str, ColumnInfo]) -> dict[str, Field]:
-     fields = {
-         column.name: Field(
-             description=column.description,
-             type=column.data_type if column.data_type else "",
-             tags=column.tags,
-         )
-         for column in columns.values()
-     }
+ def convert_data_type_by_adapter_type(data_type: str, adapter_type: str) -> str:
+     if adapter_type == "bigquery":
+         return map_type_from_bigquery(data_type)
+     return data_type

+
+ def create_fields(
+     manifest: Manifest, model_unique_id: str, columns: dict[str, ColumnInfo], adapter_type: str
+ ) -> dict[str, Field]:
+     fields = {column.name: create_field(manifest, model_unique_id, column, adapter_type) for column in columns.values()}
      return fields
+
+
+ def get_column_tests(manifest: Manifest, model_name: str, column_name: str) -> list[dict[str, str]]:
+     column_tests = []
+     model_node = manifest.nodes.get(model_name)
+     if not model_node:
+         raise ValueError(f"Model {model_name} not found in manifest.")
+
+     model_unique_id = model_node.unique_id
+     test_ids = manifest.child_map.get(model_unique_id, [])
+
+     for test_id in test_ids:
+         test_node = manifest.nodes.get(test_id)
+         if not test_node or test_node.resource_type != "test":
+             continue
+
+         if not isinstance(test_node, GenericTestNode):
+             continue
+
+         if test_node.column_name != column_name:
+             continue
+
+         if test_node.config.where is not None:
+             continue
+
+         column_tests.append(
+             {
+                 "test_name": test_node.name,
+                 "test_type": test_node.test_metadata.name,
+                 "column": test_node.column_name,
+             }
+         )
+     return column_tests
+
+
+ def create_field(manifest: Manifest, model_unique_id: str, column: ColumnInfo, adapter_type: str) -> Field:
+     column_type = convert_data_type_by_adapter_type(column.data_type, adapter_type) if column.data_type else ""
+     field = Field(
+         description=column.description,
+         type=column_type,
+         tags=column.tags,
+     )
+
+     all_tests = get_column_tests(manifest, model_unique_id, column.name)
+
+     required = False
+     if any(constraint.type == ConstraintType.not_null for constraint in column.constraints):
+         required = True
+     if [test for test in all_tests if test["test_type"] == "not_null"]:
+         required = True
+     if required:
+         field.required = required
+
+     return field
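The required flag is now derived both from not_null column constraints and from generic not_null tests discovered through the manifest's child map. A rough sketch of how the new helpers could be driven directly; the manifest path and node id are hypothetical:

```python
# Rough sketch (hypothetical path and node id), not part of the diff.
from datacontract.imports.dbt_importer import get_column_tests, read_dbt_manifest

manifest = read_dbt_manifest("target/manifest.json")  # also builds the parent/child maps now
tests = get_column_tests(manifest, "model.my_project.orders", "order_id")
not_null = [t for t in tests if t["test_type"] == "not_null"]
print(not_null)  # a non-empty list would mark the field as required
```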
datacontract/imports/glue_importer.py CHANGED
@@ -1,11 +1,13 @@
- import boto3
- from typing import List, Dict, Generator
  import re
+ from typing import Dict, Generator, List
+
+ import boto3
+
  from datacontract.imports.importer import Importer
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
-     Model,
      Field,
+     Model,
      Server,
  )

datacontract/imports/iceberg_importer.py CHANGED
@@ -1,12 +1,11 @@
- from typing import Dict, Any
+ from typing import Any, Dict

- from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
-
- from pyiceberg.schema import Schema
- from pyiceberg import types as iceberg_types
  from pydantic import ValidationError
+ from pyiceberg import types as iceberg_types
+ from pyiceberg.schema import Schema

+ from datacontract.imports.importer import Importer
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from datacontract.model.exceptions import DataContractException


@@ -43,8 +42,19 @@ def import_iceberg(

      model = Model(type="table", title=table_name)

+     # Iceberg identifier_fields aren't technically primary keys since Iceberg doesn't support primary keys,
+     # but they are close enough that we can probably treat them as primary keys on the conversion.
+     # ref: https://iceberg.apache.org/spec/#identifier-field-ids
+     # this code WILL NOT support finding nested primary key fields.
+     identifier_fields_ids = schema.identifier_field_ids
+
      for field in schema.fields:
-         model.fields[field.name] = _field_from_nested_field(field)
+         model_field = _field_from_nested_field(field)
+
+         if field.field_id in identifier_fields_ids:
+             model_field.primaryKey = True
+
+         model.fields[field.name] = model_field

      data_contract_specification.models[table_name] = model
      return data_contract_specification
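Identifier fields are declared on the Iceberg schema itself. A small pyiceberg sketch of the kind of schema whose identifier field would now be exported with primaryKey set (the field names are made up):

```python
# Sketch using pyiceberg directly (made-up field names), not part of the diff.
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

schema = Schema(
    NestedField(field_id=1, name="order_id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="status", field_type=StringType(), required=False),
    identifier_field_ids=[1],
)
print(schema.identifier_field_ids)  # [1] -> order_id would become primaryKey=True in the contract
```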
datacontract/imports/importer.py CHANGED
@@ -30,6 +30,7 @@ class ImportFormat(str, Enum):
      unity = "unity"
      spark = "spark"
      iceberg = "iceberg"
+     parquet = "parquet"

      @classmethod
      def get_supported_formats(cls):
datacontract/imports/importer_factory.py CHANGED
@@ -1,6 +1,7 @@
  import importlib.util
  import sys
- from datacontract.imports.importer import ImportFormat, Importer
+
+ from datacontract.imports.importer import Importer, ImportFormat


  class ImporterFactory:
@@ -98,3 +99,8 @@ importer_factory.register_lazy_importer(
      module_path="datacontract.imports.iceberg_importer",
      class_name="IcebergImporter",
  )
+ importer_factory.register_lazy_importer(
+     name=ImportFormat.parquet,
+     module_path="datacontract.imports.parquet_importer",
+     class_name="ParquetImporter",
+ )
datacontract/imports/jsonschema_importer.py CHANGED
@@ -3,7 +3,7 @@ import json
  import fastjsonschema

  from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field, Definition
+ from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Field, Model
  from datacontract.model.exceptions import DataContractException


@@ -111,7 +111,8 @@ def schema_to_args(property_schema, is_required: bool = None) -> dict:
      nested_properties = property_schema.get("properties")
      if nested_properties is not None:
          # recursive call for complex nested properties
-         field_kwargs["fields"] = jsonschema_to_args(nested_properties, property_schema["required"])
+         required = property_schema.get("required", [])
+         field_kwargs["fields"] = jsonschema_to_args(nested_properties, required)

      return field_kwargs

datacontract/imports/odcs_v2_importer.py CHANGED
@@ -6,16 +6,16 @@ import yaml

  from datacontract.imports.importer import Importer
  from datacontract.model.data_contract_specification import (
+     DATACONTRACT_TYPES,
      Availability,
      Contact,
      DataContractSpecification,
+     Field,
      Info,
      Model,
-     Field,
      Retention,
      ServiceLevel,
      Terms,
-     DATACONTRACT_TYPES,
  )
  from datacontract.model.exceptions import DataContractException

datacontract/imports/odcs_v3_importer.py CHANGED
@@ -8,16 +8,17 @@ import yaml
  from datacontract.imports.importer import Importer
  from datacontract.lint.resources import read_resource
  from datacontract.model.data_contract_specification import (
+     DATACONTRACT_TYPES,
      Availability,
      DataContractSpecification,
+     Field,
      Info,
      Model,
-     Field,
+     Quality,
      Retention,
      Server,
      ServiceLevel,
      Terms,
-     DATACONTRACT_TYPES,
  )
  from datacontract.model.exceptions import DataContractException

@@ -193,6 +194,10 @@ def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
          model.fields = import_fields(
              odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
          )
+         if odcs_schema.get("quality") is not None:
+             # convert dict to pydantic model
+
+             model.quality = [Quality.model_validate(q) for q in odcs_schema.get("quality")]
          model.title = schema_name
          if odcs_schema.get("dataGranularityDescription") is not None:
              model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
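Schema-level quality entries from the ODCS document are validated into Quality objects. A minimal sketch of that conversion step in isolation, with a made-up SQL check:

```python
# Minimal sketch (made-up quality entry), not part of the diff.
from datacontract.model.data_contract_specification import Quality

entry = {"type": "sql", "description": "orders table must not be empty", "query": "SELECT COUNT(*) FROM orders"}
quality = Quality.model_validate(entry)  # dict -> pydantic model, as in the importer
print(quality.type, quality.query)
```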
datacontract/imports/parquet_importer.py ADDED
@@ -0,0 +1,81 @@
+ import os.path
+
+ import pyarrow
+ from pyarrow import parquet
+
+ from datacontract.imports.importer import Importer
+ from datacontract.model.data_contract_specification import (
+     DataContractSpecification,
+     Field,
+     Model,
+ )
+ from datacontract.model.exceptions import DataContractException
+
+
+ class ParquetImporter(Importer):
+     def import_source(
+         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+     ) -> DataContractSpecification:
+         return import_parquet(data_contract_specification, source)
+
+
+ def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+     # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
+     schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")
+
+     fields: dict[str, Field] = {}
+
+     arrow_schema = parquet.read_schema(source)
+     for field_name in arrow_schema.names:
+         parquet_field = arrow_schema.field(field_name)
+
+         field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")
+
+         if not parquet_field.nullable:
+             field.required = True
+
+         fields[field_name] = field
+
+     data_contract_specification.models[schema_name] = Model(fields=fields)
+
+     return data_contract_specification
+
+
+ def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
+     if pyarrow.types.is_boolean(pyarrow_field.type):
+         return Field(type="boolean")
+     if pyarrow.types.is_int32(pyarrow_field.type):
+         return Field(type="int")
+     if pyarrow.types.is_int64(pyarrow_field.type):
+         return Field(type="long")
+     if pyarrow.types.is_integer(pyarrow_field.type):
+         return Field(type="number")
+     if pyarrow.types.is_float32(pyarrow_field.type):
+         return Field(type="float")
+     if pyarrow.types.is_float64(pyarrow_field.type):
+         return Field(type="double")
+     if pyarrow.types.is_decimal(pyarrow_field.type):
+         return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
+     if pyarrow.types.is_timestamp(pyarrow_field.type):
+         return Field(type="timestamp")
+     if pyarrow.types.is_date(pyarrow_field.type):
+         return Field(type="date")
+     if pyarrow.types.is_null(pyarrow_field.type):
+         return Field(type="null")
+     if pyarrow.types.is_binary(pyarrow_field.type):
+         return Field(type="bytes")
+     if pyarrow.types.is_string(pyarrow_field.type):
+         return Field(type="string")
+     if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
+         return Field(type="map")
+     if pyarrow.types.is_struct(pyarrow_field.type):
+         return Field(type="struct")
+     if pyarrow.types.is_list(pyarrow_field.type):
+         return Field(type="array")
+
+     raise DataContractException(
+         type="schema",
+         name=f"Parse {file_format} schema",
+         reason=f"{pyarrow_field.type} currently not supported.",
+         engine="datacontract",
+     )
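The importer reads only the Parquet schema via pyarrow's footer metadata and never loads the data. A short usage sketch, assuming a freshly constructed specification exposes an empty models mapping; the file name is made up:

```python
# Usage sketch (hypothetical file name), not part of the diff.
from datacontract.imports.parquet_importer import import_parquet
from datacontract.model.data_contract_specification import DataContractSpecification

spec = import_parquet(DataContractSpecification(), "orders.snapshot.parquet")
model = spec.models["orders_snapshot"]  # dots in the file name are replaced with underscores
for name, field in model.fields.items():
    print(name, field.type, "required" if field.required else "optional")
```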
datacontract/imports/spark_importer.py CHANGED
@@ -1,9 +1,10 @@
  from pyspark.sql import DataFrame, SparkSession, types
+
  from datacontract.imports.importer import Importer
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
-     Model,
      Field,
+     Model,
      Server,
  )

datacontract/imports/sql_importer.py CHANGED
@@ -1,7 +1,7 @@
  from simple_ddl_parser import parse_from_file

  from datacontract.imports.importer import Importer
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model


  class SqlImporter(Importer):
datacontract/imports/unity_importer.py CHANGED
@@ -2,13 +2,13 @@ import json
  import os
  from typing import List, Optional

- from pyspark.sql import types
  from databricks.sdk import WorkspaceClient
- from databricks.sdk.service.catalog import TableInfo, ColumnInfo
+ from databricks.sdk.service.catalog import ColumnInfo, TableInfo
+ from pyspark.sql import types

  from datacontract.imports.importer import Importer
  from datacontract.imports.spark_importer import _field_from_struct_type
- from datacontract.model.data_contract_specification import DataContractSpecification, Model, Field
+ from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
  from datacontract.model.exceptions import DataContractException


datacontract/integration/opentelemetry.py CHANGED
@@ -12,7 +12,6 @@ from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExpo

  from datacontract.model.run import Run

-
  # Publishes metrics of a test run.
  # Metric contains the values:
  # 0 == test run passed,
datacontract/lint/lint.py CHANGED
@@ -1,9 +1,10 @@
  import abc
  from dataclasses import dataclass, field
  from enum import Enum
- from typing import Sequence, Any, cast
+ from typing import Any, Sequence, cast

  from datacontract.model.run import Check
+
  from ..model.data_contract_specification import DataContractSpecification

  """This module contains linter definitions for linting a data contract.
datacontract/lint/linters/description_linter.py CHANGED
@@ -1,4 +1,5 @@
  from datacontract.model.data_contract_specification import DataContractSpecification
+
  from ..lint import Linter, LinterResult
