datacontract-cli 0.10.9__py3-none-any.whl → 0.10.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datacontract/cli.py CHANGED
@@ -226,6 +226,12 @@ def import_(
     unity_table_full_name: Annotated[
         Optional[str], typer.Option(help="Full name of a table in the unity catalog")
     ] = None,
+    dbt_model: Annotated[
+        Optional[List[str]],
+        typer.Option(
+            help="List of models names to import from the dbt manifest file (repeat for multiple models names, leave empty for all models in the dataset)."
+        ),
+    ] = None,
 ):
     """
     Create a data contract from the given source location. Prints to stdout.
@@ -238,6 +244,7 @@ def import_(
         bigquery_project=bigquery_project,
         bigquery_dataset=bigquery_dataset,
         unity_table_full_name=unity_table_full_name,
+        dbt_model=dbt_model,
     )
     console.print(result.to_yaml())
 
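The new option is a repeatable typer flag (the dbt_model parameter maps to --dbt-model under typer's default underscore-to-dash conversion), so an invocation along the lines of "datacontract import --format dbt --source target/manifest.json --dbt-model orders --dbt-model customers" would import only the named models, while omitting the flag imports every model found in the manifest.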
@@ -4,7 +4,9 @@ import tempfile
 import typing
 
 import yaml
-from pyspark.sql import SparkSession
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
 
 from datacontract.breaking.breaking import models_breaking_changes, quality_breaking_changes
 from datacontract.engines.datacontract.check_that_datacontract_contains_valid_servers_configuration import (
@@ -43,7 +45,7 @@ class DataContract:
         examples: bool = False,
         publish_url: str = None,
         publish_to_opentelemetry: bool = False,
-        spark: SparkSession = None,
+        spark: "SparkSession" = None,
         inline_definitions: bool = False,
         inline_quality: bool = False,
     ):
@@ -1,6 +1,9 @@
 import logging
+import typing
+
+if typing.TYPE_CHECKING:
+    from pyspark.sql import SparkSession
 
-from pyspark.sql import SparkSession
 from soda.scan import Scan
 
 from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
@@ -17,7 +20,7 @@ from datacontract.model.run import Run, Check, Log
 
 
 def check_soda_execute(
-    run: Run, data_contract: DataContractSpecification, server: Server, spark: SparkSession, tmp_dir
+    run: Run, data_contract: DataContractSpecification, server: Server, spark: "SparkSession", tmp_dir
 ):
     if data_contract is None:
         run.log_warn("Cannot run engine soda-core, as data contract is invalid")
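Both files apply the same pattern: the pyspark import moves behind typing.TYPE_CHECKING and the annotation is quoted, so the module stays importable when pyspark is not installed and the dependency is only needed if a caller actually hands over a Spark session. A minimal standalone sketch of the pattern (the function name is illustrative, not from the codebase):

import typing

if typing.TYPE_CHECKING:
    # Seen by static type checkers only; never executed at runtime,
    # so pyspark does not have to be installed to import this module.
    from pyspark.sql import SparkSession


def run_with_spark(spark: "SparkSession" = None) -> None:
    # The quoted annotation keeps the signature valid without pyspark;
    # the session object is only touched when one is actually passed in.
    if spark is not None:
        spark.sql("SELECT 1").show()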
@@ -50,6 +50,10 @@ def get_duckdb_connection(data_contract, server, run: Run):
         )
     elif server.format == "delta":
         if server.type == "azure":
+            # After switching to native delta table support
+            # in https://github.com/datacontract/datacontract-cli/issues/258,
+            # azure storage should also work
+            # https://github.com/duckdb/duckdb_delta/issues/21
             raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
 
         storage_options = {
@@ -65,7 +65,7 @@ def to_avro_type(field: Field, field_name: str) -> str | dict:
         if field.config["avroLogicalType"] in ["time-millis", "date"]:
             return {"type": "int", "logicalType": field.config["avroLogicalType"]}
         if "avroType" in field.config:
-            return field.config["avroLogicalType"]
+            return field.config["avroType"]
 
     if field.type is None:
         return "null"
@@ -131,7 +131,7 @@ def check_field_minimum(field_name, minimum, quote_field_name: bool = False):
        field_name = f'"{field_name}"'
    return {
        f"invalid_count({field_name}) = 0": {
-            "name": f"Check that field {field_name} has a minimum of {min}",
+            "name": f"Check that field {field_name} has a minimum of {minimum}",
            "valid min": minimum,
        }
    }
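The old f-string interpolated Python's builtin min instead of the minimum parameter, so generated check names read "Check that field x has a minimum of <built-in function min>"; the fix makes the name report the configured threshold.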
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 import avro.schema
 
 from datacontract.imports.importer import Importer
@@ -6,13 +8,39 @@ from datacontract.model.exceptions import DataContractException
 
 
 class AvroImporter(Importer):
+    """Class to import Avro Schema file"""
+
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
+        """
+        Import Avro schema from a source file.
+
+        Args:
+            data_contract_specification: The data contract specification to update.
+            source: The path to the Avro schema file.
+            import_args: Additional import arguments.
+
+        Returns:
+            The updated data contract specification.
+        """
         return import_avro(data_contract_specification, source)
 
 
 def import_avro(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
+    """
+    Import an Avro schema from a file and update the data contract specification.
+
+    Args:
+        data_contract_specification: The data contract specification to update.
+        source: The path to the Avro schema file.
+
+    Returns:
+        DataContractSpecification: The updated data contract specification.
+
+    Raises:
+        DataContractException: If there's an error parsing the Avro schema.
+    """
     if data_contract_specification.models is None:
         data_contract_specification.models = {}
 
@@ -45,7 +73,14 @@ def import_avro(data_contract_specification: DataContractSpecification, source:
     return data_contract_specification
 
 
-def handle_config_avro_custom_properties(field, imported_field):
+def handle_config_avro_custom_properties(field: avro.schema.Field, imported_field: Field) -> None:
+    """
+    Handle custom Avro properties and add them to the imported field's config.
+
+    Args:
+        field: The Avro field.
+        imported_field: The imported field to update.
+    """
     if field.get_prop("logicalType") is not None:
         if imported_field.config is None:
             imported_field.config = {}
@@ -57,7 +92,16 @@ def handle_config_avro_custom_properties(field, imported_field):
         imported_field.config["avroDefault"] = field.default
 
 
-def import_record_fields(record_fields):
+def import_record_fields(record_fields: List[avro.schema.Field]) -> Dict[str, Field]:
+    """
+    Import Avro record fields and convert them to data contract fields.
+
+    Args:
+        record_fields: List of Avro record fields.
+
+    Returns:
+        A dictionary of imported fields.
+    """
     imported_fields = {}
     for field in record_fields:
         imported_field = Field()
@@ -83,6 +127,15 @@ def import_record_fields(record_fields):
         elif field.type.type == "array":
             imported_field.type = "array"
             imported_field.items = import_avro_array_items(field.type)
+        elif field.type.type == "map":
+            imported_field.type = "map"
+            imported_field.values = import_avro_map_values(field.type)
+        elif field.type.type == "enum":
+            imported_field.type = "string"
+            imported_field.enum = field.type.symbols
+            if not imported_field.config:
+                imported_field.config = {}
+            imported_field.config["avroType"] = "enum"
         else:  # primitive type
             imported_field.type = map_type_from_avro(field.type.type)
 
@@ -91,7 +144,16 @@ def import_record_fields(record_fields):
     return imported_fields
 
 
-def import_avro_array_items(array_schema):
+def import_avro_array_items(array_schema: avro.schema.ArraySchema) -> Field:
+    """
+    Import Avro array items and convert them to a data contract field.
+
+    Args:
+        array_schema: The Avro array schema.
+
+    Returns:
+        Field: The imported field representing the array items.
+    """
     items = Field()
     for prop in array_schema.other_props:
         items.__setattr__(prop, array_schema.other_props[prop])
@@ -108,7 +170,45 @@ def import_avro_array_items(array_schema):
     return items
 
 
-def import_type_of_optional_field(field):
+def import_avro_map_values(map_schema: avro.schema.MapSchema) -> Field:
+    """
+    Import Avro map values and convert them to a data contract field.
+
+    Args:
+        map_schema: The Avro map schema.
+
+    Returns:
+        Field: The imported field representing the map values.
+    """
+    values = Field()
+    for prop in map_schema.other_props:
+        values.__setattr__(prop, map_schema.other_props[prop])
+
+    if map_schema.values.type == "record":
+        values.type = "object"
+        values.fields = import_record_fields(map_schema.values.fields)
+    elif map_schema.values.type == "array":
+        values.type = "array"
+        values.items = import_avro_array_items(map_schema.values)
+    else:  # primitive type
+        values.type = map_type_from_avro(map_schema.values.type)
+
+    return values
+
+
+def import_type_of_optional_field(field: avro.schema.Field) -> str:
+    """
+    Determine the type of optional field in an Avro union.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        str: The mapped type of the non-null field in the union.
+
+    Raises:
+        DataContractException: If no non-null type is found in the union.
+    """
     for field_type in field.type.schemas:
         if field_type.type != "null":
             return map_type_from_avro(field_type.type)
@@ -121,21 +221,51 @@ def import_type_of_optional_field(field):
     )
 
 
-def get_record_from_union_field(field):
+def get_record_from_union_field(field: avro.schema.Field) -> avro.schema.RecordSchema | None:
+    """
+    Get the record schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The record schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "record":
             return field_type
     return None
 
 
-def get_array_from_union_field(field):
+def get_array_from_union_field(field: avro.schema.Field) -> avro.schema.ArraySchema | None:
+    """
+    Get the array schema from a union field.
+
+    Args:
+        field: The Avro field with a union type.
+
+    Returns:
+        The array schema if found, None otherwise.
+    """
     for field_type in field.type.schemas:
         if field_type.type == "array":
             return field_type
     return None
 
 
-def map_type_from_avro(avro_type_str: str):
+def map_type_from_avro(avro_type_str: str) -> str:
+    """
+    Map Avro type strings to data contract type strings.
+
+    Args:
+        avro_type_str (str): The Avro type string.
+
+    Returns:
+        str: The corresponding data contract type string.
+
+    Raises:
+        DataContractException: If the Avro type is unsupported.
+    """
     # TODO: ambiguous mapping in the export
     if avro_type_str == "null":
         return "null"
@@ -155,6 +285,10 @@ def map_type_from_avro(avro_type_str: str):
         return "record"
     elif avro_type_str == "array":
         return "array"
+    elif avro_type_str == "map":
+        return "map"
+    elif avro_type_str == "enum":
+        return "string"
     else:
         raise DataContractException(
             type="schema",
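With the new branches, map and enum fields are picked up instead of failing as unsupported types: a map becomes a field of type map whose values are imported recursively, and an enum is flattened to a string that keeps its symbols in enum and records the original Avro type under config["avroType"]. A small, hypothetical schema exercising both branches (not taken from the test suite):

import avro.schema

# Hypothetical record with a map field and an enum field.
schema = avro.schema.parse("""
{
  "type": "record",
  "name": "Order",
  "fields": [
    {"name": "labels", "type": {"type": "map", "values": "string"}},
    {"name": "status", "type": {"type": "enum", "name": "Status", "symbols": ["OPEN", "SHIPPED"]}}
  ]
}
""")
# Expected import result (abridged): labels -> type "map", values.type "string";
# status -> type "string", enum ["OPEN", "SHIPPED"], config {"avroType": "enum"}.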
@@ -0,0 +1,117 @@
+import json
+
+from typing import (
+    List,
+)
+
+from datacontract.imports.importer import Importer
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+
+
+class DbtManifestImporter(Importer):
+    def import_source(
+        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+    ) -> dict:
+        data = read_dbt_manifest(manifest_path=source)
+        return import_dbt_manifest(
+            data_contract_specification, manifest_dict=data, dbt_models=import_args.get("dbt_model")
+        )
+
+
+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification, manifest_dict: dict, dbt_models: List[str]
+):
+    data_contract_specification.info.title = manifest_dict.get("info").get("project_name")
+    data_contract_specification.info.dbt_version = manifest_dict.get("info").get("dbt_version")
+
+    if data_contract_specification.models is None:
+        data_contract_specification.models = {}
+
+    for model in manifest_dict.get("models", []):
+        if dbt_models and model.name not in dbt_models:
+            continue
+
+        dc_model = Model(
+            description=model.description,
+            tags=model.tags,
+            fields=create_fields(model.columns),
+        )
+
+        data_contract_specification.models[model.name] = dc_model
+
+    return data_contract_specification
+
+
+def create_fields(columns: List):
+    fields = {}
+    for column in columns:
+        field = Field(
+            description=column.description, type=column.data_type if column.data_type else "", tags=column.tags
+        )
+        fields[column.name] = field
+
+    return fields
+
+
+def read_dbt_manifest(manifest_path: str):
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        manifest = json.load(f)
+        return {"info": manifest.get("metadata"), "models": create_manifest_models(manifest)}
+
+
+def create_manifest_models(manifest: dict) -> List:
+    models = []
+    nodes = manifest.get("nodes")
+
+    for node in nodes.values():
+        if node["resource_type"] != "model":
+            continue
+
+        models.append(DbtModel(node))
+    return models
+
+
+class DbtColumn:
+    name: str
+    description: str
+    data_type: str
+    meta: dict
+    tags: List
+
+    def __init__(self, node_column: dict):
+        self.name = node_column.get("name")
+        self.description = node_column.get("description")
+        self.data_type = node_column.get("data_type")
+        self.meta = node_column.get("meta", {})
+        self.tags = node_column.get("tags", [])
+
+    def __repr__(self) -> str:
+        return self.name
+
+
+class DbtModel:
+    name: str
+    database: str
+    schema: str
+    description: str
+    unique_id: str
+    tags: List
+
+    def __init__(self, node: dict):
+        self.name = node.get("name")
+        self.database = node.get("database")
+        self.schema = node.get("schema")
+        self.description = node.get("description")
+        self.display_name = node.get("display_name")
+        self.unique_id = node.get("unique_id")
+        self.columns = []
+        self.tags = node.get("tags")
+        if node.get("columns"):
+            self.add_columns(node.get("columns").values())
+
+    def add_columns(self, model_columns: List):
+        for column in model_columns:
+            self.columns.append(DbtColumn(column))
+
+    def __repr__(self) -> str:
+        return self.name
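The new importer can also be driven directly, which is handy for inspecting what the manifest parsing yields. A minimal sketch, assuming a manifest produced by dbt at target/manifest.json and a default-constructed specification (the CLI normally seeds one from its init template):

from datacontract.imports.dbt_importer import import_dbt_manifest, read_dbt_manifest
from datacontract.model.data_contract_specification import DataContractSpecification

manifest = read_dbt_manifest("target/manifest.json")
print([model.name for model in manifest["models"]])  # every dbt model found in the manifest

# Restrict the import to selected models; passing None or an empty list imports all of them.
spec = import_dbt_manifest(DataContractSpecification(), manifest, dbt_models=["orders"])
print(spec.to_yaml())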
@@ -14,7 +14,7 @@ class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
     ) -> dict:
-        return import_glue(data_contract_specification, source, import_args.get("glue_tables"))
+        return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
 def get_glue_database(database_name: str):
@@ -154,7 +154,7 @@ def import_glue(
        for column in table_schema:
            field = create_typed_field(column["Type"])
 
-            # hive partitons are required, but are not primary keys
+            # hive partitions are required, but are not primary keys
            if column.get("Hive"):
                field.required = True
 
@@ -10,7 +10,10 @@ class Importer(ABC):
 
     @abstractmethod
     def import_source(
-        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: dict,
     ) -> dict:
         pass
 
@@ -18,11 +21,13 @@ class Importer(ABC):
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    dbt = "dbt"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
     odcs = "odcs"
     unity = "unity"
+    spark = "spark"
 
     @classmethod
     def get_suported_formats(cls):
@@ -18,7 +18,7 @@ class ImporterFactory:
         importers = self.dict_importer.copy()
         importers.update(self.dict_lazy_importer.copy())
         if name not in importers.keys():
-            raise ValueError(f"The '{name}' format is not suportted.")
+            raise ValueError(f"The '{name}' format is not supported.")
         importer_class = importers[name]
         if type(importers[name]) is tuple:
             importer_class = load_module_class(module_path=importers[name][0], class_name=importers[name][1])
@@ -46,7 +46,9 @@ def load_module_class(module_path, class_name):
 
 importer_factory = ImporterFactory()
 importer_factory.register_lazy_importer(
-    name=ImportFormat.avro, module_path="datacontract.imports.avro_importer", class_name="AvroImporter"
+    name=ImportFormat.avro,
+    module_path="datacontract.imports.avro_importer",
+    class_name="AvroImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.bigquery,
@@ -54,7 +56,9 @@ importer_factory.register_lazy_importer(
     class_name="BigQueryImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.glue, module_path="datacontract.imports.glue_importer", class_name="GlueImporter"
+    name=ImportFormat.glue,
+    module_path="datacontract.imports.glue_importer",
+    class_name="GlueImporter",
 )
 importer_factory.register_lazy_importer(
     name=ImportFormat.jsonschema,
@@ -62,11 +66,25 @@ importer_factory.register_lazy_importer(
     class_name="JsonSchemaImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.odcs, module_path="datacontract.imports.odcs_importer", class_name="OdcsImporter"
+    name=ImportFormat.odcs,
+    module_path="datacontract.imports.odcs_importer",
+    class_name="OdcsImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.sql, module_path="datacontract.imports.sql_importer", class_name="SqlImporter"
+    name=ImportFormat.sql,
+    module_path="datacontract.imports.sql_importer",
+    class_name="SqlImporter",
 )
 importer_factory.register_lazy_importer(
-    name=ImportFormat.unity, module_path="datacontract.imports.unity_importer", class_name="UnityImporter"
+    name=ImportFormat.unity,
+    module_path="datacontract.imports.unity_importer",
+    class_name="UnityImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.spark,
+    module_path="datacontract.imports.spark_importer",
+    class_name="SparkImporter",
+)
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
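Lazy registration keeps a (module_path, class_name) pair per format and defers the actual import to load_module_class, so optional importer dependencies are only loaded when their format is requested. Reduced to a standalone sketch (names are illustrative, not the project's API):

import importlib

_lazy_importers = {"dbt": ("datacontract.imports.dbt_importer", "DbtManifestImporter")}


def load_importer(format_name: str):
    # The importer module is imported only at the moment its format is used.
    module_path, class_name = _lazy_importers[format_name]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)()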
@@ -64,11 +64,14 @@ def convert_json_schema_properties(properties, is_definition=False):
            case "tags":
                field_kwargs["tags"] = value
            case "properties":
-                field_kwargs["fields"] = convert_json_schema_properties(value)
+                field_kwargs["fields"] = convert_json_schema_properties(value, is_definition=is_definition)
            case "items":
-                field_kwargs["items"] = convert_json_schema_properties(value)
+                field_kwargs["items"] = convert_json_schema_properties(value, is_definition=is_definition)
 
-        field = Field(**field_kwargs)
+        if is_definition:
+            field = Definition(**field_kwargs)
+        else:
+            field = Field(**field_kwargs)
         fields[field_name] = field
 
     return fields
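With is_definition now propagated into nested properties and items, objects converted as definitions produce Definition instances at every level of nesting instead of falling back to Field after the first level; ordinary model fields are unaffected because the flag defaults to False.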