datacontract-cli 0.10.11__py3-none-any.whl → 0.10.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +5 -10
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -24
  7. datacontract/engines/soda/connections/kafka.py +84 -25
  8. datacontract/export/avro_converter.py +12 -2
  9. datacontract/export/bigquery_converter.py +30 -23
  10. datacontract/export/data_caterer_converter.py +148 -0
  11. datacontract/export/dbml_converter.py +3 -2
  12. datacontract/export/exporter.py +2 -0
  13. datacontract/export/exporter_factory.py +12 -0
  14. datacontract/export/jsonschema_converter.py +13 -2
  15. datacontract/export/spark_converter.py +5 -1
  16. datacontract/export/sql_type_converter.py +65 -39
  17. datacontract/export/sqlalchemy_converter.py +169 -0
  18. datacontract/imports/avro_importer.py +1 -0
  19. datacontract/imports/bigquery_importer.py +2 -2
  20. datacontract/imports/dbml_importer.py +112 -0
  21. datacontract/imports/dbt_importer.py +67 -91
  22. datacontract/imports/glue_importer.py +62 -58
  23. datacontract/imports/importer.py +2 -1
  24. datacontract/imports/importer_factory.py +5 -0
  25. datacontract/imports/odcs_importer.py +1 -1
  26. datacontract/imports/spark_importer.py +34 -11
  27. datacontract/imports/sql_importer.py +1 -1
  28. datacontract/imports/unity_importer.py +106 -85
  29. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  30. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  31. datacontract/lint/resolve.py +10 -1
  32. datacontract/lint/urls.py +27 -13
  33. datacontract/model/data_contract_specification.py +6 -2
  34. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/METADATA +123 -32
  35. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/RECORD +39 -37
  36. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/WHEEL +1 -1
  37. datacontract/publish/publish.py +0 -32
  38. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/LICENSE +0 -0
  39. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/entry_points.txt +0 -0
  40. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.13.dist-info}/top_level.txt +0 -0
datacontract/imports/dbt_importer.py
@@ -1,117 +1,93 @@
 import json
-
-from typing import (
-    List,
-)
+from typing import TypedDict
 
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model
+from dbt.artifacts.resources.v1.components import ColumnInfo
+from dbt.contracts.graph.manifest import Manifest
+
+
+class DBTImportArgs(TypedDict, total=False):
+    """
+    A dictionary representing arguments for importing dbt models.
+    Makes the dbt importer more customizable by allowing flexible filtering
+    of models and their properties, through wrapping or extending.
+
+    Attributes:
+        dbt_nodes: The names of the models to be used in the contract. All by default.
+        resource_types: Nodes listed in resource_types are kept while importing. "model" by default.
+    """
+
+    dbt_nodes: list[str]
+    resource_types: list[str]
 
 
 class DbtManifestImporter(Importer):
     def import_source(
-        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
-        data = read_dbt_manifest(manifest_path=source)
+        self,
+        data_contract_specification: DataContractSpecification,
+        source: str,
+        import_args: DBTImportArgs,
+    ) -> DataContractSpecification:
+        manifest = read_dbt_manifest(manifest_path=source)
         return import_dbt_manifest(
-            data_contract_specification, manifest_dict=data, dbt_models=import_args.get("dbt_model")
+            data_contract_specification=data_contract_specification,
+            manifest=manifest,
+            dbt_nodes=import_args.get("dbt_nodes", []),
+            resource_types=import_args.get("resource_types", ["model"]),
         )
 
 
-def import_dbt_manifest(
-    data_contract_specification: DataContractSpecification, manifest_dict: dict, dbt_models: List[str]
-):
-    data_contract_specification.info.title = manifest_dict.get("info").get("project_name")
-    data_contract_specification.info.dbt_version = manifest_dict.get("info").get("dbt_version")
+def read_dbt_manifest(manifest_path: str) -> Manifest:
+    """Read a manifest from file."""
+    with open(file=manifest_path, mode="r", encoding="utf-8") as f:
+        manifest_dict: dict = json.load(f)
+    return Manifest.from_dict(manifest_dict)
 
-    if data_contract_specification.models is None:
-        data_contract_specification.models = {}
 
-    for model in manifest_dict.get("models", []):
-        if dbt_models and model.name not in dbt_models:
+def import_dbt_manifest(
+    data_contract_specification: DataContractSpecification,
+    manifest: Manifest,
+    dbt_nodes: list[str],
+    resource_types: list[str],
+) -> DataContractSpecification:
+    """
+    Extracts all relevant information from the manifest
+    and puts it into a data contract specification.
+    """
+    data_contract_specification.info.title = manifest.metadata.project_name
+    data_contract_specification.info.dbt_version = manifest.metadata.dbt_version
+
+    data_contract_specification.models = data_contract_specification.models or {}
+    for model_contents in manifest.nodes.values():
+        # Only interested in processing models.
+        if model_contents.resource_type not in resource_types:
+            continue
+
+        # Allow args stored in dbt_nodes to filter the relevant models.
+        # If dbt_nodes is empty, use all models.
+        if dbt_nodes and model_contents.name not in dbt_nodes:
             continue
 
         dc_model = Model(
-            description=model.description,
-            tags=model.tags,
-            fields=create_fields(model.columns),
+            description=model_contents.description,
+            tags=model_contents.tags,
+            fields=create_fields(columns=model_contents.columns),
         )
 
-        data_contract_specification.models[model.name] = dc_model
+        data_contract_specification.models[model_contents.name] = dc_model
 
     return data_contract_specification
 
 
-def create_fields(columns: List):
-    fields = {}
-    for column in columns:
-        field = Field(
-            description=column.description, type=column.data_type if column.data_type else "", tags=column.tags
+def create_fields(columns: dict[str, ColumnInfo]) -> dict[str, Field]:
+    fields = {
+        column.name: Field(
+            description=column.description,
+            type=column.data_type if column.data_type else "",
+            tags=column.tags,
         )
-        fields[column.name] = field
+        for column in columns.values()
+    }
 
     return fields
-
-
-def read_dbt_manifest(manifest_path: str):
-    with open(manifest_path, "r", encoding="utf-8") as f:
-        manifest = json.load(f)
-    return {"info": manifest.get("metadata"), "models": create_manifest_models(manifest)}
-
-
-def create_manifest_models(manifest: dict) -> List:
-    models = []
-    nodes = manifest.get("nodes")
-
-    for node in nodes.values():
-        if node["resource_type"] != "model":
-            continue
-
-        models.append(DbtModel(node))
-    return models
-
-
-class DbtColumn:
-    name: str
-    description: str
-    data_type: str
-    meta: dict
-    tags: List
-
-    def __init__(self, node_column: dict):
-        self.name = node_column.get("name")
-        self.description = node_column.get("description")
-        self.data_type = node_column.get("data_type")
-        self.meta = node_column.get("meta", {})
-        self.tags = node_column.get("tags", [])
-
-    def __repr__(self) -> str:
-        return self.name
-
-
-class DbtModel:
-    name: str
-    database: str
-    schema: str
-    description: str
-    unique_id: str
-    tags: List
-
-    def __init__(self, node: dict):
-        self.name = node.get("name")
-        self.database = node.get("database")
-        self.schema = node.get("schema")
-        self.description = node.get("description")
-        self.display_name = node.get("display_name")
-        self.unique_id = node.get("unique_id")
-        self.columns = []
-        self.tags = node.get("tags")
-        if node.get("columns"):
-            self.add_columns(node.get("columns").values())
-
-    def add_columns(self, model_columns: List):
-        for column in model_columns:
-            self.columns.append(DbtColumn(column))
-
-    def __repr__(self) -> str:
-        return self.name
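For orientation, the reworked import path can be exercised directly through the two module-level functions shown above. A minimal sketch, assuming DataContractSpecification is default-constructible; the manifest path is hypothetical, not taken from the diff:

from datacontract.imports.dbt_importer import import_dbt_manifest, read_dbt_manifest
from datacontract.model.data_contract_specification import DataContractSpecification

# "target/manifest.json" is a hypothetical path.
manifest = read_dbt_manifest(manifest_path="target/manifest.json")
spec = import_dbt_manifest(
    data_contract_specification=DataContractSpecification(),  # assumed default-constructible
    manifest=manifest,
    dbt_nodes=[],              # empty list: keep every matching node
    resource_types=["model"],  # mirror the importer's default filter
)
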
datacontract/imports/glue_importer.py
@@ -1,6 +1,6 @@
 import boto3
-from typing import List
-
+from typing import List, Dict, Generator
+import re
 from datacontract.imports.importer import Importer
 from datacontract.model.data_contract_specification import (
     DataContractSpecification,
@@ -13,7 +13,7 @@ from datacontract.model.data_contract_specification import (
 class GlueImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
         return import_glue(data_contract_specification, source, import_args.get("glue_table"))
 
 
@@ -39,7 +39,7 @@ def get_glue_database(database_name: str):
 
     return (
         response["Database"]["CatalogId"],
-        response["Database"].get("LocationUri", "None"),
+        response["Database"].get("LocationUri"),
     )
 
 
@@ -75,7 +75,7 @@ def get_glue_tables(database_name: str) -> List[str]:
     return table_names
 
 
-def get_glue_table_schema(database_name: str, table_name: str):
+def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
     """Get the schema of a Glue table.
 
     Args:
@@ -93,11 +93,11 @@ def get_glue_table_schema(database_name: str, table_name: str):
         response = glue.get_table(DatabaseName=database_name, Name=table_name)
     except glue.exceptions.EntityNotFoundException:
         print(f"Table {table_name} not found in database {database_name}.")
-        return {}
+        return []
     except Exception as e:
         # todo catch all
         print(f"Error: {e}")
-        return {}
+        return []
 
     table_schema = response["Table"]["StorageDescriptor"]["Columns"]
 
@@ -109,10 +109,9 @@ def get_glue_table_schema(database_name: str, table_name: str):
                 "Name": pk["Name"],
                 "Type": pk["Type"],
                 "Hive": True,
-                "Comment": "Partition Key",
+                "Comment": pk.get("Comment"),
             }
         )
-
     return table_schema
 
 
@@ -120,7 +119,7 @@ def import_glue(
     data_contract_specification: DataContractSpecification,
     source: str,
     table_names: List[str],
-):
+) -> DataContractSpecification:
     """Import the schema of a Glue database.
 
     Args:
@@ -140,8 +139,13 @@ def import_glue(
     if table_names is None:
         table_names = get_glue_tables(source)
 
+    server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+    if location_uri:
+        server_kwargs["location"] = location_uri
+
     data_contract_specification.servers = {
-        "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
+        "production": Server(**server_kwargs),
     }
 
     for table_name in table_names:
@@ -161,12 +165,6 @@ def import_glue(
             field.description = column.get("Comment")
             fields[column["Name"]] = field
 
-            if "decimal" in column["Type"]:
-                # Extract precision and scale from the string
-                perc_scale = column["Type"][8:-1].split(",")
-                field.precision = int(perc_scale[0])
-                field.scale = int(perc_scale[1])
-
         data_contract_specification.models[table_name] = Model(
             type="table",
             fields=fields,
@@ -186,27 +184,43 @@ def create_typed_field(dtype: str) -> Field:
     """
     field = Field()
     dtype = dtype.strip().lower().replace(" ", "")
-    if dtype.startswith(("array", "struct", "map")):
-        orig_dtype: str = dtype
-        if dtype.startswith("array"):
-            field.type = "array"
-            field.items = create_typed_field(orig_dtype[6:-1])
-        elif dtype.startswith("struct"):
-            field.type = "struct"
-            for f in split_struct(orig_dtype[7:-1]):
-                field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
-        elif dtype.startswith("map"):
-            field.type = "map"
-            key_type = orig_dtype[4:-1].split(",", 1)[0]
-            value_type = orig_dtype[4:-1].split(",", 1)[1]
+    # Example: array<string>
+    if dtype.startswith("array"):
+        field.type = "array"
+        field.items = create_typed_field(dtype[6:-1])
+    # Example: struct<field1:float,field2:string>
+    elif dtype.startswith("struct"):
+        field.type = "struct"
+        for f in split_struct(dtype[7:-1]):
+            field_name, field_key = f.split(":", 1)
+            field.fields[field_name] = create_typed_field(field_key)
+    # Example: map<string,int>
+    elif dtype.startswith("map"):
+        field.type = "map"
+        map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+        if map_match:
+            key_type = map_match.group(1)
+            value_type = map_match.group(2)
             field.keys = create_typed_field(key_type)
             field.values = create_typed_field(value_type)
+    # Example: decimal(38, 6) or decimal
+    elif dtype.startswith("decimal"):
+        field.type = "decimal"
+        decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+        if decimal_match:  # if precision specified
+            field.precision = int(decimal_match.group(1))
+            field.scale = int(decimal_match.group(2))
+    # Example: varchar(255) or varchar
+    elif dtype.startswith("varchar"):
+        field.type = "varchar"
+        if len(dtype) > 7:
+            field.maxLength = int(dtype[8:-1])
     else:
         field.type = map_type_from_sql(dtype)
     return field
 
 
-def split_fields(s: str):
+def split_fields(s: str) -> Generator[str, None, None]:
     """Split a string of fields considering nested structures.
 
     Args:
@@ -253,30 +267,20 @@ def map_type_from_sql(sql_type: str) -> str:
         return None
 
     sql_type = sql_type.lower()
-    if sql_type.startswith("varchar"):
-        return "varchar"
-    if sql_type.startswith("string"):
-        return "string"
-    if sql_type.startswith("text"):
-        return "text"
-    if sql_type.startswith("byte"):
-        return "byte"
-    if sql_type.startswith("short"):
-        return "short"
-    if sql_type.startswith("integer") or sql_type.startswith("int"):
-        return "integer"
-    if sql_type.startswith("long") or sql_type.startswith("bigint"):
-        return "long"
-    if sql_type.startswith("float"):
-        return "float"
-    if sql_type.startswith("double"):
-        return "double"
-    if sql_type.startswith("boolean"):
-        return "boolean"
-    if sql_type.startswith("timestamp"):
-        return "timestamp"
-    if sql_type.startswith("date"):
-        return "date"
-    if sql_type.startswith("decimal"):
-        return "decimal"
-    return "variant"
+
+    type_mapping = {
+        "string": "string",
+        "int": "int",
+        "bigint": "bigint",
+        "float": "float",
+        "double": "double",
+        "boolean": "boolean",
+        "timestamp": "timestamp",
+        "date": "date",
+    }
+
+    for prefix, mapped_type in type_mapping.items():
+        if sql_type.startswith(prefix):
+            return mapped_type
+
+    return "unknown"
datacontract/imports/importer.py
@@ -14,7 +14,7 @@ class Importer(ABC):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) -> dict:
+    ) -> DataContractSpecification:
         pass
 
 
@@ -22,6 +22,7 @@ class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
     dbt = "dbt"
+    dbml = "dbml"
     glue = "glue"
     jsonschema = "jsonschema"
     bigquery = "bigquery"
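The tightened abstract signature means custom importers should now return the specification object rather than a plain dict. A minimal sketch of a conforming subclass; the class name and its behaviour are hypothetical:

from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import DataContractSpecification


class MyCustomImporter(Importer):  # hypothetical importer
    def import_source(
        self,
        data_contract_specification: DataContractSpecification,
        source: str,
        import_args: dict,
    ) -> DataContractSpecification:  # previously annotated as -> dict
        # Populate data_contract_specification.models from `source` here.
        return data_contract_specification
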
datacontract/imports/importer_factory.py
@@ -88,3 +88,8 @@ importer_factory.register_lazy_importer
 importer_factory.register_lazy_importer(
     name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.dbml,
+    module_path="datacontract.imports.dbml_importer",
+    class_name="DBMLImporter",
+)

datacontract/imports/odcs_importer.py
@@ -46,7 +46,7 @@ DATACONTRACT_TYPES = [
 class OdcsImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
         return import_odcs(data_contract_specification, source)
 
 
datacontract/imports/spark_importer.py
@@ -14,7 +14,7 @@ class SparkImporter(Importer):
         data_contract_specification: DataContractSpecification,
         source: str,
         import_args: dict,
-    ) -> dict:
+    ) -> DataContractSpecification:
         """
         Imports data from a Spark source into the data contract specification.
 
@@ -63,12 +63,12 @@ def import_from_spark_df(df: DataFrame) -> Model:
     schema = df.schema
 
     for field in schema:
-        model.fields[field.name] = _field_from_spark(field)
+        model.fields[field.name] = _field_from_struct_type(field)
 
     return model
 
 
-def _field_from_spark(spark_field: types.StructField) -> Field:
+def _field_from_struct_type(spark_field: types.StructField) -> Field:
     """
     Converts a Spark StructField into a Field object for the data contract.
 
@@ -76,18 +76,37 @@ def _field_from_spark(spark_field: types.StructField) -> Field:
         spark_field: The Spark StructField to convert.
 
     Returns:
-        Field: The corresponding Field object.
+        Field: The generated Field object.
     """
-    field_type = _data_type_from_spark(spark_field.dataType)
     field = Field()
-    field.type = field_type
     field.required = not spark_field.nullable
+    field.description = spark_field.metadata.get("comment")
 
-    if field_type == "array":
-        field.items = _field_from_spark(spark_field.dataType.elementType)
+    return _type_from_data_type(field, spark_field.dataType)
 
-    if field_type == "struct":
-        field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+
+def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+    """
+    Maps Spark data types to the Data Contract type system and updates the field.
+
+    Args:
+        field: The Field object to update.
+        spark_type: The Spark data type to map.
+
+    Returns:
+        Field: The updated Field object.
+    """
+    field.type = _data_type_from_spark(spark_type)
+
+    if field.type == "array":
+        field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+    elif field.type == "map":
+        field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+        field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+    elif field.type == "struct":
+        field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}
 
     return field
 
@@ -104,7 +123,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
     """
     if isinstance(spark_type, types.StringType):
         return "string"
-    elif isinstance(spark_type, types.IntegerType):
+    elif isinstance(spark_type, (types.IntegerType, types.ShortType)):
         return "integer"
     elif isinstance(spark_type, types.LongType):
         return "long"
@@ -116,6 +135,8 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "struct"
     elif isinstance(spark_type, types.ArrayType):
         return "array"
+    elif isinstance(spark_type, types.MapType):
+        return "map"
     elif isinstance(spark_type, types.TimestampType):
         return "timestamp"
     elif isinstance(spark_type, types.TimestampNTZType):
@@ -130,5 +151,7 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
         return "decimal"
     elif isinstance(spark_type, types.NullType):
         return "null"
+    elif isinstance(spark_type, types.VarcharType):
+        return "varchar"
     else:
         raise ValueError(f"Unsupported Spark type: {spark_type}")
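A sketch of the new MapType handling; the StructField below is illustrative, and the expected attribute values follow from the mapping logic in this diff:

from pyspark.sql import types

from datacontract.imports.spark_importer import _field_from_struct_type

# A hypothetical nullable map column with a comment in its metadata.
struct_field = types.StructField(
    "prices",
    types.MapType(types.StringType(), types.DecimalType(10, 2), valueContainsNull=True),
    nullable=True,
    metadata={"comment": "price per SKU"},  # surfaced as field.description
)
field = _field_from_struct_type(struct_field)
# field.type == "map"; field.keys.type == "string"; field.values.type == "decimal"
# field.required is False because the column is nullable
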
datacontract/imports/sql_importer.py
@@ -7,7 +7,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
 class SqlImporter(Importer):
     def import_source(
         self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-    ) -> dict:
+    ) -> DataContractSpecification:
         return import_sql(data_contract_specification, self.import_format, source)
 
 