datacontract-cli 0.10.11__py3-none-any.whl → 0.10.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)
  1. datacontract/cli.py +19 -3
  2. datacontract/data_contract.py +5 -10
  3. datacontract/engines/fastjsonschema/check_jsonschema.py +11 -0
  4. datacontract/engines/fastjsonschema/s3/s3_read_files.py +2 -0
  5. datacontract/engines/soda/check_soda_execute.py +2 -8
  6. datacontract/engines/soda/connections/duckdb.py +23 -24
  7. datacontract/engines/soda/connections/kafka.py +81 -23
  8. datacontract/export/avro_converter.py +12 -2
  9. datacontract/export/dbml_converter.py +3 -2
  10. datacontract/export/exporter.py +1 -0
  11. datacontract/export/exporter_factory.py +6 -0
  12. datacontract/export/spark_converter.py +4 -0
  13. datacontract/export/sql_type_converter.py +64 -29
  14. datacontract/export/sqlalchemy_converter.py +169 -0
  15. datacontract/imports/avro_importer.py +1 -0
  16. datacontract/imports/bigquery_importer.py +2 -2
  17. datacontract/imports/dbml_importer.py +112 -0
  18. datacontract/imports/dbt_importer.py +67 -91
  19. datacontract/imports/glue_importer.py +62 -58
  20. datacontract/imports/importer.py +2 -1
  21. datacontract/imports/importer_factory.py +5 -0
  22. datacontract/imports/odcs_importer.py +1 -1
  23. datacontract/imports/spark_importer.py +29 -10
  24. datacontract/imports/sql_importer.py +1 -1
  25. datacontract/imports/unity_importer.py +1 -1
  26. datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py} +33 -5
  27. datacontract/integration/{publish_opentelemetry.py → opentelemetry.py} +1 -1
  28. datacontract/model/data_contract_specification.py +6 -2
  29. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/METADATA +103 -28
  30. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/RECORD +34 -33
  31. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/WHEEL +1 -1
  32. datacontract/publish/publish.py +0 -32
  33. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/LICENSE +0 -0
  34. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/entry_points.txt +0 -0
  35. {datacontract_cli-0.10.11.dist-info → datacontract_cli-0.10.12.dist-info}/top_level.txt +0 -0

datacontract/imports/glue_importer.py

@@ -1,6 +1,6 @@
  import boto3
- from typing import List
-
+ from typing import List, Dict, Generator
+ import re
  from datacontract.imports.importer import Importer
  from datacontract.model.data_contract_specification import (
      DataContractSpecification,
@@ -13,7 +13,7 @@ from datacontract.model.data_contract_specification import (
  class GlueImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_glue(data_contract_specification, source, import_args.get("glue_table"))


@@ -39,7 +39,7 @@ def get_glue_database(database_name: str):

      return (
          response["Database"]["CatalogId"],
-         response["Database"].get("LocationUri", "None"),
+         response["Database"].get("LocationUri"),
      )


@@ -75,7 +75,7 @@ def get_glue_tables(database_name: str) -> List[str]:
      return table_names


- def get_glue_table_schema(database_name: str, table_name: str):
+ def get_glue_table_schema(database_name: str, table_name: str) -> List[Dict]:
      """Get the schema of a Glue table.

      Args:
@@ -93,11 +93,11 @@ def get_glue_table_schema(database_name: str, table_name: str):
          response = glue.get_table(DatabaseName=database_name, Name=table_name)
      except glue.exceptions.EntityNotFoundException:
          print(f"Table {table_name} not found in database {database_name}.")
-         return {}
+         return []
      except Exception as e:
          # todo catch all
          print(f"Error: {e}")
-         return {}
+         return []

      table_schema = response["Table"]["StorageDescriptor"]["Columns"]

@@ -109,10 +109,9 @@ def get_glue_table_schema(database_name: str, table_name: str):
                  "Name": pk["Name"],
                  "Type": pk["Type"],
                  "Hive": True,
-                 "Comment": "Partition Key",
+                 "Comment": pk.get("Comment"),
              }
          )
-
      return table_schema


@@ -120,7 +119,7 @@ def import_glue(
      data_contract_specification: DataContractSpecification,
      source: str,
      table_names: List[str],
- ):
+ ) -> DataContractSpecification:
      """Import the schema of a Glue database.

      Args:
@@ -140,8 +139,13 @@ def import_glue(
      if table_names is None:
          table_names = get_glue_tables(source)

+     server_kwargs = {"type": "glue", "account": catalogid, "database": source}
+
+     if location_uri:
+         server_kwargs["location"] = location_uri
+
      data_contract_specification.servers = {
-         "production": Server(type="glue", account=catalogid, database=source, location=location_uri),
+         "production": Server(**server_kwargs),
      }

      for table_name in table_names:
@@ -161,12 +165,6 @@ def import_glue(
              field.description = column.get("Comment")
              fields[column["Name"]] = field

-             if "decimal" in column["Type"]:
-                 # Extract precision and scale from the string
-                 perc_scale = column["Type"][8:-1].split(",")
-                 field.precision = int(perc_scale[0])
-                 field.scale = int(perc_scale[1])
-
          data_contract_specification.models[table_name] = Model(
              type="table",
              fields=fields,
@@ -186,27 +184,43 @@ def create_typed_field(dtype: str) -> Field:
      """
      field = Field()
      dtype = dtype.strip().lower().replace(" ", "")
-     if dtype.startswith(("array", "struct", "map")):
-         orig_dtype: str = dtype
-         if dtype.startswith("array"):
-             field.type = "array"
-             field.items = create_typed_field(orig_dtype[6:-1])
-         elif dtype.startswith("struct"):
-             field.type = "struct"
-             for f in split_struct(orig_dtype[7:-1]):
-                 field.fields[f.split(":", 1)[0].strip()] = create_typed_field(f.split(":", 1)[1])
-         elif dtype.startswith("map"):
-             field.type = "map"
-             key_type = orig_dtype[4:-1].split(",", 1)[0]
-             value_type = orig_dtype[4:-1].split(",", 1)[1]
+     # Example: array<string>
+     if dtype.startswith("array"):
+         field.type = "array"
+         field.items = create_typed_field(dtype[6:-1])
+     # Example: struct<field1:float,field2:string>
+     elif dtype.startswith("struct"):
+         field.type = "struct"
+         for f in split_struct(dtype[7:-1]):
+             field_name, field_key = f.split(":", 1)
+             field.fields[field_name] = create_typed_field(field_key)
+     # Example: map<string,int>
+     elif dtype.startswith("map"):
+         field.type = "map"
+         map_match = re.match(r"map<(.+?),\s*(.+)>", dtype)
+         if map_match:
+             key_type = map_match.group(1)
+             value_type = map_match.group(2)
              field.keys = create_typed_field(key_type)
              field.values = create_typed_field(value_type)
+     # Example: decimal(38, 6) or decimal
+     elif dtype.startswith("decimal"):
+         field.type = "decimal"
+         decimal_match = re.match(r"decimal\((\d+),\s*(\d+)\)", dtype)
+         if decimal_match:  # if precision specified
+             field.precision = int(decimal_match.group(1))
+             field.scale = int(decimal_match.group(2))
+     # Example: varchar(255) or varchar
+     elif dtype.startswith("varchar"):
+         field.type = "varchar"
+         if len(dtype) > 7:
+             field.maxLength = int(dtype[8:-1])
      else:
          field.type = map_type_from_sql(dtype)
      return field


- def split_fields(s: str):
+ def split_fields(s: str) -> Generator[str, None, None]:
      """Split a string of fields considering nested structures.

      Args:
@@ -253,30 +267,20 @@ def map_type_from_sql(sql_type: str) -> str:
          return None

      sql_type = sql_type.lower()
-     if sql_type.startswith("varchar"):
-         return "varchar"
-     if sql_type.startswith("string"):
-         return "string"
-     if sql_type.startswith("text"):
-         return "text"
-     if sql_type.startswith("byte"):
-         return "byte"
-     if sql_type.startswith("short"):
-         return "short"
-     if sql_type.startswith("integer") or sql_type.startswith("int"):
-         return "integer"
-     if sql_type.startswith("long") or sql_type.startswith("bigint"):
-         return "long"
-     if sql_type.startswith("float"):
-         return "float"
-     if sql_type.startswith("double"):
-         return "double"
-     if sql_type.startswith("boolean"):
-         return "boolean"
-     if sql_type.startswith("timestamp"):
-         return "timestamp"
-     if sql_type.startswith("date"):
-         return "date"
-     if sql_type.startswith("decimal"):
-         return "decimal"
-     return "variant"
+
+     type_mapping = {
+         "string": "string",
+         "int": "int",
+         "bigint": "bigint",
+         "float": "float",
+         "double": "double",
+         "boolean": "boolean",
+         "timestamp": "timestamp",
+         "date": "date",
+     }
+
+     for prefix, mapped_type in type_mapping.items():
+         if sql_type.startswith(prefix):
+             return mapped_type
+
+     return "unknown"

datacontract/imports/importer.py

@@ -14,7 +14,7 @@ class Importer(ABC):
          data_contract_specification: DataContractSpecification,
          source: str,
          import_args: dict,
-     ) -> dict:
+     ) -> DataContractSpecification:
          pass


@@ -22,6 +22,7 @@ class ImportFormat(str, Enum):
      sql = "sql"
      avro = "avro"
      dbt = "dbt"
+     dbml = "dbml"
      glue = "glue"
      jsonschema = "jsonschema"
      bigquery = "bigquery"

datacontract/imports/importer_factory.py

@@ -88,3 +88,8 @@ importer_factory.register_lazy_importer(
  importer_factory.register_lazy_importer(
      name=ImportFormat.dbt, module_path="datacontract.imports.dbt_importer", class_name="DbtManifestImporter"
  )
+ importer_factory.register_lazy_importer(
+     name=ImportFormat.dbml,
+     module_path="datacontract.imports.dbml_importer",
+     class_name="DBMLImporter",
+ )
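
For context, lazy registration along these lines usually stores only the module path and class name and imports the module when the importer is first requested. A rough sketch of that pattern (the registry and function names here are illustrative, not the factory's actual internals):

import importlib

# Hypothetical mini-registry illustrating the lazy-import pattern.
_lazy_registry: dict[str, tuple[str, str]] = {}

def register_lazy(name: str, module_path: str, class_name: str) -> None:
    _lazy_registry[name] = (module_path, class_name)

def resolve(name: str):
    module_path, class_name = _lazy_registry[name]
    module = importlib.import_module(module_path)  # imported only on first use
    return getattr(module, class_name)

register_lazy("dbml", "datacontract.imports.dbml_importer", "DBMLImporter")
# importer_class = resolve("dbml")  # would trigger the import of dbml_importer only here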

datacontract/imports/odcs_importer.py

@@ -46,7 +46,7 @@ DATACONTRACT_TYPES = [
  class OdcsImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_odcs(data_contract_specification, source)



datacontract/imports/spark_importer.py

@@ -14,7 +14,7 @@ class SparkImporter(Importer):
          data_contract_specification: DataContractSpecification,
          source: str,
          import_args: dict,
-     ) -> dict:
+     ) -> DataContractSpecification:
          """
          Imports data from a Spark source into the data contract specification.

@@ -63,12 +63,12 @@ def import_from_spark_df(df: DataFrame) -> Model:
      schema = df.schema

      for field in schema:
-         model.fields[field.name] = _field_from_spark(field)
+         model.fields[field.name] = _field_from_struct_type(field)

      return model


- def _field_from_spark(spark_field: types.StructField) -> Field:
+ def _field_from_struct_type(spark_field: types.StructField) -> Field:
      """
      Converts a Spark StructField into a Field object for the data contract.

@@ -76,18 +76,35 @@ def _field_from_spark(spark_field: types.StructField) -> Field:
          spark_field: The Spark StructField to convert.

      Returns:
-         Field: The corresponding Field object.
+         Field: The generated Field object.
      """
-     field_type = _data_type_from_spark(spark_field.dataType)
      field = Field()
-     field.type = field_type
      field.required = not spark_field.nullable
+     return _type_from_data_type(field, spark_field.dataType)

-     if field_type == "array":
-         field.items = _field_from_spark(spark_field.dataType.elementType)

-     if field_type == "struct":
-         field.fields = {sf.name: _field_from_spark(sf) for sf in spark_field.dataType.fields}
+ def _type_from_data_type(field: Field, spark_type: types.DataType) -> Field:
+     """
+     Maps Spark data types to the Data Contract type system and updates the field.
+
+     Args:
+         field: The Field object to update.
+         spark_type: The Spark data type to map.
+
+     Returns:
+         Field: The updated Field object.
+     """
+     field.type = _data_type_from_spark(spark_type)
+
+     if field.type == "array":
+         field.items = _type_from_data_type(Field(required=not spark_type.containsNull), spark_type.elementType)
+
+     elif field.type == "map":
+         field.keys = _type_from_data_type(Field(required=True), spark_type.keyType)
+         field.values = _type_from_data_type(Field(required=not spark_type.valueContainsNull), spark_type.valueType)
+
+     elif field.type == "struct":
+         field.fields = {sf.name: _field_from_struct_type(sf) for sf in spark_type.fields}

      return field

@@ -116,6 +133,8 @@ def _data_type_from_spark(spark_type: types.DataType) -> str:
          return "struct"
      elif isinstance(spark_type, types.ArrayType):
          return "array"
+     elif isinstance(spark_type, types.MapType):
+         return "map"
      elif isinstance(spark_type, types.TimestampType):
          return "timestamp"
      elif isinstance(spark_type, types.TimestampNTZType):
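
The Spark importer now recurses through MapType as well as ArrayType and StructType, carrying nullability (containsNull, valueContainsNull) into nested fields. A small illustrative walk over a Spark schema, independent of the importer itself (the describe helper below is made up for the example; only pyspark.sql.types is needed, no Spark session):

from pyspark.sql import types

def describe(dt: types.DataType) -> str:
    """Recursively render a Spark type, mirroring the array/map/struct recursion above."""
    if isinstance(dt, types.MapType):
        return f"map<{describe(dt.keyType)},{describe(dt.valueType)}>"
    if isinstance(dt, types.ArrayType):
        return f"array<{describe(dt.elementType)}>"
    if isinstance(dt, types.StructType):
        inner = ",".join(f"{f.name}:{describe(f.dataType)}" for f in dt.fields)
        return f"struct<{inner}>"
    return type(dt).__name__.removesuffix("Type").lower()

schema = types.StructType([
    types.StructField("id", types.StringType(), nullable=False),
    types.StructField("tags", types.MapType(types.StringType(), types.IntegerType()), nullable=True),
])

for f in schema.fields:
    print(f.name, describe(f.dataType), "required" if not f.nullable else "optional")
# id string required
# tags map<string,integer> optional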

datacontract/imports/sql_importer.py

@@ -7,7 +7,7 @@ from datacontract.model.data_contract_specification import DataContractSpecifica
  class SqlImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          return import_sql(data_contract_specification, self.import_format, source)



datacontract/imports/unity_importer.py

@@ -11,7 +11,7 @@ from datacontract.model.exceptions import DataContractException
  class UnityImporter(Importer):
      def import_source(
          self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
-     ) -> dict:
+     ) -> DataContractSpecification:
          if source is not None:
              data_contract_specification = import_unity_from_json(data_contract_specification, source)
          else:

datacontract/integration/{publish_datamesh_manager.py → datamesh_manager.py}

@@ -2,28 +2,29 @@ import os

  import requests

+ from datacontract.model.data_contract_specification import DataContractSpecification
  from datacontract.model.run import Run


- def publish_datamesh_manager(run: Run, publish_url: str):
+ def publish_test_results_to_datamesh_manager(run: Run, publish_url: str):
      try:
          if publish_url is None:
              # this url supports Data Mesh Manager and Data Contract Manager
              url = "https://api.datamesh-manager.com/api/test-results"
          else:
              url = publish_url
+
          api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
          if api_key is None:
              api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
-
-         if run.dataContractId is None:
-             raise Exception("Cannot publish run results, as data contract ID is unknown")
-
          if api_key is None:
              raise Exception(
                  "Cannot publish run results, as DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY are not set"
              )

+         if run.dataContractId is None:
+             raise Exception("Cannot publish run results, as data contract ID is unknown")
+
          headers = {"Content-Type": "application/json", "x-api-key": api_key}
          request_body = run.model_dump_json()
          # print("Request Body:", request_body)
@@ -36,3 +37,30 @@ def publish_datamesh_manager(run: Run, publish_url: str):
          run.log_info(f"Published test results to {url}")
      except Exception as e:
          run.log_error(f"Failed publishing test results. Error: {str(e)}")
+
+
+ def publish_data_contract_to_datamesh_manager(data_contract_specification: DataContractSpecification):
+     try:
+         api_key = os.getenv("DATAMESH_MANAGER_API_KEY")
+         if api_key is None:
+             api_key = os.getenv("DATACONTRACT_MANAGER_API_KEY")
+         if api_key is None:
+             raise Exception(
+                 "Cannot publish data contract, as neither DATAMESH_MANAGER_API_KEY nor DATACONTRACT_MANAGER_API_KEY is set"
+             )
+         headers = {"Content-Type": "application/json", "x-api-key": api_key}
+         spec = data_contract_specification
+         id = spec.id
+         url = "https://api.datamesh-manager.com/api/datacontracts/{0}".format(id)
+         request_body = spec.model_dump_json().encode("utf-8")
+         response = requests.put(
+             url=url,
+             data=request_body,
+             headers=headers,
+         )
+         if response.status_code != 200:
+             print(f"Error publishing data contract to Data Mesh Manager: {response.text}")
+             exit(1)
+         print(f"Published data contract to {url}")
+     except Exception as e:
+         print(f"Failed publishing data contract. Error: {str(e)}")

datacontract/integration/{publish_opentelemetry.py → opentelemetry.py}

@@ -34,7 +34,7 @@ from datacontract.model.run import Run
  # - Metrics only, no logs yet (but loosely planned)


- def publish_opentelemetry(run: Run):
+ def publish_test_results_to_opentelemetry(run: Run):
      try:
          if run.dataContractId is None:
              raise Exception("Cannot publish run results, as data contract ID is unknown")

datacontract/model/data_contract_specification.py

@@ -73,7 +73,7 @@ class Definition(pyd.BaseModel):
      exclusiveMaximum: int = None
      pii: bool = None
      classification: str = None
-     fields: Dict[str, "Definition"] = {}
+     fields: Dict[str, "Field"] = {}
      tags: List[str] = []
      links: Dict[str, str] = {}
      example: str = None
@@ -239,4 +239,8 @@ class DataContractSpecification(pyd.BaseModel):
          return DataContractSpecification(**data)

      def to_yaml(self):
-         return yaml.dump(self.model_dump(exclude_defaults=True, exclude_none=True), sort_keys=False, allow_unicode=True)
+         return yaml.dump(
+             self.model_dump(exclude_defaults=True, exclude_none=True, by_alias=True),
+             sort_keys=False,
+             allow_unicode=True,
+         )
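
to_yaml() now dumps with by_alias=True, so pydantic fields declared with an alias serialize under the alias rather than the Python attribute name. A generic pydantic sketch of the difference (the model and alias here are invented for illustration, they are not the spec's own fields):

from typing import Optional

import pydantic as pyd
import yaml

class Demo(pyd.BaseModel):
    model_config = pyd.ConfigDict(populate_by_name=True)
    ref: Optional[str] = pyd.Field(default=None, alias="$ref")

demo = Demo(ref="#/definitions/order_id")
print(yaml.dump(demo.model_dump(exclude_none=True), sort_keys=False))
# ref: '#/definitions/order_id'
print(yaml.dump(demo.model_dump(exclude_none=True, by_alias=True), sort_keys=False))
# $ref: '#/definitions/order_id'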