datacontract-cli 0.10.23__py3-none-any.whl → 0.10.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. datacontract/__init__.py +13 -0
  2. datacontract/api.py +12 -5
  3. datacontract/catalog/catalog.py +5 -3
  4. datacontract/cli.py +116 -10
  5. datacontract/data_contract.py +143 -65
  6. datacontract/engines/data_contract_checks.py +366 -60
  7. datacontract/engines/data_contract_test.py +50 -4
  8. datacontract/engines/fastjsonschema/check_jsonschema.py +37 -19
  9. datacontract/engines/fastjsonschema/s3/s3_read_files.py +3 -2
  10. datacontract/engines/soda/check_soda_execute.py +22 -3
  11. datacontract/engines/soda/connections/athena.py +79 -0
  12. datacontract/engines/soda/connections/duckdb_connection.py +65 -6
  13. datacontract/engines/soda/connections/kafka.py +4 -2
  14. datacontract/export/avro_converter.py +20 -3
  15. datacontract/export/bigquery_converter.py +1 -1
  16. datacontract/export/dbt_converter.py +36 -7
  17. datacontract/export/dqx_converter.py +126 -0
  18. datacontract/export/duckdb_type_converter.py +57 -0
  19. datacontract/export/excel_exporter.py +923 -0
  20. datacontract/export/exporter.py +3 -0
  21. datacontract/export/exporter_factory.py +17 -1
  22. datacontract/export/great_expectations_converter.py +55 -5
  23. datacontract/export/{html_export.py → html_exporter.py} +31 -20
  24. datacontract/export/markdown_converter.py +134 -5
  25. datacontract/export/mermaid_exporter.py +110 -0
  26. datacontract/export/odcs_v3_exporter.py +187 -145
  27. datacontract/export/protobuf_converter.py +163 -69
  28. datacontract/export/rdf_converter.py +2 -2
  29. datacontract/export/sodacl_converter.py +9 -1
  30. datacontract/export/spark_converter.py +31 -4
  31. datacontract/export/sql_converter.py +6 -2
  32. datacontract/export/sql_type_converter.py +20 -8
  33. datacontract/imports/avro_importer.py +63 -12
  34. datacontract/imports/csv_importer.py +111 -57
  35. datacontract/imports/excel_importer.py +1111 -0
  36. datacontract/imports/importer.py +16 -3
  37. datacontract/imports/importer_factory.py +17 -0
  38. datacontract/imports/json_importer.py +325 -0
  39. datacontract/imports/odcs_importer.py +2 -2
  40. datacontract/imports/odcs_v3_importer.py +351 -151
  41. datacontract/imports/protobuf_importer.py +264 -0
  42. datacontract/imports/spark_importer.py +117 -13
  43. datacontract/imports/sql_importer.py +32 -16
  44. datacontract/imports/unity_importer.py +84 -38
  45. datacontract/init/init_template.py +1 -1
  46. datacontract/integration/datamesh_manager.py +16 -2
  47. datacontract/lint/resolve.py +112 -23
  48. datacontract/lint/schema.py +24 -15
  49. datacontract/model/data_contract_specification/__init__.py +1 -0
  50. datacontract/model/odcs.py +13 -0
  51. datacontract/model/run.py +3 -0
  52. datacontract/output/junit_test_results.py +3 -3
  53. datacontract/schemas/datacontract-1.1.0.init.yaml +1 -1
  54. datacontract/schemas/datacontract-1.2.0.init.yaml +91 -0
  55. datacontract/schemas/datacontract-1.2.0.schema.json +2029 -0
  56. datacontract/schemas/datacontract-1.2.1.init.yaml +91 -0
  57. datacontract/schemas/datacontract-1.2.1.schema.json +2058 -0
  58. datacontract/schemas/odcs-3.0.2.schema.json +2382 -0
  59. datacontract/templates/datacontract.html +54 -3
  60. datacontract/templates/datacontract_odcs.html +685 -0
  61. datacontract/templates/index.html +5 -2
  62. datacontract/templates/partials/server.html +2 -0
  63. datacontract/templates/style/output.css +319 -145
  64. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/METADATA +656 -431
  65. datacontract_cli-0.10.37.dist-info/RECORD +119 -0
  66. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/WHEEL +1 -1
  67. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info/licenses}/LICENSE +1 -1
  68. datacontract/export/csv_type_converter.py +0 -36
  69. datacontract/lint/lint.py +0 -142
  70. datacontract/lint/linters/description_linter.py +0 -35
  71. datacontract/lint/linters/field_pattern_linter.py +0 -34
  72. datacontract/lint/linters/field_reference_linter.py +0 -48
  73. datacontract/lint/linters/notice_period_linter.py +0 -55
  74. datacontract/lint/linters/quality_schema_linter.py +0 -52
  75. datacontract/lint/linters/valid_constraints_linter.py +0 -100
  76. datacontract/model/data_contract_specification.py +0 -327
  77. datacontract_cli-0.10.23.dist-info/RECORD +0 -113
  78. /datacontract/{lint/linters → output}/__init__.py +0 -0
  79. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/entry_points.txt +0 -0
  80. {datacontract_cli-0.10.23.dist-info → datacontract_cli-0.10.37.dist-info}/top_level.txt +0 -0
datacontract/export/dbt_converter.py

@@ -27,7 +27,7 @@ class DbtStageExporter(Exporter):
         )
 
 
-def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None):
+def to_dbt_models_yaml(data_contract_spec: DataContractSpecification, server: str = None) -> str:
     dbt = {
         "version": 2,
         "models": [],
@@ -102,8 +102,11 @@ def _to_dbt_model(
         "name": model_key,
     }
     model_type = _to_dbt_model_type(model_value.type)
+
     dbt_model["config"] = {"meta": {"data_contract": data_contract_spec.id}}
-    dbt_model["config"]["materialized"] = model_type
+
+    if model_type:
+        dbt_model["config"]["materialized"] = model_type
 
     if data_contract_spec.info.owner is not None:
         dbt_model["config"]["meta"]["owner"] = data_contract_spec.info.owner
@@ -112,9 +115,28 @@ def _to_dbt_model(
         dbt_model["config"]["contract"] = {"enforced": True}
     if model_value.description is not None:
         dbt_model["description"] = model_value.description.strip().replace("\n", " ")
-    columns = _to_columns(data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type)
+
+    # Handle model-level primaryKey (before columns for better YAML ordering)
+    primary_key_columns = []
+    if hasattr(model_value, "primaryKey") and model_value.primaryKey:
+        if isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) > 1:
+            # Multiple columns: use dbt_utils.unique_combination_of_columns
+            dbt_model["data_tests"] = [
+                {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": model_value.primaryKey}}
+            ]
+        elif isinstance(model_value.primaryKey, list) and len(model_value.primaryKey) == 1:
+            # Single column: handle at column level (pass to _to_columns)
+            primary_key_columns = model_value.primaryKey
+        elif isinstance(model_value.primaryKey, str):
+            # Single column as string: handle at column level
+            primary_key_columns = [model_value.primaryKey]
+
+    columns = _to_columns(
+        data_contract_spec, model_value.fields, _supports_constraints(model_type), adapter_type, primary_key_columns
+    )
     if columns:
         dbt_model["columns"] = columns
+
     return dbt_model
 
 
@@ -123,7 +145,7 @@ def _to_dbt_model_type(model_type):
     # Allowed values: table, view, incremental, ephemeral, materialized view
     # Custom values also possible
     if model_type is None:
-        return "table"
+        return None
     if model_type.lower() == "table":
         return "table"
     if model_type.lower() == "view":
@@ -140,10 +162,13 @@ def _to_columns(
     fields: Dict[str, Field],
     supports_constraints: bool,
     adapter_type: Optional[str],
+    primary_key_columns: Optional[list] = None,
 ) -> list:
     columns = []
+    primary_key_columns = primary_key_columns or []
     for field_name, field in fields.items():
-        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type)
+        is_primary_key = field_name in primary_key_columns
+        column = _to_column(data_contract_spec, field_name, field, supports_constraints, adapter_type, is_primary_key)
         columns.append(column)
     return columns
 
@@ -161,6 +186,7 @@ def _to_column(
     field: Field,
     supports_constraints: bool,
     adapter_type: Optional[str],
+    is_primary_key: bool = False,
 ) -> dict:
     column = {"name": field_name}
     adapter_type = adapter_type or "snowflake"
@@ -175,12 +201,15 @@ def _to_column(
     )
     if field.description is not None:
         column["description"] = field.description.strip().replace("\n", " ")
-    if field.required:
+    # Handle required/not_null constraint
+    if field.required or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "not_null"})
         else:
             column["data_tests"].append("not_null")
-    if field.unique:
+
+    # Handle unique constraint
+    if field.unique or is_primary_key:
         if supports_constraints:
             column.setdefault("constraints", []).append({"type": "unique"})
         else:
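
Note on the changes above: _to_dbt_model_type now returns None when no model type is specified (instead of defaulting to "table"), so "materialized" is only set when a type is given, and a model-level primaryKey is mapped to dbt tests. A minimal self-contained sketch of that primaryKey mapping (illustrative only; primary_key_to_dbt is a hypothetical name, the real logic lives in _to_dbt_model and _to_columns):

# Sketch of how the new primaryKey handling branches, per the diff above.
def primary_key_to_dbt(primary_key):
    """Return (model_level_data_tests, column_level_primary_keys)."""
    if isinstance(primary_key, list) and len(primary_key) > 1:
        # Composite key -> model-level dbt_utils.unique_combination_of_columns test
        return [
            {"dbt_utils.unique_combination_of_columns": {"combination_of_columns": primary_key}}
        ], []
    if isinstance(primary_key, list) and len(primary_key) == 1:
        return None, primary_key  # single column: enforced at column level
    if isinstance(primary_key, str):
        return None, [primary_key]  # single column given as a plain string
    return None, []

print(primary_key_to_dbt(["order_id", "line_no"]))
# ([{'dbt_utils.unique_combination_of_columns': {'combination_of_columns': ['order_id', 'line_no']}}], [])
print(primary_key_to_dbt("order_id"))
# (None, ['order_id'])

A composite key becomes a model-level test, while a single-column key is enforced per column as not_null plus unique (via constraints where the adapter supports them, data_tests otherwise).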
datacontract/export/dqx_converter.py (new file)

@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Union
+
+import yaml
+
+from datacontract.export.exporter import Exporter, _check_models_for_export
+from datacontract.model.data_contract_specification import DataContractSpecification, Field, Model, Quality
+
+
+class DqxKeys:
+    CHECK = "check"
+    ARGUMENTS = "arguments"
+    SPECIFICATION = "specification"
+    COL_NAME = "column"
+    COL_NAMES = "for_each_column"
+    COLUMNS = "columns"
+    FUNCTION = "function"
+
+
+class DqxExporter(Exporter):
+    """Exporter implementation for converting data contracts to DQX YAML file."""
+
+    def export(
+        self,
+        data_contract: DataContractSpecification,
+        model: Model,
+        server: str,
+        sql_server_type: str,
+        export_args: Dict[str, Any],
+    ) -> str:
+        """Exports a data contract to DQX format."""
+        model_name, model_value = _check_models_for_export(data_contract, model, self.export_format)
+        return to_dqx_yaml(model_value)
+
+
+def to_dqx_yaml(model_value: Model) -> str:
+    """
+    Converts the data contract's quality checks to DQX YAML format.
+
+    Args:
+        model_value (Model): The data contract to convert.
+
+    Returns:
+        str: YAML representation of the data contract's quality checks.
+    """
+    extracted_rules = extract_quality_rules(model_value)
+    return yaml.dump(extracted_rules, sort_keys=False, allow_unicode=True, default_flow_style=False)
+
+
+def process_quality_rule(rule: Quality, column_name: str) -> Dict[str, Any]:
+    """
+    Processes a single quality rule by injecting the column path into its arguments if absent.
+
+    Args:
+        rule (Quality): The quality rule to process.
+        column_name (str): The full path to the current column.
+
+    Returns:
+        dict: The processed quality rule specification.
+    """
+    rule_data = rule.model_extra
+    specification = rule_data[DqxKeys.SPECIFICATION]
+    check = specification[DqxKeys.CHECK]
+
+    if column_name:
+        arguments = check.setdefault(DqxKeys.ARGUMENTS, {})
+
+        if (
+            DqxKeys.COL_NAME not in arguments
+            and DqxKeys.COL_NAMES not in arguments
+            and DqxKeys.COLUMNS not in arguments
+        ):
+            if check[DqxKeys.FUNCTION] not in ("is_unique", "foreign_key"):
+                arguments[DqxKeys.COL_NAME] = column_name
+            else:
+                arguments[DqxKeys.COLUMNS] = [column_name]
+
+    return specification
+
+
+def extract_quality_rules(data: Union[Model, Field, Quality], column_path: str = "") -> List[Dict[str, Any]]:
+    """
+    Recursively extracts all quality rules from a data contract structure.
+
+    Args:
+        data (Union[Model, Field, Quality]): The data contract model, field, or quality rule.
+        column_path (str, optional): The current path in the schema hierarchy. Defaults to "".
+
+    Returns:
+        List[Dict[str, Any]]: A list of quality rule specifications.
+    """
+    quality_rules = []
+
+    if isinstance(data, Quality):
+        return [process_quality_rule(data, column_path)]
+
+    if isinstance(data, (Model, Field)):
+        for key, field in data.fields.items():
+            current_path = build_column_path(column_path, key)
+
+            if field.fields:
+                # Field is a struct-like object, recurse deeper
+                quality_rules.extend(extract_quality_rules(field, current_path))
+            else:
+                # Process quality rules at leaf fields
+                for rule in field.quality:
+                    quality_rules.append(process_quality_rule(rule, current_path))
+
+        # Process any quality rules attached directly to this level
+        for rule in data.quality:
+            quality_rules.append(process_quality_rule(rule, column_path))
+
+    return quality_rules
+
+
+def build_column_path(current_path: str, key: str) -> str:
+    """
+    Builds the full column path by concatenating parent path with current key.
+
+    Args:
+        current_path (str): The current path prefix.
+        key (str): The current field's key.
+
+    Returns:
+        str: The full path.
+    """
+    return f"{current_path}.{key}" if current_path else key
datacontract/export/duckdb_type_converter.py (new file)

@@ -0,0 +1,57 @@
+from typing import Dict
+
+from datacontract.model.data_contract_specification import Field
+
+
+# https://duckdb.org/docs/data/csv/overview.html
+# ['SQLNULL', 'BOOLEAN', 'BIGINT', 'DOUBLE', 'TIME', 'DATE', 'TIMESTAMP', 'VARCHAR']
+def convert_to_duckdb_csv_type(field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["string", "varchar", "text"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["timestamp", "timestamp_tz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["timestamp_ntz"]:
+        return "TIMESTAMP"
+    if datacontract_type.lower() in ["date"]:
+        return "DATE"
+    if datacontract_type.lower() in ["time"]:
+        return "TIME"
+    if datacontract_type.lower() in ["number", "decimal", "numeric"]:
+        # precision and scale not supported by data contract
+        return "VARCHAR"
+    if datacontract_type.lower() in ["float", "double"]:
+        return "DOUBLE"
+    if datacontract_type.lower() in ["integer", "int", "long", "bigint"]:
+        return "BIGINT"
+    if datacontract_type.lower() in ["boolean"]:
+        return "BOOLEAN"
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["bytes"]:
+        # not supported in CSV
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["null"]:
+        return "SQLNULL"
+    return "VARCHAR"
+
+
+def convert_to_duckdb_json_type(field: Field) -> None | str:
+    datacontract_type = field.type
+    if datacontract_type is None:
+        return "VARCHAR"
+    if datacontract_type.lower() in ["array"]:
+        return convert_to_duckdb_json_type(field.items) + "[]"  # type: ignore
+    if datacontract_type.lower() in ["object", "record", "struct"]:
+        return convert_to_duckdb_object(field.fields)
+    return convert_to_duckdb_csv_type(field)
+
+
+def convert_to_duckdb_object(fields: Dict[str, Field]):
+    columns = [f'"{x[0]}" {convert_to_duckdb_json_type(x[1])}' for x in fields.items()]
+    return f"STRUCT({', '.join(columns)})"