odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/validation/gate.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Quality Gate support for batch-level validation.
|
|
3
|
+
|
|
4
|
+
Gates evaluate the entire batch before writing, ensuring
|
|
5
|
+
data quality thresholds are met at the aggregate level.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
from odibi.config import GateConfig, GateOnFail
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class GateResult:
    """Result of gate evaluation."""

    passed: bool  # True when every gate criterion was satisfied
    pass_rate: float  # fraction of rows passing all aligned per-row tests (0.0-1.0)
    total_rows: int  # rows in the evaluated batch
    passed_rows: int  # rows that passed every aligned per-row test
    failed_rows: int  # total_rows - passed_rows
    details: Dict[str, Any] = field(default_factory=dict)  # per-test rates, row-count check, failure reasons
    action: GateOnFail = GateOnFail.ABORT  # action the caller should take if the gate failed
    failure_reasons: List[str] = field(default_factory=list)  # human-readable descriptions of each failed criterion
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def evaluate_gate(
    df: Any,
    validation_results: Dict[str, List[bool]],
    gate_config: GateConfig,
    engine: Any,
    catalog: Optional[Any] = None,
    node_name: Optional[str] = None,
) -> GateResult:
    """
    Evaluate quality gate on validation results.

    Args:
        df: DataFrame being validated
        validation_results: Dict of test_name -> per-row boolean results (True=passed)
        gate_config: Gate configuration
        engine: Engine instance
        catalog: Optional CatalogManager for historical row count checks
        node_name: Optional node name for historical lookups

    Returns:
        GateResult with pass/fail status and action to take
    """
    # Detect Spark: len(df) is not valid for Spark DataFrames, so row
    # counting must go through df.count() instead.
    is_spark = False
    try:
        # Fix: import the class directly. A bare `import pyspark` does not
        # guarantee the `pyspark.sql` submodule is loaded, so the previous
        # `pyspark.sql.DataFrame` attribute access could raise an
        # AttributeError that the `except ImportError` below would not catch.
        from pyspark.sql import DataFrame as SparkDataFrame

        if hasattr(engine, "spark") or isinstance(df, SparkDataFrame):
            is_spark = True
    except ImportError:
        pass

    if is_spark:
        total_rows = df.count()
    elif hasattr(engine, "count_rows"):
        total_rows = engine.count_rows(df)
    else:
        total_rows = len(df)

    # An empty batch has nothing that can violate a threshold: pass by default.
    if total_rows == 0:
        return GateResult(
            passed=True,
            pass_rate=1.0,
            total_rows=0,
            passed_rows=0,
            failed_rows=0,
            action=gate_config.on_fail,
            details={"message": "Empty dataset - gate passed by default"},
        )

    # AND together the per-row masks of every test whose result length matches
    # the row count; mismatched-length results cannot be aligned row-by-row
    # and are skipped for the combined mask.
    passed_rows = total_rows
    if validation_results:
        all_pass_mask = None
        for test_name, results in validation_results.items():
            if len(results) == total_rows:
                if all_pass_mask is None:
                    all_pass_mask = results.copy()
                else:
                    all_pass_mask = [a and b for a, b in zip(all_pass_mask, results)]

        # Explicit None check: an all-False mask is still a valid mask and
        # must be counted (it is truthy as a non-empty list, but the intent
        # here is "was any test aligned", not "did any row pass").
        if all_pass_mask is not None:
            passed_rows = sum(all_pass_mask)

    pass_rate = passed_rows / total_rows if total_rows > 0 else 1.0
    failed_rows = total_rows - passed_rows

    details: Dict[str, Any] = {
        "overall_pass_rate": pass_rate,
        "per_test_rates": {},
        "row_count_check": None,
    }

    gate_passed = True
    failure_reasons: List[str] = []

    # Aggregate threshold across all tests combined.
    if pass_rate < gate_config.require_pass_rate:
        gate_passed = False
        failure_reasons.append(
            f"Overall pass rate {pass_rate:.1%} < required {gate_config.require_pass_rate:.1%}"
        )

    # Per-test minimum pass rates.
    for threshold in gate_config.thresholds:
        test_results = validation_results.get(threshold.test)
        if test_results:
            test_total = len(test_results)
            test_passed = sum(test_results)
            test_pass_rate = test_passed / test_total if test_total > 0 else 1.0
            details["per_test_rates"][threshold.test] = test_pass_rate

            if test_pass_rate < threshold.min_pass_rate:
                gate_passed = False
                failure_reasons.append(
                    f"Test '{threshold.test}' pass rate {test_pass_rate:.1%} "
                    f"< required {threshold.min_pass_rate:.1%}"
                )

    # Absolute min/max and historical-change row-count checks.
    if gate_config.row_count:
        row_check = _check_row_count(
            total_rows,
            gate_config.row_count,
            catalog,
            node_name,
        )
        details["row_count_check"] = row_check

        if not row_check["passed"]:
            gate_passed = False
            failure_reasons.append(row_check["reason"])

    details["failure_reasons"] = failure_reasons

    if gate_passed:
        logger.info(f"Gate passed: {pass_rate:.1%} pass rate ({passed_rows}/{total_rows} rows)")
    else:
        logger.warning(f"Gate failed: {', '.join(failure_reasons)}")

    return GateResult(
        passed=gate_passed,
        pass_rate=pass_rate,
        total_rows=total_rows,
        passed_rows=passed_rows,
        failed_rows=failed_rows,
        details=details,
        action=gate_config.on_fail,
        failure_reasons=failure_reasons,
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _check_row_count(
|
|
160
|
+
current_count: int,
|
|
161
|
+
row_count_config: Any,
|
|
162
|
+
catalog: Optional[Any],
|
|
163
|
+
node_name: Optional[str],
|
|
164
|
+
) -> Dict[str, Any]:
|
|
165
|
+
"""
|
|
166
|
+
Check row count against thresholds and historical data.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
current_count: Current row count
|
|
170
|
+
row_count_config: RowCountGate configuration
|
|
171
|
+
catalog: CatalogManager for historical lookups
|
|
172
|
+
node_name: Node name for historical lookups
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
Dict with passed status, reason, and details
|
|
176
|
+
"""
|
|
177
|
+
result: Dict[str, Any] = {
|
|
178
|
+
"passed": True,
|
|
179
|
+
"reason": "",
|
|
180
|
+
"current_count": current_count,
|
|
181
|
+
"min": row_count_config.min,
|
|
182
|
+
"max": row_count_config.max,
|
|
183
|
+
"change_threshold": row_count_config.change_threshold,
|
|
184
|
+
"previous_count": None,
|
|
185
|
+
"change_percent": None,
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
if row_count_config.min is not None and current_count < row_count_config.min:
|
|
189
|
+
result["passed"] = False
|
|
190
|
+
result["reason"] = f"Row count {current_count} < minimum {row_count_config.min}"
|
|
191
|
+
return result
|
|
192
|
+
|
|
193
|
+
if row_count_config.max is not None and current_count > row_count_config.max:
|
|
194
|
+
result["passed"] = False
|
|
195
|
+
result["reason"] = f"Row count {current_count} > maximum {row_count_config.max}"
|
|
196
|
+
return result
|
|
197
|
+
|
|
198
|
+
if row_count_config.change_threshold is not None and catalog and node_name:
|
|
199
|
+
try:
|
|
200
|
+
previous_count = _get_previous_row_count(catalog, node_name)
|
|
201
|
+
if previous_count is not None and previous_count > 0:
|
|
202
|
+
result["previous_count"] = previous_count
|
|
203
|
+
change_percent = abs(current_count - previous_count) / previous_count
|
|
204
|
+
result["change_percent"] = change_percent
|
|
205
|
+
|
|
206
|
+
if change_percent > row_count_config.change_threshold:
|
|
207
|
+
result["passed"] = False
|
|
208
|
+
result["reason"] = (
|
|
209
|
+
f"Row count changed {change_percent:.1%} vs previous ({previous_count}), "
|
|
210
|
+
f"exceeds threshold {row_count_config.change_threshold:.1%}"
|
|
211
|
+
)
|
|
212
|
+
return result
|
|
213
|
+
except Exception as e:
|
|
214
|
+
logger.warning(f"Failed to check historical row count: {e}")
|
|
215
|
+
|
|
216
|
+
return result
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _get_previous_row_count(
|
|
220
|
+
catalog: Any,
|
|
221
|
+
node_name: str,
|
|
222
|
+
) -> Optional[int]:
|
|
223
|
+
"""
|
|
224
|
+
Get the previous row count for a node from the catalog.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
catalog: CatalogManager instance
|
|
228
|
+
node_name: Name of the node
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
Previous row count or None if not available
|
|
232
|
+
"""
|
|
233
|
+
try:
|
|
234
|
+
if hasattr(catalog, "get_last_run_metrics"):
|
|
235
|
+
metrics = catalog.get_last_run_metrics(node_name)
|
|
236
|
+
if metrics and "rows_processed" in metrics:
|
|
237
|
+
return metrics["rows_processed"]
|
|
238
|
+
|
|
239
|
+
if hasattr(catalog, "query"):
|
|
240
|
+
results = catalog.query(
|
|
241
|
+
"meta_runs",
|
|
242
|
+
filter=f"node_name = '{node_name}' AND status = 'SUCCESS'",
|
|
243
|
+
order_by="started_at DESC",
|
|
244
|
+
limit=1,
|
|
245
|
+
)
|
|
246
|
+
if results and len(results) > 0:
|
|
247
|
+
return results[0].get("rows_processed")
|
|
248
|
+
|
|
249
|
+
except Exception as e:
|
|
250
|
+
logger.debug(f"Could not fetch previous row count: {e}")
|
|
251
|
+
|
|
252
|
+
return None
|