odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,252 @@
1
+ """
2
+ Quality Gate support for batch-level validation.
3
+
4
+ Gates evaluate the entire batch before writing, ensuring
5
+ data quality thresholds are met at the aggregate level.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from odibi.config import GateConfig, GateOnFail
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class GateResult:
    """Result of gate evaluation."""

    # Overall verdict: True when every configured check cleared its threshold.
    passed: bool
    # Fraction of rows that passed all per-row validation tests (0.0-1.0).
    pass_rate: float
    # Total rows in the evaluated batch.
    total_rows: int
    # Rows that passed every length-matched validation test.
    passed_rows: int
    # total_rows - passed_rows.
    failed_rows: int
    # Diagnostic payload: overall/per-test pass rates, row-count check outcome,
    # and the failure-reason list.
    details: Dict[str, Any] = field(default_factory=dict)
    # Action the caller should take on failure (copied from GateConfig.on_fail).
    action: GateOnFail = GateOnFail.ABORT
    # Human-readable explanation for each failed check; empty when passed.
    failure_reasons: List[str] = field(default_factory=list)
29
+
30
+
31
def evaluate_gate(
    df: Any,
    validation_results: Dict[str, List[bool]],
    gate_config: GateConfig,
    engine: Any,
    catalog: Optional[Any] = None,
    node_name: Optional[str] = None,
) -> GateResult:
    """
    Evaluate a batch-level quality gate over per-row validation results.

    Args:
        df: DataFrame being validated (Spark, or anything the engine can count).
        validation_results: Mapping of test name -> per-row booleans (True=passed).
        gate_config: Gate configuration (pass-rate floor, per-test thresholds,
            optional row-count checks, on-fail action).
        engine: Engine instance; used for row counting when not Spark.
        catalog: Optional CatalogManager for historical row-count comparison.
        node_name: Optional node name for historical lookups.

    Returns:
        GateResult carrying pass/fail status, rates, diagnostics, and the
        configured on-fail action.
    """
    # Detect Spark so we count rows via df.count() instead of len().
    spark_df = False
    try:
        import pyspark

        spark_df = hasattr(engine, "spark") or isinstance(df, pyspark.sql.DataFrame)
    except ImportError:
        spark_df = False

    if spark_df:
        total_rows = df.count()
    else:
        total_rows = engine.count_rows(df) if hasattr(engine, "count_rows") else len(df)

    # An empty batch trivially satisfies the gate.
    if total_rows == 0:
        return GateResult(
            passed=True,
            pass_rate=1.0,
            total_rows=0,
            passed_rows=0,
            failed_rows=0,
            action=gate_config.on_fail,
            details={"message": "Empty dataset - gate passed by default"},
        )

    # Fold all length-matched test vectors into one AND-mask; vectors whose
    # length disagrees with the row count are ignored (same as original).
    combined: Optional[List[bool]] = None
    for per_row in validation_results.values():
        if len(per_row) != total_rows:
            continue
        if combined is None:
            combined = list(per_row)
        else:
            combined = [x and y for x, y in zip(combined, per_row)]
    passed_rows = sum(combined) if combined else total_rows

    failed_rows = total_rows - passed_rows
    # total_rows > 0 is guaranteed past the empty-batch return above.
    pass_rate = passed_rows / total_rows

    details: Dict[str, Any] = {
        "overall_pass_rate": pass_rate,
        "per_test_rates": {},
        "row_count_check": None,
    }
    failure_reasons: List[str] = []

    # Check 1: overall pass-rate floor.
    if pass_rate < gate_config.require_pass_rate:
        failure_reasons.append(
            f"Overall pass rate {pass_rate:.1%} < required {gate_config.require_pass_rate:.1%}"
        )

    # Check 2: per-test minimum pass rates.
    for threshold in gate_config.thresholds:
        per_row = validation_results.get(threshold.test)
        if not per_row:
            continue
        rate = sum(per_row) / len(per_row)
        details["per_test_rates"][threshold.test] = rate
        if rate < threshold.min_pass_rate:
            failure_reasons.append(
                f"Test '{threshold.test}' pass rate {rate:.1%} "
                f"< required {threshold.min_pass_rate:.1%}"
            )

    # Check 3: absolute/relative row-count bounds.
    if gate_config.row_count:
        row_check = _check_row_count(
            total_rows,
            gate_config.row_count,
            catalog,
            node_name,
        )
        details["row_count_check"] = row_check
        if not row_check["passed"]:
            failure_reasons.append(row_check["reason"])

    details["failure_reasons"] = failure_reasons
    # The gate passes exactly when no check recorded a reason.
    gate_passed = not failure_reasons

    if gate_passed:
        logger.info(f"Gate passed: {pass_rate:.1%} pass rate ({passed_rows}/{total_rows} rows)")
    else:
        logger.warning(f"Gate failed: {', '.join(failure_reasons)}")

    return GateResult(
        passed=gate_passed,
        pass_rate=pass_rate,
        total_rows=total_rows,
        passed_rows=passed_rows,
        failed_rows=failed_rows,
        details=details,
        action=gate_config.on_fail,
        failure_reasons=failure_reasons,
    )
157
+
158
+
159
+ def _check_row_count(
160
+ current_count: int,
161
+ row_count_config: Any,
162
+ catalog: Optional[Any],
163
+ node_name: Optional[str],
164
+ ) -> Dict[str, Any]:
165
+ """
166
+ Check row count against thresholds and historical data.
167
+
168
+ Args:
169
+ current_count: Current row count
170
+ row_count_config: RowCountGate configuration
171
+ catalog: CatalogManager for historical lookups
172
+ node_name: Node name for historical lookups
173
+
174
+ Returns:
175
+ Dict with passed status, reason, and details
176
+ """
177
+ result: Dict[str, Any] = {
178
+ "passed": True,
179
+ "reason": "",
180
+ "current_count": current_count,
181
+ "min": row_count_config.min,
182
+ "max": row_count_config.max,
183
+ "change_threshold": row_count_config.change_threshold,
184
+ "previous_count": None,
185
+ "change_percent": None,
186
+ }
187
+
188
+ if row_count_config.min is not None and current_count < row_count_config.min:
189
+ result["passed"] = False
190
+ result["reason"] = f"Row count {current_count} < minimum {row_count_config.min}"
191
+ return result
192
+
193
+ if row_count_config.max is not None and current_count > row_count_config.max:
194
+ result["passed"] = False
195
+ result["reason"] = f"Row count {current_count} > maximum {row_count_config.max}"
196
+ return result
197
+
198
+ if row_count_config.change_threshold is not None and catalog and node_name:
199
+ try:
200
+ previous_count = _get_previous_row_count(catalog, node_name)
201
+ if previous_count is not None and previous_count > 0:
202
+ result["previous_count"] = previous_count
203
+ change_percent = abs(current_count - previous_count) / previous_count
204
+ result["change_percent"] = change_percent
205
+
206
+ if change_percent > row_count_config.change_threshold:
207
+ result["passed"] = False
208
+ result["reason"] = (
209
+ f"Row count changed {change_percent:.1%} vs previous ({previous_count}), "
210
+ f"exceeds threshold {row_count_config.change_threshold:.1%}"
211
+ )
212
+ return result
213
+ except Exception as e:
214
+ logger.warning(f"Failed to check historical row count: {e}")
215
+
216
+ return result
217
+
218
+
219
+ def _get_previous_row_count(
220
+ catalog: Any,
221
+ node_name: str,
222
+ ) -> Optional[int]:
223
+ """
224
+ Get the previous row count for a node from the catalog.
225
+
226
+ Args:
227
+ catalog: CatalogManager instance
228
+ node_name: Name of the node
229
+
230
+ Returns:
231
+ Previous row count or None if not available
232
+ """
233
+ try:
234
+ if hasattr(catalog, "get_last_run_metrics"):
235
+ metrics = catalog.get_last_run_metrics(node_name)
236
+ if metrics and "rows_processed" in metrics:
237
+ return metrics["rows_processed"]
238
+
239
+ if hasattr(catalog, "query"):
240
+ results = catalog.query(
241
+ "meta_runs",
242
+ filter=f"node_name = '{node_name}' AND status = 'SUCCESS'",
243
+ order_by="started_at DESC",
244
+ limit=1,
245
+ )
246
+ if results and len(results) > 0:
247
+ return results[0].get("rows_processed")
248
+
249
+ except Exception as e:
250
+ logger.debug(f"Could not fetch previous row count: {e}")
251
+
252
+ return None