odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/metadata.py
ADDED
@@ -0,0 +1,608 @@
"""
Story Metadata Tracking
========================

Tracks detailed metadata for pipeline execution stories.
"""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

from odibi.utils.logging_context import get_logging_context


@dataclass
class DeltaWriteInfo:
    """
    Metadata specific to Delta Lake writes.
    """

    version: int
    timestamp: Optional[datetime] = None
    operation: Optional[str] = None
    operation_metrics: Dict[str, Any] = field(default_factory=dict)
    # For linking back to specific commit info if needed
    read_version: Optional[int] = None  # The version we read FROM (if applicable)


@dataclass
class NodeExecutionMetadata:
    """
    Metadata for a single node execution.

    Captures all relevant information about a node's execution including
    performance metrics, data transformations, and error details.
    """

    node_name: str
    operation: str
    status: str  # "success", "failed", "skipped"
    duration: float

    # Data metrics
    rows_in: Optional[int] = None
    rows_out: Optional[int] = None
    rows_written: Optional[int] = None
    rows_change: Optional[int] = None
    rows_change_pct: Optional[float] = None
    sample_in: Optional[List[Dict[str, Any]]] = None
    sample_data: Optional[List[Dict[str, Any]]] = None

    # Schema tracking
    schema_in: Optional[List[str]] = None
    schema_out: Optional[List[str]] = None
    columns_added: List[str] = field(default_factory=list)
    columns_removed: List[str] = field(default_factory=list)
    columns_renamed: List[str] = field(default_factory=list)

    # Execution Logic & Lineage
    executed_sql: List[str] = field(default_factory=list)
    sql_hash: Optional[str] = None
    transformation_stack: List[str] = field(default_factory=list)
    config_snapshot: Optional[Dict[str, Any]] = None

    # Delta & Data Info
    delta_info: Optional[DeltaWriteInfo] = None
    data_diff: Optional[Dict[str, Any]] = None  # Stores diff summary (added/removed samples)
    environment: Optional[Dict[str, Any]] = None  # Captured execution environment

    # Source & Quality
    source_files: List[str] = field(default_factory=list)
    null_profile: Optional[Dict[str, float]] = None

    # Error info
    error_message: Optional[str] = None
    error_type: Optional[str] = None
    error_traceback: Optional[str] = None
    error_traceback_cleaned: Optional[str] = None
    validation_warnings: List[str] = field(default_factory=list)

    # Execution steps (troubleshooting)
    execution_steps: List[str] = field(default_factory=list)

    # Failed rows samples (per validation name -> sample rows)
    failed_rows_samples: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
    failed_rows_counts: Dict[str, int] = field(default_factory=dict)
    failed_rows_truncated: bool = False
    truncated_validations: List[str] = field(default_factory=list)

    # Retry history
    retry_history: List[Dict[str, Any]] = field(default_factory=list)

    # Historical Context (Catalog)
    historical_avg_rows: Optional[float] = None
    historical_avg_duration: Optional[float] = None

    # Anomaly Flags (Phase 1 - Triage)
    is_anomaly: bool = False
    anomaly_reasons: List[str] = field(default_factory=list)
    is_slow: bool = False  # 3x slower than historical avg
    has_row_anomaly: bool = False  # ±50% rows vs historical avg

    # Cross-run changes (Phase 3)
    changed_from_last_success: bool = False
    changes_detected: List[str] = field(default_factory=list)  # e.g. ["sql", "schema", "rows"]
    previous_sql_hash: Optional[str] = None
    previous_rows_out: Optional[int] = None
    previous_duration: Optional[float] = None
    previous_config_snapshot: Optional[Dict[str, Any]] = None  # For config diff viewer

    # Duration history for sparkline (last N runs)
    # Format: [{"run_id": "...", "duration": 1.5}, ...]
    duration_history: Optional[List[Dict[str, Any]]] = None

    # Timestamps
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    # Phase 5: Quality & Documentation
    description: Optional[str] = None  # From NodeConfig.description
    runbook_url: Optional[str] = None  # From NodeConfig.runbook_url
    column_statistics: Optional[Dict[str, Dict[str, Any]]] = None  # min/max/mean/stddev per column

    def calculate_row_change(self):
        """Calculate row count change metrics."""
        ctx = get_logging_context()
        if self.rows_in is not None and self.rows_out is not None:
            self.rows_change = self.rows_out - self.rows_in
            if self.rows_in > 0:
                self.rows_change_pct = (self.rows_change / self.rows_in) * 100
            else:
                self.rows_change_pct = 0.0 if self.rows_out == 0 else 100.0
            ctx.debug(
                "Row change calculated",
                node=self.node_name,
                rows_in=self.rows_in,
                rows_out=self.rows_out,
                change=self.rows_change,
                change_pct=self.rows_change_pct,
            )

    def calculate_schema_changes(self):
        """Calculate schema changes between input and output."""
        ctx = get_logging_context()
        if self.schema_in and self.schema_out:
            set_in = set(self.schema_in)
            set_out = set(self.schema_out)

            self.columns_added = list(set_out - set_in)
            self.columns_removed = list(set_in - set_out)

            if self.columns_added or self.columns_removed:
                ctx.debug(
                    "Schema changes detected",
                    node=self.node_name,
                    columns_added=self.columns_added,
                    columns_removed=self.columns_removed,
                )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        base_dict = {
            "node_name": self.node_name,
            "operation": self.operation,
            "status": self.status,
            "duration": self.duration,
            "rows_in": self.rows_in,
            "rows_out": self.rows_out,
            "rows_written": self.rows_written,
            "rows_change": self.rows_change,
            "rows_change_pct": self.rows_change_pct,
            "sample_in": self.sample_in,
            "sample_data": self.sample_data,
            "schema_in": self.schema_in,
            "schema_out": self.schema_out,
            "columns_added": self.columns_added,
            "columns_removed": self.columns_removed,
            "error_message": self.error_message,
            "error_type": self.error_type,
            "error_traceback": self.error_traceback,
            "error_traceback_cleaned": self.error_traceback_cleaned,
            "validation_warnings": self.validation_warnings,
            "execution_steps": self.execution_steps,
            "failed_rows_samples": self.failed_rows_samples,
            "failed_rows_counts": self.failed_rows_counts,
            "failed_rows_truncated": self.failed_rows_truncated,
            "truncated_validations": self.truncated_validations,
            "retry_history": self.retry_history,
            "historical_avg_rows": self.historical_avg_rows,
            "historical_avg_duration": self.historical_avg_duration,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "executed_sql": self.executed_sql,
            "sql_hash": self.sql_hash,
            "transformation_stack": self.transformation_stack,
            "config_snapshot": self.config_snapshot,
            "data_diff": self.data_diff,
            "environment": self.environment,
            "source_files": self.source_files,
            "null_profile": self.null_profile,
            "is_anomaly": self.is_anomaly,
            "anomaly_reasons": self.anomaly_reasons,
            "is_slow": self.is_slow,
            "has_row_anomaly": self.has_row_anomaly,
            "changed_from_last_success": self.changed_from_last_success,
            "changes_detected": self.changes_detected,
            "previous_sql_hash": self.previous_sql_hash,
            "previous_rows_out": self.previous_rows_out,
            "previous_duration": self.previous_duration,
            "previous_config_snapshot": self.previous_config_snapshot,
            "duration_history": self.duration_history,
            "description": self.description,
            "runbook_url": self.runbook_url,
            "column_statistics": self.column_statistics,
        }

        if self.delta_info:
            base_dict["delta_info"] = {
                "version": self.delta_info.version,
                "timestamp": (
                    self.delta_info.timestamp.isoformat() if self.delta_info.timestamp else None
                ),
                "operation": self.delta_info.operation,
                "operation_metrics": self.delta_info.operation_metrics,
                "read_version": self.delta_info.read_version,
            }

        return base_dict

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NodeExecutionMetadata":
        """Create instance from dictionary."""
        ctx = get_logging_context()
        ctx.debug(
            "Collecting node metadata from dict",
            node_name=data.get("node_name"),
        )

        delta_info = None
        if "delta_info" in data and data["delta_info"]:
            d_info = data["delta_info"]
            # Parse timestamp if present
            ts = None
            if d_info.get("timestamp"):
                try:
                    ts = datetime.fromisoformat(d_info["timestamp"])
                except ValueError:
                    pass

            delta_info = DeltaWriteInfo(
                version=d_info.get("version"),
                timestamp=ts,
                operation=d_info.get("operation"),
                operation_metrics=d_info.get("operation_metrics", {}),
                read_version=d_info.get("read_version"),
            )
            ctx.debug(
                "Delta version info extracted",
                node_name=data.get("node_name"),
                version=d_info.get("version"),
                operation=d_info.get("operation"),
            )

        # Filter out unknown keys to be safe
        valid_keys = cls.__annotations__.keys()
        clean_data = {k: v for k, v in data.items() if k in valid_keys}

        # Remove nested objects handled separately
        if "delta_info" in clean_data:
            del clean_data["delta_info"]

        # Log data diff collection if present
        if "data_diff" in data and data["data_diff"]:
            ctx.debug(
                "Data diff collected",
                node_name=data.get("node_name"),
                has_added=bool(data["data_diff"].get("added")),
                has_removed=bool(data["data_diff"].get("removed")),
            )

        return cls(delta_info=delta_info, **clean_data)


@dataclass
class PipelineStoryMetadata:
    """
    Complete metadata for a pipeline run story.

    Aggregates information about the entire pipeline execution including
    all node executions, overall status, and project context.
    """

    pipeline_name: str
    pipeline_layer: Optional[str] = None

    # Execution info
    run_id: str = field(default_factory=lambda: datetime.now().strftime("%Y%m%d_%H%M%S"))
    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
    completed_at: Optional[str] = None
    duration: float = 0.0

    # Status
    total_nodes: int = 0
    completed_nodes: int = 0
    failed_nodes: int = 0
    skipped_nodes: int = 0

    # Node details
    nodes: List[NodeExecutionMetadata] = field(default_factory=list)

    # Project context
    project: Optional[str] = None
    plant: Optional[str] = None
    asset: Optional[str] = None
    business_unit: Optional[str] = None

    # Story settings
    theme: str = "default"
    include_samples: bool = True
    max_sample_rows: int = 10

    # Graph data for interactive DAG (Phase 2)
    graph_data: Optional[Dict[str, Any]] = None

    # Cross-run comparison (Phase 3)
    change_summary: Optional[Dict[str, Any]] = None
    compared_to_run_id: Optional[str] = None
    git_info: Optional[Dict[str, str]] = None

    def add_node(self, node_metadata: NodeExecutionMetadata):
        """
        Add node execution metadata.

        Args:
            node_metadata: Metadata for the node execution
        """
        ctx = get_logging_context()
        self.nodes.append(node_metadata)
        self.total_nodes += 1

        if node_metadata.status == "success":
            self.completed_nodes += 1
        elif node_metadata.status == "failed":
            self.failed_nodes += 1
        elif node_metadata.status == "skipped":
            self.skipped_nodes += 1

        ctx.debug(
            "Node metadata added to story",
            pipeline=self.pipeline_name,
            node=node_metadata.node_name,
            status=node_metadata.status,
            total_nodes=self.total_nodes,
        )

    def get_success_rate(self) -> float:
        """Calculate success rate as percentage."""
        if self.total_nodes == 0:
            return 0.0
        return (self.completed_nodes / self.total_nodes) * 100

    def get_total_rows_processed(self) -> int:
        """Calculate total rows processed across all nodes."""
        total = 0
        for node in self.nodes:
            if node.rows_out is not None:
                total += node.rows_out
        return total

    def get_total_rows_in(self) -> int:
        """Calculate total input rows across all nodes."""
        total = 0
        for node in self.nodes:
            if node.rows_in is not None:
                total += node.rows_in
        return total

    def get_rows_dropped(self) -> int:
        """Calculate total rows dropped (filtered) across all nodes."""
        dropped = 0
        for node in self.nodes:
            if node.rows_in is not None and node.rows_out is not None:
                diff = node.rows_in - node.rows_out
                if diff > 0:
                    dropped += diff
        return dropped

    def get_final_output_rows(self) -> Optional[int]:
        """Get the row count from the last successful node (final output)."""
        for node in reversed(self.nodes):
            if node.status == "success" and node.rows_out is not None:
                return node.rows_out
        return None

    def get_alert_summary(self) -> Dict[str, Any]:
        """Get a summary suitable for alert payloads.

        Returns:
            Dictionary with key metrics for alerts
        """
        return {
            "total_rows_processed": self.get_total_rows_processed(),
            "total_rows_in": self.get_total_rows_in(),
            "rows_dropped": self.get_rows_dropped(),
            "final_output_rows": self.get_final_output_rows(),
            "success_rate": self.get_success_rate(),
            "completed_nodes": self.completed_nodes,
            "failed_nodes": self.failed_nodes,
            "skipped_nodes": self.skipped_nodes,
        }

    def get_failed_node_names(self) -> List[str]:
        """Get names of all failed nodes."""
        return [n.node_name for n in self.nodes if n.status == "failed"]

    def get_first_failure(self) -> Optional["NodeExecutionMetadata"]:
        """Get the first failed node (by execution order)."""
        for node in self.nodes:
            if node.status == "failed":
                return node
        return None

    def get_anomalous_nodes(self) -> List["NodeExecutionMetadata"]:
        """Get all nodes with anomalies (slow or row count deviation)."""
        return [n for n in self.nodes if n.is_anomaly]

    def get_run_health_summary(self) -> Dict[str, Any]:
        """Get run health summary for triage header.

        Returns:
            Dictionary with health info for quick triage
        """
        failed_names = self.get_failed_node_names()
        first_failure = self.get_first_failure()
        anomalous = self.get_anomalous_nodes()

        return {
            "has_failures": len(failed_names) > 0,
            "failed_count": len(failed_names),
            "failed_nodes": failed_names,
            "first_failure_node": first_failure.node_name if first_failure else None,
            "first_failure_error": first_failure.error_message if first_failure else None,
            "first_failure_type": first_failure.error_type if first_failure else None,
            "anomaly_count": len(anomalous),
            "anomalous_nodes": [n.node_name for n in anomalous],
            "overall_status": "failed" if failed_names else "success",
        }

    def get_data_quality_summary(self) -> Dict[str, Any]:
        """Get data quality summary across all nodes.

        Returns:
            Dictionary with quality metrics for Phase 5 Data Quality Summary card
        """
        total_validations_failed = 0
        total_failed_rows = 0
        top_null_columns: List[Dict[str, Any]] = []
        nodes_with_warnings = []

        for node in self.nodes:
            # Count validation warnings
            if node.validation_warnings:
                total_validations_failed += len(node.validation_warnings)
                nodes_with_warnings.append(node.node_name)

            # Sum failed rows
            if node.failed_rows_counts:
                for count in node.failed_rows_counts.values():
                    total_failed_rows += count

            # Collect null profile data
            if node.null_profile:
                for col, null_pct in node.null_profile.items():
                    if null_pct and null_pct > 0:
                        top_null_columns.append(
                            {
                                "node": node.node_name,
                                "column": col,
                                "null_pct": null_pct,
                            }
                        )

        # Sort by null percentage descending and take top 10
        top_null_columns.sort(key=lambda x: x["null_pct"], reverse=True)
        top_null_columns = top_null_columns[:10]

        return {
            "total_validations_failed": total_validations_failed,
            "total_failed_rows": total_failed_rows,
            "top_null_columns": top_null_columns,
            "nodes_with_warnings": nodes_with_warnings,
            "has_quality_issues": total_validations_failed > 0 or total_failed_rows > 0,
        }

    def get_freshness_info(self) -> Optional[Dict[str, Any]]:
        """Get data freshness indicator from date columns.

        Looks for max timestamp in date/timestamp columns from sample data.

        Returns:
            Dictionary with freshness info or None if not available
        """
        latest_timestamp = None
        latest_column = None
        latest_node = None

        date_patterns = ["date", "time", "timestamp", "created", "updated", "modified", "_at"]

        for node in reversed(self.nodes):  # Start from last node (output)
            if node.status != "success" or not node.sample_data:
                continue

            if not node.sample_data:
                continue

            for sample_row in node.sample_data:
                for col, val in sample_row.items():
                    # Check if column name suggests date/time
                    col_lower = col.lower()
                    if not any(p in col_lower for p in date_patterns):
                        continue

                    if val is None:
                        continue

                    # Try to parse as datetime
                    try:
                        from datetime import datetime as dt

                        if isinstance(val, str):
                            # Try common formats
                            for fmt in [
                                "%Y-%m-%d %H:%M:%S",
                                "%Y-%m-%dT%H:%M:%S",
                                "%Y-%m-%d",
                            ]:
                                try:
                                    parsed = dt.strptime(val[:19], fmt)
                                    if latest_timestamp is None or parsed > latest_timestamp:
                                        latest_timestamp = parsed
                                        latest_column = col
                                        latest_node = node.node_name
                                    break
                                except (ValueError, TypeError):
                                    continue
                    except Exception:
                        pass

        if latest_timestamp:
            return {
                "timestamp": latest_timestamp.isoformat(),
                "column": latest_column,
                "node": latest_node,
                "formatted": latest_timestamp.strftime("%Y-%m-%d %H:%M"),
            }
        return None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "pipeline_name": self.pipeline_name,
            "pipeline_layer": self.pipeline_layer,
            "run_id": self.run_id,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "duration": self.duration,
            "total_nodes": self.total_nodes,
            "completed_nodes": self.completed_nodes,
            "failed_nodes": self.failed_nodes,
            "skipped_nodes": self.skipped_nodes,
            "success_rate": self.get_success_rate(),
            "total_rows_processed": self.get_total_rows_processed(),
            "nodes": [node.to_dict() for node in self.nodes],
            "project": self.project,
            "plant": self.plant,
            "asset": self.asset,
            "business_unit": self.business_unit,
            "theme": self.theme,
            "graph_data": self.graph_data,
            "change_summary": self.change_summary,
            "compared_to_run_id": self.compared_to_run_id,
            "git_info": self.git_info,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PipelineStoryMetadata":
        """Create instance from dictionary."""
        nodes_data = data.get("nodes", [])
        nodes = [NodeExecutionMetadata.from_dict(n) for n in nodes_data]

        # Filter valid keys
        valid_keys = cls.__annotations__.keys()
        clean_data = {k: v for k, v in data.items() if k in valid_keys}

        # Handle nested
        if "nodes" in clean_data:
            del clean_data["nodes"]

        return cls(nodes=nodes, **clean_data)

    @classmethod
    def from_json(cls, path: str) -> "PipelineStoryMetadata":
        """Load from a JSON file."""
        import json

        with open(path, "r") as f:
            data = json.load(f)
        return cls.from_dict(data)