odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/metadata.py
@@ -0,0 +1,608 @@
+ """
+ Story Metadata Tracking
+ ========================
+
+ Tracks detailed metadata for pipeline execution stories.
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional
+
+ from odibi.utils.logging_context import get_logging_context
+
+
+ @dataclass
+ class DeltaWriteInfo:
+     """
+     Metadata specific to Delta Lake writes.
+     """
+
+     version: int
+     timestamp: Optional[datetime] = None
+     operation: Optional[str] = None
+     operation_metrics: Dict[str, Any] = field(default_factory=dict)
+     # For linking back to specific commit info if needed
+     read_version: Optional[int] = None  # The version we read FROM (if applicable)
+
+
+ @dataclass
+ class NodeExecutionMetadata:
+     """
+     Metadata for a single node execution.
+
+     Captures all relevant information about a node's execution including
+     performance metrics, data transformations, and error details.
+     """
+
+     node_name: str
+     operation: str
+     status: str  # "success", "failed", "skipped"
+     duration: float
+
+     # Data metrics
+     rows_in: Optional[int] = None
+     rows_out: Optional[int] = None
+     rows_written: Optional[int] = None
+     rows_change: Optional[int] = None
+     rows_change_pct: Optional[float] = None
+     sample_in: Optional[List[Dict[str, Any]]] = None
+     sample_data: Optional[List[Dict[str, Any]]] = None
+
+     # Schema tracking
+     schema_in: Optional[List[str]] = None
+     schema_out: Optional[List[str]] = None
+     columns_added: List[str] = field(default_factory=list)
+     columns_removed: List[str] = field(default_factory=list)
+     columns_renamed: List[str] = field(default_factory=list)
+
+     # Execution Logic & Lineage
+     executed_sql: List[str] = field(default_factory=list)
+     sql_hash: Optional[str] = None
+     transformation_stack: List[str] = field(default_factory=list)
+     config_snapshot: Optional[Dict[str, Any]] = None
+
+     # Delta & Data Info
+     delta_info: Optional[DeltaWriteInfo] = None
+     data_diff: Optional[Dict[str, Any]] = None  # Stores diff summary (added/removed samples)
+     environment: Optional[Dict[str, Any]] = None  # Captured execution environment
+
+     # Source & Quality
+     source_files: List[str] = field(default_factory=list)
+     null_profile: Optional[Dict[str, float]] = None
+
+     # Error info
+     error_message: Optional[str] = None
+     error_type: Optional[str] = None
+     error_traceback: Optional[str] = None
+     error_traceback_cleaned: Optional[str] = None
+     validation_warnings: List[str] = field(default_factory=list)
+
+     # Execution steps (troubleshooting)
+     execution_steps: List[str] = field(default_factory=list)
+
+     # Failed rows samples (per validation name -> sample rows)
+     failed_rows_samples: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
+     failed_rows_counts: Dict[str, int] = field(default_factory=dict)
+     failed_rows_truncated: bool = False
+     truncated_validations: List[str] = field(default_factory=list)
+
+     # Retry history
+     retry_history: List[Dict[str, Any]] = field(default_factory=list)
+
+     # Historical Context (Catalog)
+     historical_avg_rows: Optional[float] = None
+     historical_avg_duration: Optional[float] = None
+
+     # Anomaly Flags (Phase 1 - Triage)
+     is_anomaly: bool = False
+     anomaly_reasons: List[str] = field(default_factory=list)
+     is_slow: bool = False  # 3x slower than historical avg
+     has_row_anomaly: bool = False  # ±50% rows vs historical avg
+
+     # Cross-run changes (Phase 3)
+     changed_from_last_success: bool = False
+     changes_detected: List[str] = field(default_factory=list)  # e.g. ["sql", "schema", "rows"]
+     previous_sql_hash: Optional[str] = None
+     previous_rows_out: Optional[int] = None
+     previous_duration: Optional[float] = None
+     previous_config_snapshot: Optional[Dict[str, Any]] = None  # For config diff viewer
+
+     # Duration history for sparkline (last N runs)
+     # Format: [{"run_id": "...", "duration": 1.5}, ...]
+     duration_history: Optional[List[Dict[str, Any]]] = None
+
+     # Timestamps
+     started_at: Optional[str] = None
+     completed_at: Optional[str] = None
+
+     # Phase 5: Quality & Documentation
+     description: Optional[str] = None  # From NodeConfig.description
+     runbook_url: Optional[str] = None  # From NodeConfig.runbook_url
+     column_statistics: Optional[Dict[str, Dict[str, Any]]] = None  # min/max/mean/stddev per column
+
+     def calculate_row_change(self):
+         """Calculate row count change metrics."""
+         ctx = get_logging_context()
+         if self.rows_in is not None and self.rows_out is not None:
+             self.rows_change = self.rows_out - self.rows_in
+             if self.rows_in > 0:
+                 self.rows_change_pct = (self.rows_change / self.rows_in) * 100
+             else:
+                 self.rows_change_pct = 0.0 if self.rows_out == 0 else 100.0
+             ctx.debug(
+                 "Row change calculated",
+                 node=self.node_name,
+                 rows_in=self.rows_in,
+                 rows_out=self.rows_out,
+                 change=self.rows_change,
+                 change_pct=self.rows_change_pct,
+             )
+
+     def calculate_schema_changes(self):
+         """Calculate schema changes between input and output."""
+         ctx = get_logging_context()
+         if self.schema_in and self.schema_out:
+             set_in = set(self.schema_in)
+             set_out = set(self.schema_out)
+
+             self.columns_added = list(set_out - set_in)
+             self.columns_removed = list(set_in - set_out)
+
+             if self.columns_added or self.columns_removed:
+                 ctx.debug(
+                     "Schema changes detected",
+                     node=self.node_name,
+                     columns_added=self.columns_added,
+                     columns_removed=self.columns_removed,
+                 )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         base_dict = {
+             "node_name": self.node_name,
+             "operation": self.operation,
+             "status": self.status,
+             "duration": self.duration,
+             "rows_in": self.rows_in,
+             "rows_out": self.rows_out,
+             "rows_written": self.rows_written,
+             "rows_change": self.rows_change,
+             "rows_change_pct": self.rows_change_pct,
+             "sample_in": self.sample_in,
+             "sample_data": self.sample_data,
+             "schema_in": self.schema_in,
+             "schema_out": self.schema_out,
+             "columns_added": self.columns_added,
+             "columns_removed": self.columns_removed,
+             "error_message": self.error_message,
+             "error_type": self.error_type,
+             "error_traceback": self.error_traceback,
+             "error_traceback_cleaned": self.error_traceback_cleaned,
+             "validation_warnings": self.validation_warnings,
+             "execution_steps": self.execution_steps,
+             "failed_rows_samples": self.failed_rows_samples,
+             "failed_rows_counts": self.failed_rows_counts,
+             "failed_rows_truncated": self.failed_rows_truncated,
+             "truncated_validations": self.truncated_validations,
+             "retry_history": self.retry_history,
+             "historical_avg_rows": self.historical_avg_rows,
+             "historical_avg_duration": self.historical_avg_duration,
+             "started_at": self.started_at,
+             "completed_at": self.completed_at,
+             "executed_sql": self.executed_sql,
+             "sql_hash": self.sql_hash,
+             "transformation_stack": self.transformation_stack,
+             "config_snapshot": self.config_snapshot,
+             "data_diff": self.data_diff,
+             "environment": self.environment,
+             "source_files": self.source_files,
+             "null_profile": self.null_profile,
+             "is_anomaly": self.is_anomaly,
+             "anomaly_reasons": self.anomaly_reasons,
+             "is_slow": self.is_slow,
+             "has_row_anomaly": self.has_row_anomaly,
+             "changed_from_last_success": self.changed_from_last_success,
+             "changes_detected": self.changes_detected,
+             "previous_sql_hash": self.previous_sql_hash,
+             "previous_rows_out": self.previous_rows_out,
+             "previous_duration": self.previous_duration,
+             "previous_config_snapshot": self.previous_config_snapshot,
+             "duration_history": self.duration_history,
+             "description": self.description,
+             "runbook_url": self.runbook_url,
+             "column_statistics": self.column_statistics,
+         }
+
+         if self.delta_info:
+             base_dict["delta_info"] = {
+                 "version": self.delta_info.version,
+                 "timestamp": (
+                     self.delta_info.timestamp.isoformat() if self.delta_info.timestamp else None
+                 ),
+                 "operation": self.delta_info.operation,
+                 "operation_metrics": self.delta_info.operation_metrics,
+                 "read_version": self.delta_info.read_version,
+             }
+
+         return base_dict
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "NodeExecutionMetadata":
+         """Create instance from dictionary."""
+         ctx = get_logging_context()
+         ctx.debug(
+             "Collecting node metadata from dict",
+             node_name=data.get("node_name"),
+         )
+
+         delta_info = None
+         if "delta_info" in data and data["delta_info"]:
+             d_info = data["delta_info"]
+             # Parse timestamp if present
+             ts = None
+             if d_info.get("timestamp"):
+                 try:
+                     ts = datetime.fromisoformat(d_info["timestamp"])
+                 except ValueError:
+                     pass
+
+             delta_info = DeltaWriteInfo(
+                 version=d_info.get("version"),
+                 timestamp=ts,
+                 operation=d_info.get("operation"),
+                 operation_metrics=d_info.get("operation_metrics", {}),
+                 read_version=d_info.get("read_version"),
+             )
+             ctx.debug(
+                 "Delta version info extracted",
+                 node_name=data.get("node_name"),
+                 version=d_info.get("version"),
+                 operation=d_info.get("operation"),
+             )
+
+         # Filter out unknown keys to be safe
+         valid_keys = cls.__annotations__.keys()
+         clean_data = {k: v for k, v in data.items() if k in valid_keys}
+
+         # Remove nested objects handled separately
+         if "delta_info" in clean_data:
+             del clean_data["delta_info"]
+
+         # Log data diff collection if present
+         if "data_diff" in data and data["data_diff"]:
+             ctx.debug(
+                 "Data diff collected",
+                 node_name=data.get("node_name"),
+                 has_added=bool(data["data_diff"].get("added")),
+                 has_removed=bool(data["data_diff"].get("removed")),
+             )
+
+         return cls(delta_info=delta_info, **clean_data)
+
+
+ @dataclass
+ class PipelineStoryMetadata:
+     """
+     Complete metadata for a pipeline run story.
+
+     Aggregates information about the entire pipeline execution including
+     all node executions, overall status, and project context.
+     """
+
+     pipeline_name: str
+     pipeline_layer: Optional[str] = None
+
+     # Execution info
+     run_id: str = field(default_factory=lambda: datetime.now().strftime("%Y%m%d_%H%M%S"))
+     started_at: str = field(default_factory=lambda: datetime.now().isoformat())
+     completed_at: Optional[str] = None
+     duration: float = 0.0
+
+     # Status
+     total_nodes: int = 0
+     completed_nodes: int = 0
+     failed_nodes: int = 0
+     skipped_nodes: int = 0
+
+     # Node details
+     nodes: List[NodeExecutionMetadata] = field(default_factory=list)
+
+     # Project context
+     project: Optional[str] = None
+     plant: Optional[str] = None
+     asset: Optional[str] = None
+     business_unit: Optional[str] = None
+
+     # Story settings
+     theme: str = "default"
+     include_samples: bool = True
+     max_sample_rows: int = 10
+
+     # Graph data for interactive DAG (Phase 2)
+     graph_data: Optional[Dict[str, Any]] = None
+
+     # Cross-run comparison (Phase 3)
+     change_summary: Optional[Dict[str, Any]] = None
+     compared_to_run_id: Optional[str] = None
+     git_info: Optional[Dict[str, str]] = None
+
+     def add_node(self, node_metadata: NodeExecutionMetadata):
+         """
+         Add node execution metadata.
+
+         Args:
+             node_metadata: Metadata for the node execution
+         """
+         ctx = get_logging_context()
+         self.nodes.append(node_metadata)
+         self.total_nodes += 1
+
+         if node_metadata.status == "success":
+             self.completed_nodes += 1
+         elif node_metadata.status == "failed":
+             self.failed_nodes += 1
+         elif node_metadata.status == "skipped":
+             self.skipped_nodes += 1
+
+         ctx.debug(
+             "Node metadata added to story",
+             pipeline=self.pipeline_name,
+             node=node_metadata.node_name,
+             status=node_metadata.status,
+             total_nodes=self.total_nodes,
+         )
+
+     def get_success_rate(self) -> float:
+         """Calculate success rate as percentage."""
+         if self.total_nodes == 0:
+             return 0.0
+         return (self.completed_nodes / self.total_nodes) * 100
+
+     def get_total_rows_processed(self) -> int:
+         """Calculate total rows processed across all nodes."""
+         total = 0
+         for node in self.nodes:
+             if node.rows_out is not None:
+                 total += node.rows_out
+         return total
+
+     def get_total_rows_in(self) -> int:
+         """Calculate total input rows across all nodes."""
+         total = 0
+         for node in self.nodes:
+             if node.rows_in is not None:
+                 total += node.rows_in
+         return total
+
+     def get_rows_dropped(self) -> int:
+         """Calculate total rows dropped (filtered) across all nodes."""
+         dropped = 0
+         for node in self.nodes:
+             if node.rows_in is not None and node.rows_out is not None:
+                 diff = node.rows_in - node.rows_out
+                 if diff > 0:
+                     dropped += diff
+         return dropped
+
+     def get_final_output_rows(self) -> Optional[int]:
+         """Get the row count from the last successful node (final output)."""
+         for node in reversed(self.nodes):
+             if node.status == "success" and node.rows_out is not None:
+                 return node.rows_out
+         return None
+
+     def get_alert_summary(self) -> Dict[str, Any]:
+         """Get a summary suitable for alert payloads.
+
+         Returns:
+             Dictionary with key metrics for alerts
+         """
+         return {
+             "total_rows_processed": self.get_total_rows_processed(),
+             "total_rows_in": self.get_total_rows_in(),
+             "rows_dropped": self.get_rows_dropped(),
+             "final_output_rows": self.get_final_output_rows(),
+             "success_rate": self.get_success_rate(),
+             "completed_nodes": self.completed_nodes,
+             "failed_nodes": self.failed_nodes,
+             "skipped_nodes": self.skipped_nodes,
+         }
+
+     def get_failed_node_names(self) -> List[str]:
+         """Get names of all failed nodes."""
+         return [n.node_name for n in self.nodes if n.status == "failed"]
+
+     def get_first_failure(self) -> Optional["NodeExecutionMetadata"]:
+         """Get the first failed node (by execution order)."""
+         for node in self.nodes:
+             if node.status == "failed":
+                 return node
+         return None
+
+     def get_anomalous_nodes(self) -> List["NodeExecutionMetadata"]:
+         """Get all nodes with anomalies (slow or row count deviation)."""
+         return [n for n in self.nodes if n.is_anomaly]
+
+     def get_run_health_summary(self) -> Dict[str, Any]:
+         """Get run health summary for triage header.
+
+         Returns:
+             Dictionary with health info for quick triage
+         """
+         failed_names = self.get_failed_node_names()
+         first_failure = self.get_first_failure()
+         anomalous = self.get_anomalous_nodes()
+
+         return {
+             "has_failures": len(failed_names) > 0,
+             "failed_count": len(failed_names),
+             "failed_nodes": failed_names,
+             "first_failure_node": first_failure.node_name if first_failure else None,
+             "first_failure_error": first_failure.error_message if first_failure else None,
+             "first_failure_type": first_failure.error_type if first_failure else None,
+             "anomaly_count": len(anomalous),
+             "anomalous_nodes": [n.node_name for n in anomalous],
+             "overall_status": "failed" if failed_names else "success",
+         }
+
+     def get_data_quality_summary(self) -> Dict[str, Any]:
+         """Get data quality summary across all nodes.
+
+         Returns:
+             Dictionary with quality metrics for Phase 5 Data Quality Summary card
+         """
+         total_validations_failed = 0
+         total_failed_rows = 0
+         top_null_columns: List[Dict[str, Any]] = []
+         nodes_with_warnings = []
+
+         for node in self.nodes:
+             # Count validation warnings
+             if node.validation_warnings:
+                 total_validations_failed += len(node.validation_warnings)
+                 nodes_with_warnings.append(node.node_name)
+
+             # Sum failed rows
+             if node.failed_rows_counts:
+                 for count in node.failed_rows_counts.values():
+                     total_failed_rows += count
+
+             # Collect null profile data
+             if node.null_profile:
+                 for col, null_pct in node.null_profile.items():
+                     if null_pct and null_pct > 0:
+                         top_null_columns.append(
+                             {
+                                 "node": node.node_name,
+                                 "column": col,
+                                 "null_pct": null_pct,
+                             }
+                         )
+
+         # Sort by null percentage descending and take top 10
+         top_null_columns.sort(key=lambda x: x["null_pct"], reverse=True)
+         top_null_columns = top_null_columns[:10]
+
+         return {
+             "total_validations_failed": total_validations_failed,
+             "total_failed_rows": total_failed_rows,
+             "top_null_columns": top_null_columns,
+             "nodes_with_warnings": nodes_with_warnings,
+             "has_quality_issues": total_validations_failed > 0 or total_failed_rows > 0,
+         }
+
+     def get_freshness_info(self) -> Optional[Dict[str, Any]]:
+         """Get data freshness indicator from date columns.
+
+         Looks for max timestamp in date/timestamp columns from sample data.
+
+         Returns:
+             Dictionary with freshness info or None if not available
+         """
+         latest_timestamp = None
+         latest_column = None
+         latest_node = None
+
+         date_patterns = ["date", "time", "timestamp", "created", "updated", "modified", "_at"]
+
+         for node in reversed(self.nodes):  # Start from last node (output)
+             if node.status != "success" or not node.sample_data:
+                 continue
+
+             if not node.sample_data:
+                 continue
+
+             for sample_row in node.sample_data:
+                 for col, val in sample_row.items():
+                     # Check if column name suggests date/time
+                     col_lower = col.lower()
+                     if not any(p in col_lower for p in date_patterns):
+                         continue
+
+                     if val is None:
+                         continue
+
+                     # Try to parse as datetime
+                     try:
+                         from datetime import datetime as dt
+
+                         if isinstance(val, str):
+                             # Try common formats
+                             for fmt in [
+                                 "%Y-%m-%d %H:%M:%S",
+                                 "%Y-%m-%dT%H:%M:%S",
+                                 "%Y-%m-%d",
+                             ]:
+                                 try:
+                                     parsed = dt.strptime(val[:19], fmt)
+                                     if latest_timestamp is None or parsed > latest_timestamp:
+                                         latest_timestamp = parsed
+                                         latest_column = col
+                                         latest_node = node.node_name
+                                     break
+                                 except (ValueError, TypeError):
+                                     continue
+                     except Exception:
+                         pass
+
+         if latest_timestamp:
+             return {
+                 "timestamp": latest_timestamp.isoformat(),
+                 "column": latest_column,
+                 "node": latest_node,
+                 "formatted": latest_timestamp.strftime("%Y-%m-%d %H:%M"),
+             }
+         return None
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "pipeline_name": self.pipeline_name,
+             "pipeline_layer": self.pipeline_layer,
+             "run_id": self.run_id,
+             "started_at": self.started_at,
+             "completed_at": self.completed_at,
+             "duration": self.duration,
+             "total_nodes": self.total_nodes,
+             "completed_nodes": self.completed_nodes,
+             "failed_nodes": self.failed_nodes,
+             "skipped_nodes": self.skipped_nodes,
+             "success_rate": self.get_success_rate(),
+             "total_rows_processed": self.get_total_rows_processed(),
+             "nodes": [node.to_dict() for node in self.nodes],
+             "project": self.project,
+             "plant": self.plant,
+             "asset": self.asset,
+             "business_unit": self.business_unit,
+             "theme": self.theme,
+             "graph_data": self.graph_data,
+             "change_summary": self.change_summary,
+             "compared_to_run_id": self.compared_to_run_id,
+             "git_info": self.git_info,
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "PipelineStoryMetadata":
+         """Create instance from dictionary."""
+         nodes_data = data.get("nodes", [])
+         nodes = [NodeExecutionMetadata.from_dict(n) for n in nodes_data]
+
+         # Filter valid keys
+         valid_keys = cls.__annotations__.keys()
+         clean_data = {k: v for k, v in data.items() if k in valid_keys}
+
+         # Handle nested
+         if "nodes" in clean_data:
+             del clean_data["nodes"]
+
+         return cls(nodes=nodes, **clean_data)
+
+     @classmethod
+     def from_json(cls, path: str) -> "PipelineStoryMetadata":
+         """Load from a JSON file."""
+         import json
+
+         with open(path, "r") as f:
+             data = json.load(f)
+         return cls.from_dict(data)
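
For orientation, a minimal usage sketch of the two dataclasses added in this file. It assumes the module is importable as odibi.story.metadata (matching the path in the file listing above) and that get_logging_context() works with its defaults; the node and pipeline names are invented for illustration.

from odibi.story.metadata import NodeExecutionMetadata, PipelineStoryMetadata

# Record one node execution and derive change metrics from the raw counts/schemas.
node = NodeExecutionMetadata(
    node_name="clean_orders",  # hypothetical node name
    operation="transform",
    status="success",
    duration=12.4,
    rows_in=1000,
    rows_out=950,
    schema_in=["order_id", "amount", "raw_ts"],
    schema_out=["order_id", "amount", "order_date"],
)
node.calculate_row_change()      # sets rows_change=-50, rows_change_pct=-5.0
node.calculate_schema_changes()  # sets columns_added=["order_date"], columns_removed=["raw_ts"]

# Aggregate into a pipeline story, then round-trip through the dict form.
story = PipelineStoryMetadata(pipeline_name="orders_silver", pipeline_layer="silver")
story.add_node(node)

payload = story.to_dict()  # JSON-serializable; unknown keys are filtered out on load
restored = PipelineStoryMetadata.from_dict(payload)
print(restored.get_success_rate())   # 100.0
print(restored.get_alert_summary())  # row totals, rows dropped, node status counts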