odibi-2.5.0-py3-none-any.whl

Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/generator.py (new file)
@@ -0,0 +1,1431 @@
"""Story generator for pipeline execution documentation."""

import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

from odibi.node import NodeResult
from odibi.story.metadata import DeltaWriteInfo, NodeExecutionMetadata, PipelineStoryMetadata
from odibi.story.renderers import HTMLStoryRenderer, JSONStoryRenderer
from odibi.utils.logging_context import get_logging_context


# Custom class to force block style for multiline strings
class MultilineString(str):
    """String subclass to force YAML block scalar style."""

    pass


def multiline_presenter(dumper, data):
    """YAML representer for MultilineString."""
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")


yaml.add_representer(MultilineString, multiline_presenter)

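# Illustrative sketch (not part of the original module): once the representer above
# is registered, multiline values dump as YAML block scalars, roughly:
#
#     >>> yaml.dump({"sql": MultilineString("SELECT *\nFROM t")})
#     'sql: |-\n  SELECT *\n  FROM t\n'
#
# A plain str with embedded newlines would instead be emitted as a quoted scalar with
# escaped "\n" characters, which is what _clean_config_for_dump (below) avoids.
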
class StoryGenerator:
    """Generates HTML and JSON documentation of pipeline execution."""

    def __init__(
        self,
        pipeline_name: str,
        max_sample_rows: int = 10,
        output_path: str = "stories/",
        retention_days: int = 30,
        retention_count: int = 100,
        storage_options: Optional[Dict[str, Any]] = None,
        catalog_manager: Optional[Any] = None,
    ):
        """Initialize story generator.

        Args:
            pipeline_name: Name of the pipeline
            max_sample_rows: Maximum rows to show in samples
            output_path: Directory for story output
            retention_days: Days to keep stories
            retention_count: Max number of stories to keep
            storage_options: Credentials for remote storage (e.g. ADLS)
            catalog_manager: System Catalog Manager for historical context
        """
        self.pipeline_name = pipeline_name
        self.max_sample_rows = max_sample_rows
        self.output_path_str = output_path  # Store original string
        self.is_remote = "://" in output_path
        self.storage_options = storage_options or {}
        self.catalog_manager = catalog_manager

        # Track last generated story for alert enrichment
        self._last_story_path: Optional[str] = None
        self._last_metadata: Optional[PipelineStoryMetadata] = None

        if not self.is_remote:
            self.output_path = Path(output_path)
            self.output_path.mkdir(parents=True, exist_ok=True)
        else:
            self.output_path = None  # Handle remote paths differently

        self.retention_days = retention_days
        self.retention_count = retention_count

        ctx = get_logging_context()
        ctx.debug(
            "StoryGenerator initialized",
            pipeline=pipeline_name,
            output_path=output_path,
            is_remote=self.is_remote,
            retention_days=retention_days,
            retention_count=retention_count,
        )

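    # Usage sketch (illustrative, based on the signature above; the names and the
    # remote path are placeholders, not part of the package):
    #
    #     generator = StoryGenerator(
    #         pipeline_name="sales_daily",
    #         output_path="stories/",        # or an "abfss://..." URL for remote storage
    #         retention_days=30,
    #         retention_count=100,
    #     )
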
    def generate(
        self,
        node_results: Dict[str, NodeResult],
        completed: List[str],
        failed: List[str],
        skipped: List[str],
        duration: float,
        start_time: str,
        end_time: str,
        context: Any = None,
        config: Optional[Dict[str, Any]] = None,
        graph_data: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Generate story HTML and JSON.

        Args:
            node_results: Dictionary of node name -> NodeResult
            completed: List of completed node names
            failed: List of failed node names
            skipped: List of skipped node names
            duration: Total pipeline duration
            start_time: ISO timestamp of start
            end_time: ISO timestamp of end
            context: Optional context to access intermediate DataFrames
            config: Optional pipeline configuration snapshot
            graph_data: Optional graph data dict with nodes/edges for DAG visualization

        Returns:
            Path to generated HTML story file
        """
        ctx = get_logging_context()
        ctx.debug(
            "Generating story",
            pipeline=self.pipeline_name,
            node_count=len(node_results),
            completed=len(completed),
            failed=len(failed),
            skipped=len(skipped),
        )

        # 1. Build metadata object
        metadata = PipelineStoryMetadata(
            pipeline_name=self.pipeline_name,
            pipeline_layer=config.get("layer") if config else None,
            started_at=start_time,
            completed_at=end_time,
            duration=duration,
            total_nodes=len(completed) + len(failed) + len(skipped),
            completed_nodes=len(completed),
            failed_nodes=len(failed),
            skipped_nodes=len(skipped),
            project=config.get("project") if config else None,
            plant=config.get("plant") if config else None,
            asset=config.get("asset") if config else None,
            business_unit=config.get("business_unit") if config else None,
        )

        # Add Git Info
        # git_info = self._get_git_info()
        # We can't easily add arbitrary fields to the dataclass without changing it,
        # but we can rely on the fact that it's just metadata.
        # For now, skip adding git info to the core model or extend it later.

        # Process all nodes in order
        all_nodes = completed + failed + skipped

        # If we have config, try to follow config order instead of list order
        if config and "nodes" in config:
            config_order = [n["name"] for n in config["nodes"]]
            # Sort all_nodes based on index in config_order
            all_nodes.sort(key=lambda x: config_order.index(x) if x in config_order else 999)

        for node_name in all_nodes:
            if node_name in node_results:
                result = node_results[node_name]
                node_meta = self._convert_result_to_metadata(result, node_name)

                # Status overrides (result object has success bool, but we have lists)
                if node_name in failed:
                    node_meta.status = "failed"
                elif node_name in skipped:
                    node_meta.status = "skipped"
                else:
                    node_meta.status = "success"

                metadata.nodes.append(node_meta)
            else:
                # Skipped node without result
                metadata.nodes.append(
                    NodeExecutionMetadata(
                        node_name=node_name, operation="skipped", status="skipped", duration=0.0
                    )
                )

            # Enrich with Historical Context (if available)
            current_node = metadata.nodes[-1]
            if self.catalog_manager:
                try:
                    avg_rows = self.catalog_manager.get_average_volume(node_name)
                    avg_duration = self.catalog_manager.get_average_duration(node_name)

                    current_node.historical_avg_rows = avg_rows
                    current_node.historical_avg_duration = avg_duration

                    # Compute anomalies (Phase 1 - Triage)
                    self._compute_anomalies(current_node)
                except Exception as e:
                    ctx = get_logging_context()
                    ctx.debug(
                        "Failed to fetch historical metrics for node",
                        node_name=node_name,
                        error=str(e),
                    )

        # 2. Build graph data for interactive DAG (Phase 2)
        metadata.graph_data = self._build_graph_data(metadata, graph_data, config)

        # 3. Compare with last successful run (Phase 3)
        self._compare_with_last_success(metadata)

        # 4. Add git info (Phase 3)
        metadata.git_info = self._get_git_info()

        # 5. Render outputs
        timestamp_obj = datetime.now()
        date_str = timestamp_obj.strftime("%Y-%m-%d")
        time_str = timestamp_obj.strftime("%H-%M-%S")

        # Create structured path: {pipeline_name}/{date}/
        relative_folder = f"{self.pipeline_name}/{date_str}"

        if self.is_remote:
            base_path = f"{self.output_path_str.rstrip('/')}/{relative_folder}"
        else:
            base_path = self.output_path / relative_folder
            base_path.mkdir(parents=True, exist_ok=True)

        base_filename = f"run_{time_str}"

        # Prepare renderers
        html_renderer = HTMLStoryRenderer()
        json_renderer = JSONStoryRenderer()

        # Paths
        if self.is_remote:
            html_path = f"{base_path}/{base_filename}.html"
            json_path = f"{base_path}/{base_filename}.json"
        else:
            html_path = str(base_path / f"{base_filename}.html")
            json_path = str(base_path / f"{base_filename}.json")

        # Render HTML
        html_content = html_renderer.render(metadata)

        # Render JSON
        json_content = json_renderer.render(metadata)

        # Write files
        try:
            if self.is_remote:
                self._write_remote(html_path, html_content)
                self._write_remote(json_path, json_content)
            else:
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(html_content)
                with open(json_path, "w", encoding="utf-8") as f:
                    f.write(json_content)

            ctx.debug(
                "Story files written",
                html_path=html_path,
                html_size=len(html_content),
                json_path=json_path,
                json_size=len(json_content),
            )
        except Exception as e:
            ctx.error(
                "Failed to write story files",
                error=str(e),
                html_path=html_path,
                json_path=json_path,
            )
            raise

        # Store for alert enrichment
        self._last_story_path = html_path
        self._last_metadata = metadata

        # Cleanup and generate index
        self.cleanup()
        self._generate_pipeline_index()

        ctx.info(
            "Story generated",
            path=html_path,
            nodes=len(metadata.nodes),
            success_rate=metadata.get_success_rate(),
        )

        return html_path

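    # Illustrative call (the variable names are placeholders). The returned path follows
    # the layout built above: {output_path}/{pipeline_name}/{YYYY-MM-DD}/run_{HH-MM-SS}.html,
    # with a sibling .json written next to it:
    #
    #     html_path = generator.generate(
    #         node_results=results,          # Dict[str, NodeResult] from the pipeline run
    #         completed=["load", "clean"],
    #         failed=[],
    #         skipped=[],
    #         duration=12.3,
    #         start_time=start_iso,
    #         end_time=end_iso,
    #         config=pipeline_config,        # optional snapshot of the pipeline YAML
    #     )
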
    def get_alert_summary(self) -> Dict[str, Any]:
        """Get a summary of the last generated story for alerts.

        Returns:
            Dictionary with metrics suitable for alert payloads
        """
        if not self._last_metadata:
            return {}

        summary = self._last_metadata.get_alert_summary()
        summary["story_path"] = self._last_story_path
        return summary

    def _get_duration_history(self, node_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Get duration history for a node across recent runs.

        Args:
            node_name: The node name to get history for
            limit: Maximum number of runs to include

        Returns:
            List of {"run_id": "...", "duration": 1.5, "started_at": "..."} dicts
        """
        import json

        ctx = get_logging_context()

        if self.is_remote:
            ctx.debug("Duration history not yet supported for remote storage")
            return []

        if self.output_path is None:
            return []

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return []

        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        history = []
        for json_path in json_files[: limit + 1]:  # +1 to skip current run if it exists
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                for node_data in data.get("nodes", []):
                    if node_data.get("node_name") == node_name:
                        history.append(
                            {
                                "run_id": data.get("run_id", "unknown"),
                                "duration": node_data.get("duration", 0),
                                "started_at": data.get("started_at", ""),
                            }
                        )
                        break
            except Exception as e:
                ctx.debug(f"Failed to load run for duration history: {json_path}, error: {e}")
                continue

        return history[:limit]

    def _find_last_successful_run(self) -> Optional[Dict[str, Any]]:
        """Find the most recent successful run's JSON data.

        Returns:
            Dictionary of the last successful run metadata, or None
        """
        import json

        ctx = get_logging_context()

        if self.is_remote:
            return self._find_last_successful_run_remote()

        if self.output_path is None:
            return None

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return None

        # Find all JSON files, sorted by path (date/time order) descending
        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        # Find the most recent successful run
        for json_path in json_files:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Check if this run was successful (no failed nodes)
                if data.get("failed_nodes", 0) == 0:
                    ctx.debug(
                        "Found last successful run",
                        path=str(json_path),
                        run_id=data.get("run_id"),
                    )
                    return data
            except Exception as e:
                ctx.debug(f"Failed to load story JSON: {json_path}, error: {e}")
                continue

        return None

    def _find_last_successful_run_remote(self) -> Optional[Dict[str, Any]]:
        """Find the most recent successful run's JSON data from remote storage.

        Uses fsspec to list and read JSON files from Azure Blob, ADLS, S3, etc.

        Returns:
            Dictionary of the last successful run metadata, or None
        """
        import json

        ctx = get_logging_context()

        try:
            import fsspec
        except ImportError:
            ctx.debug("fsspec not available, skipping remote comparison")
            return None

        pipeline_path = f"{self.output_path_str.rstrip('/')}/{self.pipeline_name}"

        try:
            fs = fsspec.filesystem(pipeline_path.split("://")[0], **self.storage_options)

            # List all JSON files recursively under pipeline directory
            # fsspec glob pattern for recursive search
            glob_pattern = f"{pipeline_path.split('://', 1)[1]}/**/*.json"
            json_files = fs.glob(glob_pattern)

            if not json_files:
                ctx.debug("No previous story JSON files found", path=pipeline_path)
                return None

            # Sort by path descending (date/time order due to folder structure)
            json_files = sorted(json_files, reverse=True)

            ctx.debug(
                "Found story JSON files for comparison",
                count=len(json_files),
                path=pipeline_path,
            )

            # Find the most recent successful run
            protocol = pipeline_path.split("://")[0]
            for json_path in json_files:
                full_path = f"{protocol}://{json_path}"
                try:
                    with fsspec.open(full_path, "r", encoding="utf-8", **self.storage_options) as f:
                        data = json.load(f)

                    # Check if this run was successful (no failed nodes)
                    if data.get("failed_nodes", 0) == 0:
                        ctx.debug(
                            "Found last successful run (remote)",
                            path=full_path,
                            run_id=data.get("run_id"),
                        )
                        return data
                except Exception as e:
                    ctx.debug(f"Failed to load remote story JSON: {full_path}, error: {e}")
                    continue

        except Exception as e:
            ctx.warning(
                "Failed to search remote storage for previous runs",
                error=str(e),
                path=pipeline_path,
            )

        return None

    def _compare_with_last_success(self, metadata: PipelineStoryMetadata) -> None:
        """Compare current run with last successful run and populate change_summary."""
        ctx = get_logging_context()

        # Collect duration history for all nodes (before comparison)
        for node in metadata.nodes:
            history = self._get_duration_history(node.node_name, limit=10)
            if history:
                node.duration_history = history

        last_success = self._find_last_successful_run()
        if not last_success:
            ctx.debug("No previous successful run found for comparison")
            return

        metadata.compared_to_run_id = last_success.get("run_id")

        # Build lookup for previous run's nodes
        prev_nodes = {n["node_name"]: n for n in last_success.get("nodes", [])}

        # Track changes
        sql_changed = []
        schema_changed = []
        rows_changed = []
        newly_failing = []
        duration_changed = []

        for node in metadata.nodes:
            prev = prev_nodes.get(node.node_name)
            if not prev:
                # New node, not in previous run
                continue

            changes = []

            # Compare SQL hash
            if node.sql_hash and prev.get("sql_hash"):
                if node.sql_hash != prev["sql_hash"]:
                    changes.append("sql")
                    sql_changed.append(node.node_name)
                    node.previous_sql_hash = prev["sql_hash"]

            # Compare schema (output)
            curr_schema = set(node.schema_out or [])
            prev_schema = set(prev.get("schema_out") or [])
            if curr_schema != prev_schema:
                changes.append("schema")
                schema_changed.append(node.node_name)

            # Compare row counts (significant change = >20%)
            if node.rows_out is not None and prev.get("rows_out") is not None:
                prev_rows = prev["rows_out"]
                if prev_rows > 0:
                    pct_change = abs(node.rows_out - prev_rows) / prev_rows
                    if pct_change > 0.2:
                        changes.append("rows")
                        rows_changed.append(node.node_name)
                        node.previous_rows_out = prev_rows

            # Compare duration (significant change = 2x slower)
            if node.duration and prev.get("duration"):
                prev_dur = prev["duration"]
                if prev_dur > 0 and node.duration >= prev_dur * 2:
                    changes.append("duration")
                    duration_changed.append(node.node_name)
                    node.previous_duration = prev_dur

            # Check if newly failing
            if node.status == "failed" and prev.get("status") == "success":
                newly_failing.append(node.node_name)

            # Capture previous config snapshot for diff viewer
            if prev.get("config_snapshot"):
                node.previous_config_snapshot = prev["config_snapshot"]

            if changes:
                node.changed_from_last_success = True
                node.changes_detected = changes

        # Build summary
        metadata.change_summary = {
            "has_changes": bool(sql_changed or schema_changed or rows_changed or newly_failing),
            "sql_changed_count": len(sql_changed),
            "sql_changed_nodes": sql_changed,
            "schema_changed_count": len(schema_changed),
            "schema_changed_nodes": schema_changed,
            "rows_changed_count": len(rows_changed),
            "rows_changed_nodes": rows_changed,
            "duration_changed_count": len(duration_changed),
            "duration_changed_nodes": duration_changed,
            "newly_failing_count": len(newly_failing),
            "newly_failing_nodes": newly_failing,
            "compared_to_run_id": metadata.compared_to_run_id,
        }

        ctx.debug(
            "Cross-run comparison complete",
            compared_to=metadata.compared_to_run_id,
            sql_changed=len(sql_changed),
            schema_changed=len(schema_changed),
            newly_failing=len(newly_failing),
        )

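    # Worked example of the thresholds above (values are hypothetical): a node that wrote
    # 1,000 rows in the last successful run and 1,250 now changes by 25% (> 20%), so it is
    # flagged under "rows"; a node that took 4s before and 9s now is >= 2x slower, so it is
    # flagged under "duration". A 1,100-row run (10% change) would not be flagged.
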
    def _infer_layer_from_path(self, path: str) -> str:
        """Infer the data layer from a path string.

        Uses common naming patterns to identify bronze/silver/gold/raw layers.
        """
        path_lower = path.lower()
        if "bronze" in path_lower:
            return "bronze"
        elif "silver" in path_lower:
            return "silver"
        elif "gold" in path_lower:
            return "gold"
        elif "raw" in path_lower:
            return "raw"
        elif "staging" in path_lower:
            return "staging"
        elif "semantic" in path_lower:
            return "semantic"
        return "source"

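    # Examples (the paths are hypothetical):
    #     _infer_layer_from_path("abfss://lake/silver/sales")  -> "silver"
    #     _infer_layer_from_path("/mnt/raw/erp/orders.csv")    -> "raw"
    #     _infer_layer_from_path("dbo.DimCustomer")            -> "source"  (no pattern match)
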
    def _build_graph_data(
        self,
        metadata: PipelineStoryMetadata,
        graph_data: Optional[Dict[str, Any]],
        config: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Build enriched graph data for interactive DAG visualization.

        Combines static graph structure with runtime execution metadata.
        """
        ctx = get_logging_context()

        # Build node lookup for runtime data
        node_lookup = {n.node_name: n for n in metadata.nodes}

        # Debug: Log which path we're taking
        path_taken = (
            "graph_data"
            if graph_data
            else ("config" if config and "nodes" in config else "fallback")
        )
        ctx.debug(
            "Building graph data",
            path=path_taken,
            has_graph_data=bool(graph_data),
            has_config=bool(config),
            config_has_nodes=bool(config and "nodes" in config),
            metadata_node_count=len(metadata.nodes),
        )

        # Start with provided graph_data or build from config
        if graph_data:
            nodes = graph_data.get("nodes", [])
            edges = graph_data.get("edges", [])
        elif config and "nodes" in config:
            nodes = []
            edges = []
            source_nodes = set()  # Track source tables for lineage
            target_nodes = set()  # Track target tables for lineage

            for node_cfg in config["nodes"]:
                node_name = node_cfg["name"]
                nodes.append(
                    {
                        "id": node_name,
                        "label": node_name,
                        "type": node_cfg.get("type", "transform"),
                        "layer": metadata.pipeline_layer or "unknown",
                    }
                )
                # Check depends_on for intra-pipeline dependencies
                for dep in node_cfg.get("depends_on", []):
                    edges.append({"source": dep, "target": node_name})

                # Check inputs block for cross-pipeline dependencies
                inputs = node_cfg.get("inputs", {})
                if inputs:
                    for input_name, input_val in inputs.items():
                        if isinstance(input_val, str) and input_val.startswith("$"):
                            ref = input_val[1:]
                            if "." in ref:
                                pipeline_name, node_ref = ref.split(".", 1)
                                edges.append(
                                    {
                                        "source": node_ref,
                                        "target": node_name,
                                        "source_pipeline": pipeline_name,
                                    }
                                )
                            else:
                                edges.append({"source": ref, "target": node_name})

                # Add read path as source for lineage
                read_cfg = node_cfg.get("read", {})
                if read_cfg:
                    read_path = read_cfg.get("path") or read_cfg.get("table")
                    if read_path:
                        source_nodes.add(read_path)
                        edges.append({"from": read_path, "to": node_name})

                # Add write path as target for lineage
                write_cfg = node_cfg.get("write", {})
                if write_cfg:
                    write_path = write_cfg.get("path") or write_cfg.get("table")
                    if write_path:
                        target_nodes.add(write_path)
                        edges.append({"from": node_name, "to": write_path})

            # Add source table nodes (inputs)
            for source in source_nodes:
                if not any(n["id"] == source for n in nodes):
                    nodes.append(
                        {
                            "id": source,
                            "label": source,
                            "type": "source",
                            "layer": self._infer_layer_from_path(source),
                        }
                    )

            # Add target table nodes (outputs)
            for target in target_nodes:
                if not any(n["id"] == target for n in nodes):
                    nodes.append(
                        {
                            "id": target,
                            "label": target,
                            "type": "table",
                            "layer": metadata.pipeline_layer or "unknown",
                        }
                    )
        else:
            # Fallback: build from metadata nodes
            nodes = [
                {
                    "id": n.node_name,
                    "label": n.node_name,
                    "layer": metadata.pipeline_layer or "unknown",
                }
                for n in metadata.nodes
            ]
            edges = []
            source_nodes = set()
            target_nodes = set()

            for n in metadata.nodes:
                # Debug: Log config_snapshot contents for each node
                ctx.debug(
                    "Fallback path: checking node config_snapshot",
                    node_name=n.node_name,
                    has_config_snapshot=bool(n.config_snapshot),
                    config_snapshot_keys=(
                        list(n.config_snapshot.keys()) if n.config_snapshot else []
                    ),
                    has_inputs=bool(n.config_snapshot and n.config_snapshot.get("inputs")),
                    inputs_value=n.config_snapshot.get("inputs") if n.config_snapshot else None,
                    has_depends_on=bool(n.config_snapshot and n.config_snapshot.get("depends_on")),
                )

                # Check depends_on for intra-pipeline dependencies
                if n.config_snapshot and n.config_snapshot.get("depends_on"):
                    for dep in n.config_snapshot["depends_on"]:
                        edges.append({"source": dep, "target": n.node_name})

                # Check inputs block for cross-pipeline dependencies
                if n.config_snapshot and n.config_snapshot.get("inputs"):
                    for input_name, input_val in n.config_snapshot["inputs"].items():
                        ctx.debug(
                            "Processing input reference",
                            node_name=n.node_name,
                            input_name=input_name,
                            input_val=input_val,
                            is_string=isinstance(input_val, str),
                            starts_with_dollar=isinstance(input_val, str)
                            and input_val.startswith("$"),
                        )
                        # Handle $pipeline.node reference format
                        if isinstance(input_val, str) and input_val.startswith("$"):
                            # Format: $pipeline_name.node_name
                            ref = input_val[1:]  # Remove $
                            if "." in ref:
                                pipeline_name, node_ref = ref.split(".", 1)
                                edges.append(
                                    {
                                        "source": node_ref,
                                        "target": n.node_name,
                                        "source_pipeline": pipeline_name,
                                    }
                                )
                                ctx.debug(
                                    "Added cross-pipeline edge",
                                    source=node_ref,
                                    target=n.node_name,
                                    source_pipeline=pipeline_name,
                                )
                            else:
                                edges.append({"source": ref, "target": n.node_name})
                                ctx.debug(
                                    "Added same-pipeline edge from inputs",
                                    source=ref,
                                    target=n.node_name,
                                )

                # Add read/write paths for lineage from config_snapshot
                if n.config_snapshot:
                    read_cfg = n.config_snapshot.get("read", {})
                    if read_cfg:
                        read_path = read_cfg.get("path") or read_cfg.get("table")
                        if read_path:
                            source_nodes.add(read_path)
                            edges.append({"from": read_path, "to": n.node_name})

                    write_cfg = n.config_snapshot.get("write", {})
                    if write_cfg:
                        write_path = write_cfg.get("path") or write_cfg.get("table")
                        if write_path:
                            target_nodes.add(write_path)
                            edges.append({"from": n.node_name, "to": write_path})

            # Add source table nodes
            for source in source_nodes:
                if not any(n["id"] == source for n in nodes):
                    nodes.append(
                        {
                            "id": source,
                            "label": source,
                            "type": "source",
                            "layer": self._infer_layer_from_path(source),
                        }
                    )

            # Add target table nodes
            for target in target_nodes:
                if not any(n["id"] == target for n in nodes):
                    nodes.append(
                        {
                            "id": target,
                            "label": target,
                            "type": "table",
                            "layer": metadata.pipeline_layer or "unknown",
                        }
                    )

        # Collect all node IDs that exist in the current pipeline
        existing_node_ids = {node["id"] for node in nodes}

        # Find cross-pipeline dependencies (edge sources that don't exist as nodes)
        # Build a map of node_ref -> pipeline_name for labeling
        external_node_pipelines = {}
        cross_pipeline_deps = set()
        for edge in edges:
            # Support both "source"/"target" and "from"/"to" formats
            edge_source = edge.get("source") or edge.get("from", "")
            if edge_source and edge_source not in existing_node_ids:
                cross_pipeline_deps.add(edge_source)
                # Track the pipeline name if available
                if "source_pipeline" in edge:
                    external_node_pipelines[edge_source] = edge["source_pipeline"]

        # Debug: Log summary before adding external nodes
        ctx.debug(
            "Graph data summary",
            total_nodes=len(nodes),
            total_edges=len(edges),
            existing_node_ids=list(existing_node_ids),
            edge_sources=[e.get("source") or e.get("from", "") for e in edges],
            cross_pipeline_deps=list(cross_pipeline_deps),
        )

        # Add placeholder nodes for cross-pipeline dependencies
        for dep_id in cross_pipeline_deps:
            pipeline_name = external_node_pipelines.get(dep_id)
            label = f"{pipeline_name}.{dep_id}" if pipeline_name else dep_id
            ctx.debug(
                "Adding external node for cross-pipeline dependency",
                dep_id=dep_id,
                pipeline_name=pipeline_name,
                label=label,
            )
            nodes.append(
                {
                    "id": dep_id,
                    "label": label,
                    "type": "external",
                    "source_pipeline": pipeline_name,
                }
            )

        # Build dependency lookup: node_id -> list of source nodes (with pipeline info)
        node_dependencies = {}
        for edge in edges:
            # Support both "source"/"target" and "from"/"to" formats
            target = edge.get("target") or edge.get("to", "")
            source = edge.get("source") or edge.get("from", "")
            if not target or not source:
                continue
            source_pipeline = edge.get("source_pipeline")
            dep_label = f"{source_pipeline}.{source}" if source_pipeline else source

            if target not in node_dependencies:
                node_dependencies[target] = []
            node_dependencies[target].append(dep_label)

        # Enrich nodes with runtime execution data
        enriched_nodes = []
        for node in nodes:
            node_id = node["id"]
            runtime = node_lookup.get(node_id)
            is_external = node.get("type") == "external"

            enriched = {
                "id": node_id,
                "label": node.get("label", node_id),
                "type": node.get("type", "transform"),
                "status": runtime.status if runtime else ("external" if is_external else "unknown"),
                "duration": runtime.duration if runtime else 0,
                "rows_out": runtime.rows_out if runtime else None,
                "is_anomaly": runtime.is_anomaly if runtime else False,
                "is_slow": runtime.is_slow if runtime else False,
                "has_row_anomaly": runtime.has_row_anomaly if runtime else False,
                "error_message": runtime.error_message if runtime else None,
                "validation_count": len(runtime.validation_warnings) if runtime else 0,
                "is_external": is_external,
                "source_pipeline": node.get("source_pipeline"),
                "dependencies": node_dependencies.get(node_id, []),
            }
            enriched_nodes.append(enriched)

        return {
            "nodes": enriched_nodes,
            "edges": edges,
        }

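    # Shape of the returned structure (hypothetical two-node run), presumably consumed by
    # the HTML renderer's interactive DAG view:
    #
    #     {
    #         "nodes": [
    #             {"id": "load", "label": "load", "type": "transform", "status": "success",
    #              "duration": 1.2, "rows_out": 100, "is_anomaly": False, "is_slow": False,
    #              "has_row_anomaly": False, "error_message": None, "validation_count": 0,
    #              "is_external": False, "source_pipeline": None, "dependencies": []},
    #             ...
    #         ],
    #         "edges": [{"source": "load", "target": "clean"}],
    #     }
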
    def _compute_anomalies(self, node: NodeExecutionMetadata) -> None:
        """Compute anomaly flags for a node based on historical data.

        Anomaly rules:
        - is_slow: node duration is 3x or more than historical avg
        - has_row_anomaly: rows_out deviates ±50% from historical avg
        """
        anomaly_reasons = []

        # Check for slow execution (3x threshold)
        if node.historical_avg_duration and node.historical_avg_duration > 0:
            if node.duration >= node.historical_avg_duration * 3:
                node.is_slow = True
                ratio = node.duration / node.historical_avg_duration
                avg_dur = node.historical_avg_duration
                anomaly_reasons.append(
                    f"Slow: {node.duration:.2f}s vs avg {avg_dur:.2f}s ({ratio:.1f}x)"
                )

        # Check for row count anomaly (±50% threshold)
        if node.historical_avg_rows and node.historical_avg_rows > 0 and node.rows_out is not None:
            pct_change = abs(node.rows_out - node.historical_avg_rows) / node.historical_avg_rows
            if pct_change >= 0.5:
                node.has_row_anomaly = True
                direction = "+" if node.rows_out > node.historical_avg_rows else "-"
                avg_rows = node.historical_avg_rows
                pct_str = f"{pct_change * 100:.0f}"
                anomaly_reasons.append(
                    f"Rows: {node.rows_out:,} vs avg {avg_rows:,.0f} ({direction}{pct_str}%)"
                )

        if anomaly_reasons:
            node.is_anomaly = True
            node.anomaly_reasons = anomaly_reasons

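    # Worked example of the rules above (hypothetical numbers): with a historical average
    # of 10s, a 35s run is 3.5x and sets is_slow; with an average of 1,000 rows, 400 rows
    # out is a 60% deviation (>= 50%) and sets has_row_anomaly. A 25s run (2.5x) or 700
    # rows (30% deviation) would not trip either flag.
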
    def _convert_result_to_metadata(
        self, result: NodeResult, node_name: str
    ) -> NodeExecutionMetadata:
        """Convert NodeResult to NodeExecutionMetadata."""
        meta = result.metadata or {}

        # Extract Delta Info
        delta_info = None
        if "delta_info" in meta:
            d = meta["delta_info"]
            # Check if it's already an object or dict
            if isinstance(d, DeltaWriteInfo):
                delta_info = d
            else:
                # It might be a dict if coming from a loose dict
                pass

        node_meta = NodeExecutionMetadata(
            node_name=node_name,
            operation="transform",  # Generic default
            status="success" if result.success else "failed",
            duration=result.duration,
            rows_out=result.rows_processed,
            rows_written=result.rows_written,
            schema_out=result.result_schema,
            # From metadata dict
            rows_in=result.rows_read,  # Use rows_read from NodeResult
            sample_in=meta.get("sample_data_in"),
            executed_sql=meta.get("executed_sql", []),
            sql_hash=meta.get("sql_hash"),
            transformation_stack=meta.get("transformation_stack", []),
            config_snapshot=meta.get("config_snapshot"),
            delta_info=delta_info,
            data_diff=meta.get("data_diff"),
            environment=meta.get("environment"),
            source_files=meta.get("source_files", []),
            null_profile=meta.get("null_profile"),
            schema_in=meta.get("schema_in"),
            sample_data=meta.get("sample_data"),
            columns_added=meta.get("columns_added", []),
            columns_removed=meta.get("columns_removed", []),
            error_message=str(result.error) if result.error else None,
            error_type=type(result.error).__name__ if result.error else None,
            error_traceback=meta.get("error_traceback"),
            error_traceback_cleaned=meta.get("error_traceback_cleaned"),
            validation_warnings=meta.get("validation_warnings", []),
            execution_steps=meta.get("steps", []),
            failed_rows_samples=meta.get("failed_rows_samples", {}),
            failed_rows_counts=meta.get("failed_rows_counts", {}),
            failed_rows_truncated=meta.get("failed_rows_truncated", False),
            truncated_validations=meta.get("truncated_validations", []),
            retry_history=meta.get("retry_history", []),
        )

        # Calculate derived metrics
        node_meta.calculate_row_change()  # Needs rows_in
        # schema changes are already in metadata from Node logic

        return node_meta

    def _write_remote(self, path: str, content: str) -> None:
        """Write content to remote path using fsspec."""
        ctx = get_logging_context()
        try:
            import fsspec

            # Use provided storage options (credentials)
            with fsspec.open(path, "w", encoding="utf-8", **self.storage_options) as f:
                f.write(content)
            ctx.debug("Remote file written", path=path, size=len(content))
        except ImportError:
            # Fallback for environments without fsspec (e.g., minimal Spark)
            # Try dbutils if on Databricks
            try:
                from pyspark.dbutils import DBUtils
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()
                dbutils = DBUtils(spark)
                # dbutils.fs.put expects string
                dbutils.fs.put(path, content, True)
                ctx.debug("Remote file written via dbutils", path=path, size=len(content))
            except Exception as e:
                ctx.error(
                    "Failed to write remote story",
                    path=path,
                    error=str(e),
                )
                raise RuntimeError(
                    f"Could not write story to {path}. Install 'fsspec' or 'adlfs'."
                ) from e

    def _clean_config_for_dump(self, config: Any) -> Any:
        """Clean configuration for YAML dumping.

        Handles multiline strings to force block style.
        """
        if isinstance(config, dict):
            return {k: self._clean_config_for_dump(v) for k, v in config.items()}
        elif isinstance(config, list):
            return [self._clean_config_for_dump(v) for v in config]
        elif isinstance(config, str) and "\n" in config:
            # Use custom class to force block style
            # Strip trailing spaces from lines to allow block style
            cleaned = config.replace(" \n", "\n").strip()
            return MultilineString(cleaned)
        return config

    def _get_git_info(self) -> Dict[str, str]:
        """Get current git commit and branch."""
        try:
            # Run git commands silently
            commit = (
                subprocess.check_output(
                    ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
                )
                .decode("utf-8")
                .strip()
            )

            branch = (
                subprocess.check_output(
                    ["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.DEVNULL
                )
                .decode("utf-8")
                .strip()
            )

            return {"commit": commit, "branch": branch}
        except Exception:
            return {"commit": "unknown", "branch": "unknown"}

    def cleanup(self) -> None:
        """Remove old stories based on retention policy."""
        ctx = get_logging_context()

        if self.is_remote:
            self._cleanup_remote()
            return

        if self.output_path is None:
            return

        try:
            # 1. Clean new nested structure: {pipeline}/{date}/run_*.html
            pipeline_dir = self.output_path / self.pipeline_name
            if pipeline_dir.exists():
                # Find all files recursively
                stories = sorted(
                    pipeline_dir.glob("**/*.html"),
                    key=lambda p: str(p),  # Sort by path (date/time)
                    reverse=True,
                )
                json_stories = sorted(
                    pipeline_dir.glob("**/*.json"),
                    key=lambda p: str(p),
                    reverse=True,
                )

                self._apply_retention(stories, json_stories)

                # Clean empty date directories
                for date_dir in pipeline_dir.iterdir():
                    if date_dir.is_dir() and not any(date_dir.iterdir()):
                        try:
                            date_dir.rmdir()
                        except Exception:
                            pass

            # 2. Clean legacy flat structure: {pipeline}_*.html in root
            legacy_stories = sorted(
                self.output_path.glob(f"{self.pipeline_name}_*.html"),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            )
            # Only clean legacy if we have them
            if legacy_stories:
                # We don't want to count legacy + new against the same limit technically,
                # but for simplicity just clean legacy based on their own existence
                self._apply_retention(legacy_stories, [])

            ctx.debug(
                "Retention policy applied",
                pipeline=self.pipeline_name,
                retention_days=self.retention_days,
                retention_count=self.retention_count,
            )

        except Exception as e:
            ctx.warning("Story cleanup failed", error=str(e))

    def _apply_retention(self, stories: List[Path], json_stories: List[Path]) -> None:
        """Apply count and time retention policies."""
        from datetime import timedelta

        # 1. Count retention
        if self.retention_count is not None and len(stories) > self.retention_count:
            to_delete = stories[self.retention_count :]
            for path in to_delete:
                path.unlink(missing_ok=True)

        if self.retention_count is not None and len(json_stories) > self.retention_count:
            to_delete = json_stories[self.retention_count :]
            for path in to_delete:
                path.unlink(missing_ok=True)

        # 2. Time retention
        now = datetime.now()
        if self.retention_days is None:
            return
        cutoff = now - timedelta(days=self.retention_days)

        # Check remaining files
        # For nested files, we could parse the date from the folder name, but mtime is a safer fallback
        retention_count = self.retention_count or 100
        remaining = stories[:retention_count] + json_stories[:retention_count]

        for path in remaining:
            if path.exists():
                # Try to infer date from path first (faster/more accurate than mtime)
                # Path format: .../{date}/run_{time}.html
                try:
                    # Try to parse parent folder as date
                    file_date = datetime.strptime(path.parent.name, "%Y-%m-%d")
                    if file_date < cutoff.replace(hour=0, minute=0, second=0, microsecond=0):
                        path.unlink(missing_ok=True)
                        continue
                except ValueError:
                    pass

                # Fallback to mtime
                mtime = datetime.fromtimestamp(path.stat().st_mtime)
                if mtime < cutoff:
                    path.unlink(missing_ok=True)

    def _cleanup_remote(self) -> None:
        """Clean up old stories from remote storage using fsspec."""
        ctx = get_logging_context()

        try:
            import fsspec
            from datetime import timedelta

            # Build the pipeline stories path
            pipeline_path = f"{self.output_path_str.rstrip('/')}/{self.pipeline_name}"

            # Get filesystem from the path
            fs, path_prefix = fsspec.core.url_to_fs(pipeline_path, **self.storage_options)

            # Check if path exists
            if not fs.exists(path_prefix):
                ctx.debug("Remote story path does not exist yet", path=pipeline_path)
                return

            # List all files recursively
            all_files = []
            try:
                for root, dirs, files in fs.walk(path_prefix):
                    for f in files:
                        if f.endswith((".html", ".json")):
                            full_path = f"{root}/{f}" if root else f
                            all_files.append(full_path)
            except Exception as e:
                ctx.debug(f"Could not walk remote path: {e}")
                return

            if not all_files:
                return

            # Sort by path (which includes date folders) - newest first
            all_files.sort(reverse=True)

            # Separate html and json
            html_files = [f for f in all_files if f.endswith(".html")]
            json_files = [f for f in all_files if f.endswith(".json")]

            deleted_count = 0

            # Apply count retention
            if self.retention_count is not None:
                if len(html_files) > self.retention_count:
                    for f in html_files[self.retention_count :]:
                        try:
                            fs.rm(f)
                            deleted_count += 1
                        except Exception:
                            pass

                if len(json_files) > self.retention_count:
                    for f in json_files[self.retention_count :]:
                        try:
                            fs.rm(f)
                            deleted_count += 1
                        except Exception:
                            pass

            # Apply time retention
            if self.retention_days is not None:
                cutoff = datetime.now() - timedelta(days=self.retention_days)
                cutoff_str = cutoff.strftime("%Y-%m-%d")

                # Check remaining files
                retention_count = self.retention_count or 100
                remaining = html_files[:retention_count] + json_files[:retention_count]

                for f in remaining:
                    # Try to parse date from path (format: .../YYYY-MM-DD/run_*.html)
                    try:
                        parts = f.replace("\\", "/").split("/")
                        for part in parts:
                            if len(part) == 10 and part[4] == "-" and part[7] == "-":
                                if part < cutoff_str:
                                    try:
                                        fs.rm(f)
                                        deleted_count += 1
                                    except Exception:
                                        pass
                                break
                    except Exception:
                        pass

            # Clean empty date directories
            try:
                for item in fs.ls(path_prefix, detail=False):
                    if fs.isdir(item):
                        contents = fs.ls(item, detail=False)
                        if not contents:
                            fs.rmdir(item)
            except Exception:
                pass

            if deleted_count > 0:
                ctx.debug(
                    "Remote story cleanup completed",
                    deleted=deleted_count,
                    pipeline=self.pipeline_name,
                )

        except ImportError:
            ctx.debug("fsspec not available for remote cleanup")
        except Exception as e:
            ctx.warning(f"Remote story cleanup failed: {e}")

    def _generate_pipeline_index(self) -> None:
        """Generate an index.html with a table of recent runs (Phase 3)."""
        import json

        ctx = get_logging_context()

        if self.is_remote:
            ctx.debug("Pipeline index not yet supported for remote storage")
            return

        if self.output_path is None:
            return

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return

        # Find all JSON files
        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        if not json_files:
            return

        # Load metadata from each run
        runs = []
        for json_path in json_files[:50]:  # Limit to 50 most recent
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                html_path = json_path.with_suffix(".html")
                relative_html = html_path.relative_to(pipeline_dir)

                runs.append(
                    {
                        "run_id": data.get("run_id", "unknown"),
                        "started_at": data.get("started_at", ""),
                        "duration": data.get("duration", 0),
                        "total_nodes": data.get("total_nodes", 0),
                        "completed_nodes": data.get("completed_nodes", 0),
                        "failed_nodes": data.get("failed_nodes", 0),
                        "success_rate": data.get("success_rate", 0),
                        "html_path": str(relative_html).replace("\\", "/"),
                        "status": "failed" if data.get("failed_nodes", 0) > 0 else "success",
                    }
                )
            except Exception as e:
                ctx.debug(f"Failed to load run metadata: {json_path}, error: {e}")
                continue

        if not runs:
            return

        # Generate index HTML
        index_html = self._render_index_html(runs)
        index_path = pipeline_dir / "index.html"

        try:
            with open(index_path, "w", encoding="utf-8") as f:
                f.write(index_html)
            ctx.debug("Pipeline index generated", path=str(index_path), runs=len(runs))
        except Exception as e:
            ctx.warning(f"Failed to write pipeline index: {e}")

    def _render_index_html(self, runs: List[Dict[str, Any]]) -> str:
        """Render the pipeline history index HTML."""
        rows_html = ""
        for run in runs:
            status_class = "success" if run["status"] == "success" else "failed"
            status_icon = "✓" if run["status"] == "success" else "✗"
            rows_html += f"""
            <tr class="{status_class}">
                <td><a href="{run["html_path"]}">{run["run_id"]}</a></td>
                <td>{run["started_at"]}</td>
                <td>{run["duration"]:.2f}s</td>
                <td>{run["total_nodes"]}</td>
                <td class="status-cell {status_class}">{status_icon} {run["completed_nodes"]}/{run["total_nodes"]}</td>
                <td>{run["success_rate"]:.1f}%</td>
            </tr>
            """

        return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Pipeline History: {self.pipeline_name}</title>
    <style>
        :root {{
            --primary-color: #0066cc;
            --success-color: #28a745;
            --error-color: #dc3545;
        }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            background: #f4f7f9;
            margin: 0;
            padding: 20px;
        }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        h1 {{ color: var(--primary-color); margin-bottom: 20px; }}
        table {{
            width: 100%;
            background: #fff;
            border-collapse: collapse;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }}
        th, td {{ padding: 12px 16px; text-align: left; border-bottom: 1px solid #e1e4e8; }}
        th {{ background: #f8f9fa; font-weight: 600; }}
        tr:hover {{ background: #f8f9fa; }}
        a {{ color: var(--primary-color); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
        .status-cell.success {{ color: var(--success-color); font-weight: 600; }}
        .status-cell.failed {{ color: var(--error-color); font-weight: 600; }}
        tr.failed {{ background: #fff5f5; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>📊 Pipeline History: {self.pipeline_name}</h1>
        <p style="color: #666; margin-bottom: 20px;">Showing {len(runs)} most recent runs</p>
        <table>
            <thead>
                <tr>
                    <th>Run ID</th>
                    <th>Started</th>
                    <th>Duration</th>
                    <th>Nodes</th>
                    <th>Status</th>
                    <th>Success Rate</th>
                </tr>
            </thead>
            <tbody>
                {rows_html}
            </tbody>
        </table>
    </div>
</body>
</html>
"""

    # Legacy methods removed as they are now handled by renderers
    # _generate_node_section, _sample_to_markdown, _dataframe_to_markdown
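

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal end-to-end call with an empty run, assuming the renderers tolerate a run
# with zero nodes; real callers pass the NodeResult mapping produced by the pipeline
# runner. "demo_pipeline" and "stories/" are placeholders.
#
#     from datetime import datetime, timezone
#     from odibi.story.generator import StoryGenerator
#
#     gen = StoryGenerator(pipeline_name="demo_pipeline", output_path="stories/")
#     now = datetime.now(timezone.utc).isoformat()
#     html_path = gen.generate(
#         node_results={}, completed=[], failed=[], skipped=[],
#         duration=0.0, start_time=now, end_time=now,
#     )
#     print(html_path)              # stories/demo_pipeline/<date>/run_<time>.html
#     print(gen.get_alert_summary())  # alert payload for the run just written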