odibi-2.5.0-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in that registry.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/generator.py
ADDED
@@ -0,0 +1,1431 @@
"""Story generator for pipeline execution documentation."""

import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

from odibi.node import NodeResult
from odibi.story.metadata import DeltaWriteInfo, NodeExecutionMetadata, PipelineStoryMetadata
from odibi.story.renderers import HTMLStoryRenderer, JSONStoryRenderer
from odibi.utils.logging_context import get_logging_context


# Custom class to force block style for multiline strings
class MultilineString(str):
    """String subclass to force YAML block scalar style."""

    pass


def multiline_presenter(dumper, data):
    """YAML representer for MultilineString."""
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")


yaml.add_representer(MultilineString, multiline_presenter)


class StoryGenerator:
    """Generates markdown documentation of pipeline execution."""

    def __init__(
        self,
        pipeline_name: str,
        max_sample_rows: int = 10,
        output_path: str = "stories/",
        retention_days: int = 30,
        retention_count: int = 100,
        storage_options: Optional[Dict[str, Any]] = None,
        catalog_manager: Optional[Any] = None,
    ):
        """Initialize story generator.

        Args:
            pipeline_name: Name of the pipeline
            max_sample_rows: Maximum rows to show in samples
            output_path: Directory for story output
            retention_days: Days to keep stories
            retention_count: Max number of stories to keep
            storage_options: Credentials for remote storage (e.g. ADLS)
            catalog_manager: System Catalog Manager for historical context
        """
        self.pipeline_name = pipeline_name
        self.max_sample_rows = max_sample_rows
        self.output_path_str = output_path  # Store original string
        self.is_remote = "://" in output_path
        self.storage_options = storage_options or {}
        self.catalog_manager = catalog_manager

        # Track last generated story for alert enrichment
        self._last_story_path: Optional[str] = None
        self._last_metadata: Optional[PipelineStoryMetadata] = None

        if not self.is_remote:
            self.output_path = Path(output_path)
            self.output_path.mkdir(parents=True, exist_ok=True)
        else:
            self.output_path = None  # Handle remote paths differently

        self.retention_days = retention_days
        self.retention_count = retention_count

        ctx = get_logging_context()
        ctx.debug(
            "StoryGenerator initialized",
            pipeline=pipeline_name,
            output_path=output_path,
            is_remote=self.is_remote,
            retention_days=retention_days,
            retention_count=retention_count,
        )

    def generate(
        self,
        node_results: Dict[str, NodeResult],
        completed: List[str],
        failed: List[str],
        skipped: List[str],
        duration: float,
        start_time: str,
        end_time: str,
        context: Any = None,
        config: Optional[Dict[str, Any]] = None,
        graph_data: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Generate story HTML and JSON.

        Args:
            node_results: Dictionary of node name -> NodeResult
            completed: List of completed node names
            failed: List of failed node names
            skipped: List of skipped node names
            duration: Total pipeline duration
            start_time: ISO timestamp of start
            end_time: ISO timestamp of end
            context: Optional context to access intermediate DataFrames
            config: Optional pipeline configuration snapshot
            graph_data: Optional graph data dict with nodes/edges for DAG visualization

        Returns:
            Path to generated HTML story file
        """
        ctx = get_logging_context()
        ctx.debug(
            "Generating story",
            pipeline=self.pipeline_name,
            node_count=len(node_results),
            completed=len(completed),
            failed=len(failed),
            skipped=len(skipped),
        )

        # 1. Build metadata object
        metadata = PipelineStoryMetadata(
            pipeline_name=self.pipeline_name,
            pipeline_layer=config.get("layer") if config else None,
            started_at=start_time,
            completed_at=end_time,
            duration=duration,
            total_nodes=len(completed) + len(failed) + len(skipped),
            completed_nodes=len(completed),
            failed_nodes=len(failed),
            skipped_nodes=len(skipped),
            project=config.get("project") if config else None,
            plant=config.get("plant") if config else None,
            asset=config.get("asset") if config else None,
            business_unit=config.get("business_unit") if config else None,
        )

        # Add Git Info
        # git_info = self._get_git_info()
        # We can't easily add arbitrary fields to dataclass without changing it,
        # but we can rely on the fact that it's just metadata.
        # For now, let's skip adding git info to the core model or extend it later.

        # Process all nodes in order
        all_nodes = completed + failed + skipped

        # If we have config, try to follow config order instead of list order
        if config and "nodes" in config:
            config_order = [n["name"] for n in config["nodes"]]
            # Sort all_nodes based on index in config_order
            all_nodes.sort(key=lambda x: config_order.index(x) if x in config_order else 999)

        for node_name in all_nodes:
            if node_name in node_results:
                result = node_results[node_name]
                node_meta = self._convert_result_to_metadata(result, node_name)

                # Status overrides (result object has success bool, but we have lists)
                if node_name in failed:
                    node_meta.status = "failed"
                elif node_name in skipped:
                    node_meta.status = "skipped"
                else:
                    node_meta.status = "success"

                metadata.nodes.append(node_meta)
            else:
                # Skipped node without result
                metadata.nodes.append(
                    NodeExecutionMetadata(
                        node_name=node_name, operation="skipped", status="skipped", duration=0.0
                    )
                )

            # Enrich with Historical Context (if available)
            current_node = metadata.nodes[-1]
            if self.catalog_manager:
                try:
                    avg_rows = self.catalog_manager.get_average_volume(node_name)
                    avg_duration = self.catalog_manager.get_average_duration(node_name)

                    current_node.historical_avg_rows = avg_rows
                    current_node.historical_avg_duration = avg_duration

                    # Compute anomalies (Phase 1 - Triage)
                    self._compute_anomalies(current_node)
                except Exception as e:
                    ctx = get_logging_context()
                    ctx.debug(
                        "Failed to fetch historical metrics for node",
                        node_name=node_name,
                        error=str(e),
                    )

        # 2. Build graph data for interactive DAG (Phase 2)
        metadata.graph_data = self._build_graph_data(metadata, graph_data, config)

        # 3. Compare with last successful run (Phase 3)
        self._compare_with_last_success(metadata)

        # 4. Add git info (Phase 3)
        metadata.git_info = self._get_git_info()

        # 5. Render outputs
        timestamp_obj = datetime.now()
        date_str = timestamp_obj.strftime("%Y-%m-%d")
        time_str = timestamp_obj.strftime("%H-%M-%S")

        # Create structured path: {pipeline_name}/{date}/
        relative_folder = f"{self.pipeline_name}/{date_str}"

        if self.is_remote:
            base_path = f"{self.output_path_str.rstrip('/')}/{relative_folder}"
        else:
            base_path = self.output_path / relative_folder
            base_path.mkdir(parents=True, exist_ok=True)

        base_filename = f"run_{time_str}"

        # Prepare renderers
        html_renderer = HTMLStoryRenderer()
        json_renderer = JSONStoryRenderer()

        # Paths
        if self.is_remote:
            html_path = f"{base_path}/{base_filename}.html"
            json_path = f"{base_path}/{base_filename}.json"
        else:
            html_path = str(base_path / f"{base_filename}.html")
            json_path = str(base_path / f"{base_filename}.json")

        # Render HTML
        html_content = html_renderer.render(metadata)

        # Render JSON
        json_content = json_renderer.render(metadata)

        # Write files
        try:
            if self.is_remote:
                self._write_remote(html_path, html_content)
                self._write_remote(json_path, json_content)
            else:
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(html_content)
                with open(json_path, "w", encoding="utf-8") as f:
                    f.write(json_content)

            ctx.debug(
                "Story files written",
                html_path=html_path,
                html_size=len(html_content),
                json_path=json_path,
                json_size=len(json_content),
            )
        except Exception as e:
            ctx.error(
                "Failed to write story files",
                error=str(e),
                html_path=html_path,
                json_path=json_path,
            )
            raise

        # Store for alert enrichment
        self._last_story_path = html_path
        self._last_metadata = metadata

        # Cleanup and generate index
        self.cleanup()
        self._generate_pipeline_index()

        ctx.info(
            "Story generated",
            path=html_path,
            nodes=len(metadata.nodes),
            success_rate=metadata.get_success_rate(),
        )

        return html_path

    def get_alert_summary(self) -> Dict[str, Any]:
        """Get a summary of the last generated story for alerts.

        Returns:
            Dictionary with metrics suitable for alert payloads
        """
        if not self._last_metadata:
            return {}

        summary = self._last_metadata.get_alert_summary()
        summary["story_path"] = self._last_story_path
        return summary

    def _get_duration_history(self, node_name: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Get duration history for a node across recent runs.

        Args:
            node_name: The node name to get history for
            limit: Maximum number of runs to include

        Returns:
            List of {"run_id": "...", "duration": 1.5, "started_at": "..."} dicts
        """
        import json

        ctx = get_logging_context()

        if self.is_remote:
            ctx.debug("Duration history not yet supported for remote storage")
            return []

        if self.output_path is None:
            return []

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return []

        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        history = []
        for json_path in json_files[: limit + 1]:  # +1 to skip current run if it exists
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                for node_data in data.get("nodes", []):
                    if node_data.get("node_name") == node_name:
                        history.append(
                            {
                                "run_id": data.get("run_id", "unknown"),
                                "duration": node_data.get("duration", 0),
                                "started_at": data.get("started_at", ""),
                            }
                        )
                        break
            except Exception as e:
                ctx.debug(f"Failed to load run for duration history: {json_path}, error: {e}")
                continue

        return history[:limit]

    def _find_last_successful_run(self) -> Optional[Dict[str, Any]]:
        """Find the most recent successful run's JSON data.

        Returns:
            Dictionary of the last successful run metadata, or None
        """
        import json

        ctx = get_logging_context()

        if self.is_remote:
            return self._find_last_successful_run_remote()

        if self.output_path is None:
            return None

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return None

        # Find all JSON files, sorted by path (date/time order) descending
        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        # Find the most recent successful run
        for json_path in json_files:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Check if this run was successful (no failed nodes)
                if data.get("failed_nodes", 0) == 0:
                    ctx.debug(
                        "Found last successful run",
                        path=str(json_path),
                        run_id=data.get("run_id"),
                    )
                    return data
            except Exception as e:
                ctx.debug(f"Failed to load story JSON: {json_path}, error: {e}")
                continue

        return None

    def _find_last_successful_run_remote(self) -> Optional[Dict[str, Any]]:
        """Find the most recent successful run's JSON data from remote storage.

        Uses fsspec to list and read JSON files from Azure Blob, ADLS, S3, etc.

        Returns:
            Dictionary of the last successful run metadata, or None
        """
        import json

        ctx = get_logging_context()

        try:
            import fsspec
        except ImportError:
            ctx.debug("fsspec not available, skipping remote comparison")
            return None

        pipeline_path = f"{self.output_path_str.rstrip('/')}/{self.pipeline_name}"

        try:
            fs = fsspec.filesystem(pipeline_path.split("://")[0], **self.storage_options)

            # List all JSON files recursively under pipeline directory
            # fsspec glob pattern for recursive search
            glob_pattern = f"{pipeline_path.split('://', 1)[1]}/**/*.json"
            json_files = fs.glob(glob_pattern)

            if not json_files:
                ctx.debug("No previous story JSON files found", path=pipeline_path)
                return None

            # Sort by path descending (date/time order due to folder structure)
            json_files = sorted(json_files, reverse=True)

            ctx.debug(
                "Found story JSON files for comparison",
                count=len(json_files),
                path=pipeline_path,
            )

            # Find the most recent successful run
            protocol = pipeline_path.split("://")[0]
            for json_path in json_files:
                full_path = f"{protocol}://{json_path}"
                try:
                    with fsspec.open(full_path, "r", encoding="utf-8", **self.storage_options) as f:
                        data = json.load(f)

                    # Check if this run was successful (no failed nodes)
                    if data.get("failed_nodes", 0) == 0:
                        ctx.debug(
                            "Found last successful run (remote)",
                            path=full_path,
                            run_id=data.get("run_id"),
                        )
                        return data
                except Exception as e:
                    ctx.debug(f"Failed to load remote story JSON: {full_path}, error: {e}")
                    continue

        except Exception as e:
            ctx.warning(
                "Failed to search remote storage for previous runs",
                error=str(e),
                path=pipeline_path,
            )

        return None

    def _compare_with_last_success(self, metadata: PipelineStoryMetadata) -> None:
        """Compare current run with last successful run and populate change_summary."""
        ctx = get_logging_context()

        # Collect duration history for all nodes (before comparison)
        for node in metadata.nodes:
            history = self._get_duration_history(node.node_name, limit=10)
            if history:
                node.duration_history = history

        last_success = self._find_last_successful_run()
        if not last_success:
            ctx.debug("No previous successful run found for comparison")
            return

        metadata.compared_to_run_id = last_success.get("run_id")

        # Build lookup for previous run's nodes
        prev_nodes = {n["node_name"]: n for n in last_success.get("nodes", [])}

        # Track changes
        sql_changed = []
        schema_changed = []
        rows_changed = []
        newly_failing = []
        duration_changed = []

        for node in metadata.nodes:
            prev = prev_nodes.get(node.node_name)
            if not prev:
                # New node, not in previous run
                continue

            changes = []

            # Compare SQL hash
            if node.sql_hash and prev.get("sql_hash"):
                if node.sql_hash != prev["sql_hash"]:
                    changes.append("sql")
                    sql_changed.append(node.node_name)
                    node.previous_sql_hash = prev["sql_hash"]

            # Compare schema (output)
            curr_schema = set(node.schema_out or [])
            prev_schema = set(prev.get("schema_out") or [])
            if curr_schema != prev_schema:
                changes.append("schema")
                schema_changed.append(node.node_name)

            # Compare row counts (significant change = >20%)
            if node.rows_out is not None and prev.get("rows_out") is not None:
                prev_rows = prev["rows_out"]
                if prev_rows > 0:
                    pct_change = abs(node.rows_out - prev_rows) / prev_rows
                    if pct_change > 0.2:
                        changes.append("rows")
                        rows_changed.append(node.node_name)
                        node.previous_rows_out = prev_rows

            # Compare duration (significant change = 2x slower)
            if node.duration and prev.get("duration"):
                prev_dur = prev["duration"]
                if prev_dur > 0 and node.duration >= prev_dur * 2:
                    changes.append("duration")
                    duration_changed.append(node.node_name)
                    node.previous_duration = prev_dur

            # Check if newly failing
            if node.status == "failed" and prev.get("status") == "success":
                newly_failing.append(node.node_name)

            # Capture previous config snapshot for diff viewer
            if prev.get("config_snapshot"):
                node.previous_config_snapshot = prev["config_snapshot"]

            if changes:
                node.changed_from_last_success = True
                node.changes_detected = changes

        # Build summary
        metadata.change_summary = {
            "has_changes": bool(sql_changed or schema_changed or rows_changed or newly_failing),
            "sql_changed_count": len(sql_changed),
            "sql_changed_nodes": sql_changed,
            "schema_changed_count": len(schema_changed),
            "schema_changed_nodes": schema_changed,
            "rows_changed_count": len(rows_changed),
            "rows_changed_nodes": rows_changed,
            "duration_changed_count": len(duration_changed),
            "duration_changed_nodes": duration_changed,
            "newly_failing_count": len(newly_failing),
            "newly_failing_nodes": newly_failing,
            "compared_to_run_id": metadata.compared_to_run_id,
        }

        ctx.debug(
            "Cross-run comparison complete",
            compared_to=metadata.compared_to_run_id,
            sql_changed=len(sql_changed),
            schema_changed=len(schema_changed),
            newly_failing=len(newly_failing),
        )

    def _infer_layer_from_path(self, path: str) -> str:
        """Infer the data layer from a path string.

        Uses common naming patterns to identify bronze/silver/gold/raw layers.
        """
        path_lower = path.lower()
        if "bronze" in path_lower:
            return "bronze"
        elif "silver" in path_lower:
            return "silver"
        elif "gold" in path_lower:
            return "gold"
        elif "raw" in path_lower:
            return "raw"
        elif "staging" in path_lower:
            return "staging"
        elif "semantic" in path_lower:
            return "semantic"
        return "source"

    def _build_graph_data(
        self,
        metadata: PipelineStoryMetadata,
        graph_data: Optional[Dict[str, Any]],
        config: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Build enriched graph data for interactive DAG visualization.

        Combines static graph structure with runtime execution metadata.
        """
        ctx = get_logging_context()

        # Build node lookup for runtime data
        node_lookup = {n.node_name: n for n in metadata.nodes}

        # Debug: Log which path we're taking
        path_taken = (
            "graph_data"
            if graph_data
            else ("config" if config and "nodes" in config else "fallback")
        )
        ctx.debug(
            "Building graph data",
            path=path_taken,
            has_graph_data=bool(graph_data),
            has_config=bool(config),
            config_has_nodes=bool(config and "nodes" in config),
            metadata_node_count=len(metadata.nodes),
        )

        # Start with provided graph_data or build from config
        if graph_data:
            nodes = graph_data.get("nodes", [])
            edges = graph_data.get("edges", [])
        elif config and "nodes" in config:
            nodes = []
            edges = []
            source_nodes = set()  # Track source tables for lineage
            target_nodes = set()  # Track target tables for lineage

            for node_cfg in config["nodes"]:
                node_name = node_cfg["name"]
                nodes.append(
                    {
                        "id": node_name,
                        "label": node_name,
                        "type": node_cfg.get("type", "transform"),
                        "layer": metadata.pipeline_layer or "unknown",
                    }
                )
                # Check depends_on for intra-pipeline dependencies
                for dep in node_cfg.get("depends_on", []):
                    edges.append({"source": dep, "target": node_name})

                # Check inputs block for cross-pipeline dependencies
                inputs = node_cfg.get("inputs", {})
                if inputs:
                    for input_name, input_val in inputs.items():
                        if isinstance(input_val, str) and input_val.startswith("$"):
                            ref = input_val[1:]
                            if "." in ref:
                                pipeline_name, node_ref = ref.split(".", 1)
                                edges.append(
                                    {
                                        "source": node_ref,
                                        "target": node_name,
                                        "source_pipeline": pipeline_name,
                                    }
                                )
                            else:
                                edges.append({"source": ref, "target": node_name})

                # Add read path as source for lineage
                read_cfg = node_cfg.get("read", {})
                if read_cfg:
                    read_path = read_cfg.get("path") or read_cfg.get("table")
                    if read_path:
                        source_nodes.add(read_path)
                        edges.append({"from": read_path, "to": node_name})

                # Add write path as target for lineage
                write_cfg = node_cfg.get("write", {})
                if write_cfg:
                    write_path = write_cfg.get("path") or write_cfg.get("table")
                    if write_path:
                        target_nodes.add(write_path)
                        edges.append({"from": node_name, "to": write_path})

            # Add source table nodes (inputs)
            for source in source_nodes:
                if not any(n["id"] == source for n in nodes):
                    nodes.append(
                        {
                            "id": source,
                            "label": source,
                            "type": "source",
                            "layer": self._infer_layer_from_path(source),
                        }
                    )

            # Add target table nodes (outputs)
            for target in target_nodes:
                if not any(n["id"] == target for n in nodes):
                    nodes.append(
                        {
                            "id": target,
                            "label": target,
                            "type": "table",
                            "layer": metadata.pipeline_layer or "unknown",
                        }
                    )
        else:
            # Fallback: build from metadata nodes
            nodes = [
                {
                    "id": n.node_name,
                    "label": n.node_name,
                    "layer": metadata.pipeline_layer or "unknown",
                }
                for n in metadata.nodes
            ]
            edges = []
            source_nodes = set()
            target_nodes = set()

            for n in metadata.nodes:
                # Debug: Log config_snapshot contents for each node
                ctx.debug(
                    "Fallback path: checking node config_snapshot",
                    node_name=n.node_name,
                    has_config_snapshot=bool(n.config_snapshot),
                    config_snapshot_keys=(
                        list(n.config_snapshot.keys()) if n.config_snapshot else []
                    ),
                    has_inputs=bool(n.config_snapshot and n.config_snapshot.get("inputs")),
                    inputs_value=n.config_snapshot.get("inputs") if n.config_snapshot else None,
                    has_depends_on=bool(n.config_snapshot and n.config_snapshot.get("depends_on")),
                )

                # Check depends_on for intra-pipeline dependencies
                if n.config_snapshot and n.config_snapshot.get("depends_on"):
                    for dep in n.config_snapshot["depends_on"]:
                        edges.append({"source": dep, "target": n.node_name})

                # Check inputs block for cross-pipeline dependencies
                if n.config_snapshot and n.config_snapshot.get("inputs"):
                    for input_name, input_val in n.config_snapshot["inputs"].items():
                        ctx.debug(
                            "Processing input reference",
                            node_name=n.node_name,
                            input_name=input_name,
                            input_val=input_val,
                            is_string=isinstance(input_val, str),
                            starts_with_dollar=isinstance(input_val, str)
                            and input_val.startswith("$"),
                        )
                        # Handle $pipeline.node reference format
                        if isinstance(input_val, str) and input_val.startswith("$"):
                            # Format: $pipeline_name.node_name
                            ref = input_val[1:]  # Remove $
                            if "." in ref:
                                pipeline_name, node_ref = ref.split(".", 1)
                                edges.append(
                                    {
                                        "source": node_ref,
                                        "target": n.node_name,
                                        "source_pipeline": pipeline_name,
                                    }
                                )
                                ctx.debug(
                                    "Added cross-pipeline edge",
                                    source=node_ref,
                                    target=n.node_name,
                                    source_pipeline=pipeline_name,
                                )
                            else:
                                edges.append({"source": ref, "target": n.node_name})
                                ctx.debug(
                                    "Added same-pipeline edge from inputs",
                                    source=ref,
                                    target=n.node_name,
                                )

                # Add read/write paths for lineage from config_snapshot
                if n.config_snapshot:
                    read_cfg = n.config_snapshot.get("read", {})
                    if read_cfg:
                        read_path = read_cfg.get("path") or read_cfg.get("table")
                        if read_path:
                            source_nodes.add(read_path)
                            edges.append({"from": read_path, "to": n.node_name})

                    write_cfg = n.config_snapshot.get("write", {})
                    if write_cfg:
                        write_path = write_cfg.get("path") or write_cfg.get("table")
                        if write_path:
                            target_nodes.add(write_path)
                            edges.append({"from": n.node_name, "to": write_path})

            # Add source table nodes
            for source in source_nodes:
                if not any(n["id"] == source for n in nodes):
                    nodes.append(
                        {
                            "id": source,
                            "label": source,
                            "type": "source",
                            "layer": self._infer_layer_from_path(source),
                        }
                    )

            # Add target table nodes
            for target in target_nodes:
                if not any(n["id"] == target for n in nodes):
                    nodes.append(
                        {
                            "id": target,
                            "label": target,
                            "type": "table",
                            "layer": metadata.pipeline_layer or "unknown",
                        }
                    )

        # Collect all node IDs that exist in the current pipeline
        existing_node_ids = {node["id"] for node in nodes}

        # Find cross-pipeline dependencies (edge sources that don't exist as nodes)
        # Build a map of node_ref -> pipeline_name for labeling
        external_node_pipelines = {}
        cross_pipeline_deps = set()
        for edge in edges:
            # Support both "source"/"target" and "from"/"to" formats
            edge_source = edge.get("source") or edge.get("from", "")
            if edge_source and edge_source not in existing_node_ids:
                cross_pipeline_deps.add(edge_source)
                # Track the pipeline name if available
                if "source_pipeline" in edge:
                    external_node_pipelines[edge_source] = edge["source_pipeline"]

        # Debug: Log summary before adding external nodes
        ctx.debug(
            "Graph data summary",
            total_nodes=len(nodes),
            total_edges=len(edges),
            existing_node_ids=list(existing_node_ids),
            edge_sources=[e.get("source") or e.get("from", "") for e in edges],
            cross_pipeline_deps=list(cross_pipeline_deps),
        )

        # Add placeholder nodes for cross-pipeline dependencies
        for dep_id in cross_pipeline_deps:
            pipeline_name = external_node_pipelines.get(dep_id)
            label = f"{pipeline_name}.{dep_id}" if pipeline_name else dep_id
            ctx.debug(
                "Adding external node for cross-pipeline dependency",
                dep_id=dep_id,
                pipeline_name=pipeline_name,
                label=label,
            )
            nodes.append(
                {
                    "id": dep_id,
                    "label": label,
                    "type": "external",
                    "source_pipeline": pipeline_name,
                }
            )

        # Build dependency lookup: node_id -> list of source nodes (with pipeline info)
        node_dependencies = {}
        for edge in edges:
            # Support both "source"/"target" and "from"/"to" formats
            target = edge.get("target") or edge.get("to", "")
            source = edge.get("source") or edge.get("from", "")
            if not target or not source:
                continue
            source_pipeline = edge.get("source_pipeline")
            dep_label = f"{source_pipeline}.{source}" if source_pipeline else source

            if target not in node_dependencies:
                node_dependencies[target] = []
            node_dependencies[target].append(dep_label)

        # Enrich nodes with runtime execution data
        enriched_nodes = []
        for node in nodes:
            node_id = node["id"]
            runtime = node_lookup.get(node_id)
            is_external = node.get("type") == "external"

            enriched = {
                "id": node_id,
                "label": node.get("label", node_id),
                "type": node.get("type", "transform"),
                "status": runtime.status if runtime else ("external" if is_external else "unknown"),
                "duration": runtime.duration if runtime else 0,
                "rows_out": runtime.rows_out if runtime else None,
                "is_anomaly": runtime.is_anomaly if runtime else False,
                "is_slow": runtime.is_slow if runtime else False,
                "has_row_anomaly": runtime.has_row_anomaly if runtime else False,
                "error_message": runtime.error_message if runtime else None,
                "validation_count": len(runtime.validation_warnings) if runtime else 0,
                "is_external": is_external,
                "source_pipeline": node.get("source_pipeline"),
                "dependencies": node_dependencies.get(node_id, []),
            }
            enriched_nodes.append(enriched)

        return {
            "nodes": enriched_nodes,
            "edges": edges,
        }

    def _compute_anomalies(self, node: NodeExecutionMetadata) -> None:
        """Compute anomaly flags for a node based on historical data.

        Anomaly rules:
        - is_slow: node duration is 3x or more than historical avg
        - has_row_anomaly: rows_out deviates ±50% from historical avg
        """
        anomaly_reasons = []

        # Check for slow execution (3x threshold)
        if node.historical_avg_duration and node.historical_avg_duration > 0:
            if node.duration >= node.historical_avg_duration * 3:
                node.is_slow = True
                ratio = node.duration / node.historical_avg_duration
                avg_dur = node.historical_avg_duration
                anomaly_reasons.append(
                    f"Slow: {node.duration:.2f}s vs avg {avg_dur:.2f}s ({ratio:.1f}x)"
                )

        # Check for row count anomaly (±50% threshold)
        if node.historical_avg_rows and node.historical_avg_rows > 0 and node.rows_out is not None:
            pct_change = abs(node.rows_out - node.historical_avg_rows) / node.historical_avg_rows
            if pct_change >= 0.5:
                node.has_row_anomaly = True
                direction = "+" if node.rows_out > node.historical_avg_rows else "-"
                avg_rows = node.historical_avg_rows
                pct_str = f"{pct_change * 100:.0f}"
                anomaly_reasons.append(
                    f"Rows: {node.rows_out:,} vs avg {avg_rows:,.0f} ({direction}{pct_str}%)"
                )

        if anomaly_reasons:
            node.is_anomaly = True
            node.anomaly_reasons = anomaly_reasons

    def _convert_result_to_metadata(
        self, result: NodeResult, node_name: str
    ) -> NodeExecutionMetadata:
        """Convert NodeResult to NodeExecutionMetadata."""
        meta = result.metadata or {}

        # Extract Delta Info
        delta_info = None
        if "delta_info" in meta:
            d = meta["delta_info"]
            # Check if it's already an object or dict
            if isinstance(d, DeltaWriteInfo):
                delta_info = d
            else:
                # It might be a dict if coming from loose dict
                pass

        node_meta = NodeExecutionMetadata(
            node_name=node_name,
            operation="transform",  # Generic default
            status="success" if result.success else "failed",
            duration=result.duration,
            rows_out=result.rows_processed,
            rows_written=result.rows_written,
            schema_out=result.result_schema,
            # From metadata dict
            rows_in=result.rows_read,  # Use rows_read from NodeResult
            sample_in=meta.get("sample_data_in"),
            executed_sql=meta.get("executed_sql", []),
            sql_hash=meta.get("sql_hash"),
            transformation_stack=meta.get("transformation_stack", []),
            config_snapshot=meta.get("config_snapshot"),
            delta_info=delta_info,
            data_diff=meta.get("data_diff"),
            environment=meta.get("environment"),
            source_files=meta.get("source_files", []),
            null_profile=meta.get("null_profile"),
            schema_in=meta.get("schema_in"),
            sample_data=meta.get("sample_data"),
            columns_added=meta.get("columns_added", []),
            columns_removed=meta.get("columns_removed", []),
            error_message=str(result.error) if result.error else None,
            error_type=type(result.error).__name__ if result.error else None,
            error_traceback=meta.get("error_traceback"),
            error_traceback_cleaned=meta.get("error_traceback_cleaned"),
            validation_warnings=meta.get("validation_warnings", []),
            execution_steps=meta.get("steps", []),
            failed_rows_samples=meta.get("failed_rows_samples", {}),
            failed_rows_counts=meta.get("failed_rows_counts", {}),
            failed_rows_truncated=meta.get("failed_rows_truncated", False),
            truncated_validations=meta.get("truncated_validations", []),
            retry_history=meta.get("retry_history", []),
        )

        # Calculate derived metrics
        node_meta.calculate_row_change()  # Needs rows_in
        # schema changes are already in metadata from Node logic

        return node_meta

    def _write_remote(self, path: str, content: str) -> None:
        """Write content to remote path using fsspec."""
        ctx = get_logging_context()
        try:
            import fsspec

            # Use provided storage options (credentials)
            with fsspec.open(path, "w", encoding="utf-8", **self.storage_options) as f:
                f.write(content)
            ctx.debug("Remote file written", path=path, size=len(content))
        except ImportError:
            # Fallback for environments without fsspec (e.g., minimal Spark)
            # Try dbutils if on Databricks
            try:
                from pyspark.dbutils import DBUtils
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()
                dbutils = DBUtils(spark)
                # dbutils.fs.put expects string
                dbutils.fs.put(path, content, True)
                ctx.debug("Remote file written via dbutils", path=path, size=len(content))
            except Exception as e:
                ctx.error(
                    "Failed to write remote story",
                    path=path,
                    error=str(e),
                )
                raise RuntimeError(
                    f"Could not write story to {path}. Install 'fsspec' or 'adlfs'."
                ) from e

    def _clean_config_for_dump(self, config: Any) -> Any:
        """Clean configuration for YAML dumping.

        Handles multiline strings to force block style.
        """
        if isinstance(config, dict):
            return {k: self._clean_config_for_dump(v) for k, v in config.items()}
        elif isinstance(config, list):
            return [self._clean_config_for_dump(v) for v in config]
        elif isinstance(config, str) and "\n" in config:
            # Use custom class to force block style
            # Strip trailing spaces from lines to allow block style
            cleaned = config.replace(" \n", "\n").strip()
            return MultilineString(cleaned)
        return config

    def _get_git_info(self) -> Dict[str, str]:
        """Get current git commit and branch."""
        try:
            # Run git commands silently
            commit = (
                subprocess.check_output(
                    ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
                )
                .decode("utf-8")
                .strip()
            )

            branch = (
                subprocess.check_output(
                    ["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.DEVNULL
                )
                .decode("utf-8")
                .strip()
            )

            return {"commit": commit, "branch": branch}
        except Exception:
            return {"commit": "unknown", "branch": "unknown"}

    def cleanup(self) -> None:
        """Remove old stories based on retention policy."""
        ctx = get_logging_context()

        if self.is_remote:
            self._cleanup_remote()
            return

        if self.output_path is None:
            return

        try:
            # 1. Clean new nested structure: {pipeline}/{date}/run_*.html
            pipeline_dir = self.output_path / self.pipeline_name
            if pipeline_dir.exists():
                # Find all files recursively
                stories = sorted(
                    pipeline_dir.glob("**/*.html"),
                    key=lambda p: str(p),  # Sort by path (date/time)
                    reverse=True,
                )
                json_stories = sorted(
                    pipeline_dir.glob("**/*.json"),
                    key=lambda p: str(p),
                    reverse=True,
                )

                self._apply_retention(stories, json_stories)

                # Clean empty date directories
                for date_dir in pipeline_dir.iterdir():
                    if date_dir.is_dir() and not any(date_dir.iterdir()):
                        try:
                            date_dir.rmdir()
                        except Exception:
                            pass

            # 2. Clean legacy flat structure: {pipeline}_*.html in root
            legacy_stories = sorted(
                self.output_path.glob(f"{self.pipeline_name}_*.html"),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            )
            # Only clean legacy if we have them
            if legacy_stories:
                # We don't want to count legacy + new against the same limit technically,
                # but for simplicity let's just clean legacy based on their own existence
                self._apply_retention(legacy_stories, [])

            ctx.debug(
                "Retention policy applied",
                pipeline=self.pipeline_name,
                retention_days=self.retention_days,
                retention_count=self.retention_count,
            )

        except Exception as e:
            ctx.warning("Story cleanup failed", error=str(e))

    def _apply_retention(self, stories: List[Path], json_stories: List[Path]) -> None:
        """Apply count and time retention policies."""
        from datetime import timedelta

        # 1. Count retention
        if self.retention_count is not None and len(stories) > self.retention_count:
            to_delete = stories[self.retention_count :]
            for path in to_delete:
                path.unlink(missing_ok=True)

        if self.retention_count is not None and len(json_stories) > self.retention_count:
            to_delete = json_stories[self.retention_count :]
            for path in to_delete:
                path.unlink(missing_ok=True)

        # 2. Time retention
        now = datetime.now()
        if self.retention_days is None:
            return
        cutoff = now - timedelta(days=self.retention_days)

        # Check remaining files
        # For nested files, we could parse date from folder name, but mtime is safer fallback
        retention_count = self.retention_count or 100
        remaining = stories[:retention_count] + json_stories[:retention_count]

        for path in remaining:
            if path.exists():
                # Try to infer date from path first (faster/more accurate than mtime)
                # Path format: .../{date}/run_{time}.html
                try:
                    # Try to parse parent folder as date
                    file_date = datetime.strptime(path.parent.name, "%Y-%m-%d")
                    if file_date < cutoff.replace(hour=0, minute=0, second=0, microsecond=0):
                        path.unlink(missing_ok=True)
                        continue
                except ValueError:
                    pass

                # Fallback to mtime
                mtime = datetime.fromtimestamp(path.stat().st_mtime)
                if mtime < cutoff:
                    path.unlink(missing_ok=True)

    def _cleanup_remote(self) -> None:
        """Clean up old stories from remote storage using fsspec."""
        ctx = get_logging_context()

        try:
            import fsspec
            from datetime import timedelta

            # Build the pipeline stories path
            pipeline_path = f"{self.output_path_str.rstrip('/')}/{self.pipeline_name}"

            # Get filesystem from the path
            fs, path_prefix = fsspec.core.url_to_fs(pipeline_path, **self.storage_options)

            # Check if path exists
            if not fs.exists(path_prefix):
                ctx.debug("Remote story path does not exist yet", path=pipeline_path)
                return

            # List all files recursively
            all_files = []
            try:
                for root, dirs, files in fs.walk(path_prefix):
                    for f in files:
                        if f.endswith((".html", ".json")):
                            full_path = f"{root}/{f}" if root else f
                            all_files.append(full_path)
            except Exception as e:
                ctx.debug(f"Could not walk remote path: {e}")
                return

            if not all_files:
                return

            # Sort by path (which includes date folders) - newest first
            all_files.sort(reverse=True)

            # Separate html and json
            html_files = [f for f in all_files if f.endswith(".html")]
            json_files = [f for f in all_files if f.endswith(".json")]

            deleted_count = 0

            # Apply count retention
            if self.retention_count is not None:
                if len(html_files) > self.retention_count:
                    for f in html_files[self.retention_count :]:
                        try:
                            fs.rm(f)
                            deleted_count += 1
                        except Exception:
                            pass

                if len(json_files) > self.retention_count:
                    for f in json_files[self.retention_count :]:
                        try:
                            fs.rm(f)
                            deleted_count += 1
                        except Exception:
                            pass

            # Apply time retention
            if self.retention_days is not None:
                cutoff = datetime.now() - timedelta(days=self.retention_days)
                cutoff_str = cutoff.strftime("%Y-%m-%d")

                # Check remaining files
                retention_count = self.retention_count or 100
                remaining = html_files[:retention_count] + json_files[:retention_count]

                for f in remaining:
                    # Try to parse date from path (format: .../YYYY-MM-DD/run_*.html)
                    try:
                        parts = f.replace("\\", "/").split("/")
                        for part in parts:
                            if len(part) == 10 and part[4] == "-" and part[7] == "-":
                                if part < cutoff_str:
                                    try:
                                        fs.rm(f)
                                        deleted_count += 1
                                    except Exception:
                                        pass
                                break
                    except Exception:
                        pass

            # Clean empty date directories
            try:
                for item in fs.ls(path_prefix, detail=False):
                    if fs.isdir(item):
                        contents = fs.ls(item, detail=False)
                        if not contents:
                            fs.rmdir(item)
            except Exception:
                pass

            if deleted_count > 0:
                ctx.debug(
                    "Remote story cleanup completed",
                    deleted=deleted_count,
                    pipeline=self.pipeline_name,
                )

        except ImportError:
            ctx.debug("fsspec not available for remote cleanup")
        except Exception as e:
            ctx.warning(f"Remote story cleanup failed: {e}")

    def _generate_pipeline_index(self) -> None:
        """Generate an index.html with a table of recent runs (Phase 3)."""
        import json

        ctx = get_logging_context()

        if self.is_remote:
            ctx.debug("Pipeline index not yet supported for remote storage")
            return

        if self.output_path is None:
            return

        pipeline_dir = self.output_path / self.pipeline_name
        if not pipeline_dir.exists():
            return

        # Find all JSON files
        json_files = sorted(
            pipeline_dir.glob("**/*.json"),
            key=lambda p: str(p),
            reverse=True,
        )

        if not json_files:
            return

        # Load metadata from each run
        runs = []
        for json_path in json_files[:50]:  # Limit to 50 most recent
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                html_path = json_path.with_suffix(".html")
                relative_html = html_path.relative_to(pipeline_dir)

                runs.append(
                    {
                        "run_id": data.get("run_id", "unknown"),
                        "started_at": data.get("started_at", ""),
                        "duration": data.get("duration", 0),
                        "total_nodes": data.get("total_nodes", 0),
                        "completed_nodes": data.get("completed_nodes", 0),
                        "failed_nodes": data.get("failed_nodes", 0),
                        "success_rate": data.get("success_rate", 0),
                        "html_path": str(relative_html).replace("\\", "/"),
                        "status": "failed" if data.get("failed_nodes", 0) > 0 else "success",
                    }
                )
            except Exception as e:
                ctx.debug(f"Failed to load run metadata: {json_path}, error: {e}")
                continue

        if not runs:
            return

        # Generate index HTML
        index_html = self._render_index_html(runs)
        index_path = pipeline_dir / "index.html"

        try:
            with open(index_path, "w", encoding="utf-8") as f:
                f.write(index_html)
            ctx.debug("Pipeline index generated", path=str(index_path), runs=len(runs))
        except Exception as e:
            ctx.warning(f"Failed to write pipeline index: {e}")

    def _render_index_html(self, runs: List[Dict[str, Any]]) -> str:
        """Render the pipeline history index HTML."""
        rows_html = ""
        for run in runs:
            status_class = "success" if run["status"] == "success" else "failed"
            status_icon = "✓" if run["status"] == "success" else "✗"
            rows_html += f"""
            <tr class="{status_class}">
                <td><a href="{run["html_path"]}">{run["run_id"]}</a></td>
                <td>{run["started_at"]}</td>
                <td>{run["duration"]:.2f}s</td>
                <td>{run["total_nodes"]}</td>
                <td class="status-cell {status_class}">{status_icon} {run["completed_nodes"]}/{run["total_nodes"]}</td>
                <td>{run["success_rate"]:.1f}%</td>
            </tr>
            """

        return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Pipeline History: {self.pipeline_name}</title>
    <style>
        :root {{
            --primary-color: #0066cc;
            --success-color: #28a745;
            --error-color: #dc3545;
        }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            background: #f4f7f9;
            margin: 0;
            padding: 20px;
        }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        h1 {{ color: var(--primary-color); margin-bottom: 20px; }}
        table {{
            width: 100%;
            background: #fff;
            border-collapse: collapse;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }}
        th, td {{ padding: 12px 16px; text-align: left; border-bottom: 1px solid #e1e4e8; }}
        th {{ background: #f8f9fa; font-weight: 600; }}
        tr:hover {{ background: #f8f9fa; }}
        a {{ color: var(--primary-color); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
        .status-cell.success {{ color: var(--success-color); font-weight: 600; }}
        .status-cell.failed {{ color: var(--error-color); font-weight: 600; }}
        tr.failed {{ background: #fff5f5; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>📊 Pipeline History: {self.pipeline_name}</h1>
        <p style="color: #666; margin-bottom: 20px;">Showing {len(runs)} most recent runs</p>
        <table>
            <thead>
                <tr>
                    <th>Run ID</th>
                    <th>Started</th>
                    <th>Duration</th>
                    <th>Nodes</th>
                    <th>Status</th>
                    <th>Success Rate</th>
                </tr>
            </thead>
            <tbody>
                {rows_html}
            </tbody>
        </table>
    </div>
</body>
</html>
"""

    # Legacy methods removed as they are now handled by renderers
    # _generate_node_section, _sample_to_markdown, _dataframe_to_markdown
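For orientation, a minimal sketch of how this generator might be driven from calling code. The constructor and generate() arguments mirror the signatures above; the pipeline name, output directory, timestamps, and the empty node_results/completed/failed/skipped values are illustrative placeholders rather than anything shipped in the package, and a real run would pass the NodeResult objects produced by the pipeline.

# Hypothetical usage sketch; names and values are placeholders, not from the package.
from datetime import datetime, timezone

from odibi.story.generator import StoryGenerator

generator = StoryGenerator(
    pipeline_name="sales_daily",   # placeholder pipeline name
    output_path="stories/",        # local directory; a "scheme://" URL switches to remote mode
    retention_days=30,
    retention_count=100,
)

started = datetime.now(timezone.utc).isoformat()
# ... run the pipeline here, collecting NodeResult objects keyed by node name ...
node_results = {}                  # Dict[str, NodeResult] from the run (empty in this sketch)
ended = datetime.now(timezone.utc).isoformat()

html_path = generator.generate(
    node_results=node_results,
    completed=list(node_results),  # names of nodes that succeeded
    failed=[],
    skipped=[],
    duration=12.3,                 # total pipeline duration in seconds (placeholder)
    start_time=started,
    end_time=ended,
)
print(html_path)                   # path to the generated HTML story
print(generator.get_alert_summary())  # summary dict suitable for alert payloads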