photo-stack-finder: photo_stack_finder-0.1.7-py3-none-any.whl → photo_stack_finder-0.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
@@ -1,604 +1,604 @@
1
- """Pipeline orchestrator for executing validated stage graphs.
2
-
3
- The orchestrator is a minimal coordinator that:
4
- - Executes stages in topological order (computed by PipelineGraph)
5
- - Tracks execution state (ready, running, complete, failed)
6
- - Provides formatted status for UI polling (backend-heavy pattern)
7
- - Halts immediately on first stage failure (fail-fast)
8
- - Has zero domain knowledge (just calls stage.run())
9
-
10
- All formatting happens in the backend - the UI just displays values.
11
-
12
- Architecture:
13
- - Orchestrator receives validated graph from PipelineBuilder
14
- - Calls stage.run() for each stage in dependency order
15
- - Tracks current/completed/failed stages for status queries
16
- - Formats all data for UI consumption (numbers, icons, CSS classes)
17
- - Never modifies graph or stages (read-only after construction)
18
-
19
- Usage:
20
- # Created automatically by PipelineBuilder.__exit__
21
- orchestrator = builder.orchestrator
22
-
23
- # Execute pipeline
24
- orchestrator.execute()
25
-
26
- # Query status (for UI polling)
27
- status = orchestrator.get_execution_status()
28
- """
29
-
30
- from __future__ import annotations
31
-
32
- import time
33
- import traceback
34
- from collections.abc import Callable
35
-
36
- from utils import BasePipelineStage, PhotoFile, PipelineGraph, ProgressInfo, format_seconds_weighted, get_logger
37
-
38
- logger = get_logger()
39
-
40
-
41
- class PipelineOrchestrator:
42
- """Minimal coordinator that executes stages in topological order.
43
-
44
- The orchestrator has zero domain knowledge about stages - it just
45
- executes them in dependency order and tracks execution state.
46
-
47
- All formatting for UI display happens in the orchestrator (backend-heavy
48
- pattern) - the frontend just renders the pre-formatted strings.
49
-
50
- Attributes:
51
- graph: The validated PipelineGraph containing all stages
52
- current_stage_index: Index of currently executing stage (1-based, 0 = not started)
53
- current_phase: Current execution phase (cache_load, prepare, compute, finalise, save)
54
- failed: True if a stage has failed
55
- previous_photos_final: Running count of photos from previous stage (for statistics)
56
- previous_seqs_final: Running count of sequences from previous stage (for statistics)
57
- """
58
-
59
- get_photofiles: Callable[[], dict[int, PhotoFile]]
60
-
61
- def __init__(self, graph: PipelineGraph, should_stop: Callable[[], bool] | None = None) -> None:
62
- """Initialize orchestrator with validated graph.
63
-
64
- Args:
65
- graph: Validated PipelineGraph from PipelineBuilder
66
- should_stop: Optional callback that returns True when execution should stop
67
-
68
- Note:
69
- The graph must have been validated (cycles checked, ports connected)
70
- and execution order must have been computed before passing to orchestrator.
71
- """
72
- self.graph = graph
73
- self.current_stage_index: int = 0
74
- self.current_phase: str = ""
75
- self.failed: bool = False
76
- self.should_stop = should_stop
77
-
78
- # Track running counts from previous stage to support statistics display
79
- # for cached stages (which don't have init counts populated)
80
- self.previous_photos_final: int | None = None
81
- self.previous_seqs_final: int | None = None
82
-
83
- # Stage Index Helpers - Centralized 1-based indexing logic
84
- # =========================================================
85
- # Stage IDs are 1-based (1, 2, 3, ..., N)
86
- # current_stage_index semantics:
87
- # 0 = pipeline not started
88
- # 1..N = stage with that ID is currently running
89
- # N+1 = all stages complete
90
-
91
- def _get_total_stages(self) -> int:
92
- """Get total number of stages in pipeline.
93
-
94
- Returns:
95
- Number of stages (same as max stage_id)
96
- """
97
- return len(self.graph.nodes)
98
-
99
- def _is_pipeline_not_started(self) -> bool:
100
- """Check if pipeline has not started yet.
101
-
102
- Returns:
103
- True if no stages have started (current_stage_index == 0)
104
- """
105
- return self.current_stage_index == 0
106
-
107
- def _is_pipeline_complete(self) -> bool:
108
- """Check if all pipeline stages have completed.
109
-
110
- Returns:
111
- True if current_stage_index > total stages (last stage incremented index)
112
- """
113
- return self.current_stage_index > self._get_total_stages()
114
-
115
- def _is_stage_complete(self, stage_id: int) -> bool:
116
- """Check if a stage has completed execution.
117
-
118
- Args:
119
- stage_id: 1-based stage ID to check
120
-
121
- Returns:
122
- True if stage_id < current_stage_index (stage finished and index advanced)
123
- """
124
- return stage_id < self.current_stage_index
125
-
126
- def _is_stage_running(self, stage_id: int) -> bool:
127
- """Check if a stage is currently executing.
128
-
129
- Args:
130
- stage_id: 1-based stage ID to check
131
-
132
- Returns:
133
- True if stage_id == current_stage_index (stage started but not finished)
134
- """
135
- return stage_id == self.current_stage_index
136
-
137
- def _is_stage_pending(self, stage_id: int) -> bool:
138
- """Check if a stage has not started yet.
139
-
140
- Args:
141
- stage_id: 1-based stage ID to check
142
-
143
- Returns:
144
- True if stage_id > current_stage_index (stage hasn't started)
145
- """
146
- return stage_id > self.current_stage_index
147
-
148
- # Stage Index Mutations - Centralized modification operations
149
- # ============================================================
150
-
151
- def _mark_stage_started(self, stage_id: int) -> None:
152
- """Mark a stage as started by setting current_stage_index to its ID.
153
-
154
- Args:
155
- stage_id: 1-based stage ID that is starting execution
156
- """
157
- logger.info(f"Stage {stage_id} starting (current_stage_index: {self.current_stage_index} -> {stage_id})")
158
- self.current_stage_index = stage_id
159
-
160
- def _mark_stage_completed(self, stage_id: int) -> None:
161
- """Mark a stage as completed by incrementing current_stage_index.
162
-
163
- Args:
164
- stage_id: 1-based stage ID that just completed execution
165
- """
166
- new_index = stage_id + 1
167
- logger.info(f"Stage {stage_id} completed (current_stage_index: {self.current_stage_index} -> {new_index})")
168
- self.current_stage_index = new_index
169
-
170
- def execute(self) -> None:
171
- """Execute all stages in dependency order with graceful cancellation support.
172
-
173
- Executes each stage by calling stage.run(). Stages are executed
174
- sequentially in topological order (dependencies run before dependents).
175
-
176
- The orchestrator supports graceful cancellation via the should_stop callback.
177
- Before/after each stage, it checks if cancellation was requested and halts
178
- cleanly if so. Within stages, KeyboardInterrupt is caught to handle SIGINT.
179
-
180
- The orchestrator is fail-fast: execution halts immediately on the
181
- first stage failure. The `self.failed` flag is set for status queries.
182
-
183
- Raises:
184
- Exception: Any exception raised by a stage (after logging)
185
-
186
- Note:
187
- Stages must implement their own caching and dependency checking.
188
- The orchestrator just calls run() and doesn't manage caching.
189
- """
190
- # Get stages in topological order (dependencies first)
191
- stages = self.graph.get_stages_in_order()
192
-
193
- for stage in stages:
194
- # Check for stop request BEFORE starting each stage
195
- if self.should_stop and self.should_stop():
196
- logger.info("Pipeline execution cancelled by user request")
197
- return
198
-
199
- # Mark stage as started (sets current_stage_index to stage's 1-based ID)
200
- assert stage.stage_id is not None, f"Stage {stage.stage_name} missing stage_id"
201
- self._mark_stage_started(stage.stage_id)
202
- self.current_phase = ""
203
-
204
- try:
205
- # Set phase callback to track execution phase
206
- stage._phase_callback = self._update_phase
207
-
208
- # Call stage's run method
209
- # Note: Stages handle their own caching and dependency checking
210
- #
211
- # KNOWN ISSUE (will be fixed in Phase 2):
212
- # Current PipelineStage.run() signature requires (prep, args) arguments.
213
- # The new orchestrator architecture is designed for port-based stages
214
- # where run() takes no arguments (inputs come from ports).
215
- # This mypy error will be resolved in Phase 2 when all stages are migrated
216
- # to the new port-based interface.
217
- stage.run()
218
-
219
- # Check for stop request AFTER stage completes
220
- # (allows current stage to finish, but prevents starting next stage)
221
- if self.should_stop and self.should_stop():
222
- logger.info("Pipeline execution cancelled by user request")
223
- return
224
-
225
- # Mark stage as completed (increments current_stage_index)
226
- self._mark_stage_completed(stage.stage_id)
227
- self.current_phase = ""
228
-
229
- # Clear progress tracker AFTER marking complete
230
- # (keeps progress visible through finalise() and save)
231
- stage._progress_tracker = None
232
-
233
- # Log completion with summary
234
- self._log_stage_completion(stage)
235
-
236
- except Exception as e:
237
- # Record failure and halt execution (fail-fast)
238
- self.failed = True
239
- logger.error(f"Stage '{stage.stage_name}' failed: {e} + {traceback.format_exc()}")
240
- raise # Re-raise to halt pipeline # Re-raise to halt pipeline # Re-raise to halt pipeline
241
-
242
- def _update_phase(self, phase: str) -> None:
243
- """Update current execution phase (called by stage via phase callback).
244
-
245
- Args:
246
- phase: Current phase name (cache_load, prepare, compute, finalise, save)
247
- """
248
- self.current_phase = phase
249
-
250
- def _log_stage_completion(self, stage: BasePipelineStage) -> None:
251
- """Log stage completion with summary statistics.
252
-
253
- Displays final reference counts and percentage reduction achieved by the stage.
254
- Omits sequence counts in early stages where they don't exist.
255
- Shows execution time if stage was computed (not loaded from cache).
256
-
257
- Uses orchestrator's tracked counts from previous stage as init counts.
258
- This works for both cached stages (where stage.ref_photos_init is None)
259
- and computed stages, providing a single source of truth.
260
-
261
- Args:
262
- stage: The stage that just completed
263
- """
264
- # Build completion message with summary
265
- message_parts = [f"Stage '{stage.stage_name}' complete:"]
266
-
267
- # Add execution time if stage was computed (progress tracker exists)
268
- # Progress tracker is only created during compute phase, so its existence
269
- # indicates the stage was not loaded from cache
270
- if hasattr(stage, "_progress_tracker") and stage._progress_tracker is not None:
271
- elapsed = time.time() - stage._progress_tracker.start_time
272
- message_parts.append(f"{format_seconds_weighted(elapsed)}")
273
-
274
- # Add photo counts with reduction percentage if available
275
- if stage.ref_photos_final is not None:
276
- photo_msg = f"{stage.ref_photos_final} photos"
277
-
278
- # Calculate percentage reduction using orchestrator's tracked count from previous stage
279
- if self.previous_photos_final is not None and self.previous_photos_final > 0:
280
- reduction = ((self.previous_photos_final - stage.ref_photos_final) / self.previous_photos_final) * 100
281
- photo_msg += f" ({reduction:.1f}% reduction)"
282
-
283
- message_parts.append(photo_msg)
284
-
285
- # Add sequence counts with reduction percentage if available
286
- # Omit if None or 0 (early stages don't have sequences yet)
287
- if stage.ref_seqs_final is not None and stage.ref_seqs_final > 0:
288
- seq_msg = f"{stage.ref_seqs_final} sequences"
289
-
290
- # Calculate percentage reduction using orchestrator's tracked count from previous stage
291
- if self.previous_seqs_final is not None and self.previous_seqs_final > 0:
292
- reduction = ((self.previous_seqs_final - stage.ref_seqs_final) / self.previous_seqs_final) * 100
293
- seq_msg += f" ({reduction:.1f}% reduction)"
294
-
295
- message_parts.append(seq_msg)
296
-
297
- # Update tracked counts for next stage (single source of truth)
298
- self.previous_photos_final = stage.ref_photos_final
299
- self.previous_seqs_final = stage.ref_seqs_final
300
-
301
- # Add review data summary if available
302
- review_type = stage.needs_review()
303
- if review_type == "photos" and len(stage.identical_review_result) > 0:
304
- message_parts.append(f"Found {len(stage.identical_review_result)} identical groups")
305
- elif review_type == "sequences" and len(stage.sequence_review_result) > 0:
306
- message_parts.append(f"Found {len(stage.sequence_review_result)} sequence groups")
307
-
308
- # Add cache size if available
309
- if stage.path.exists():
310
- cache_size_mb = stage.path.stat().st_size / (1024 * 1024)
311
- message_parts.append(f"Cache saved ({cache_size_mb:.1f} MB)")
312
-
313
- logger.info(" | ".join(message_parts))
314
-
315
- # ========================================================================
316
- # UI Query Methods (Backend-Heavy Pattern)
317
- # ========================================================================
318
- #
319
- # All methods below return fully formatted data ready for UI display.
320
- # The UI just renders the strings, numbers, and CSS classes - no
321
- # formatting logic in JavaScript.
322
-
323
- # FIXME: Use a pydantic structure rather than a complicated dict.
324
- def get_execution_status(
325
- self,
326
- ) -> dict[
327
- str,
328
- str | dict[str, str] | list[dict[str, str | int | bool | list[str] | None]] | None,
329
- ]:
330
- """Get complete execution state with all formatting done (for UI polling).
331
-
332
- Returns fully formatted execution status including:
333
- - Pipeline state (ready/running/complete/failed)
334
- - Overall progress (stages completed, percentage)
335
- - Current execution phase (cache_load, prepare, compute, finalise, save)
336
- - Per-stage status (pending/running/complete/failed with icons)
337
- - Current stage progress (if running)
338
- - Stage statistics (formatted for display)
339
-
340
- All strings, numbers, and CSS classes are formatted in the backend.
341
- The UI just renders the values without any formatting logic.
342
-
343
- Returns:
344
- Dictionary with fully formatted execution status:
345
- {
346
- "state": "running", # "ready" | "running" | "complete" | "failed"
347
- "state_display": "Running...", # Human-readable state
348
- "pipeline_progress": "2 / 5 stages", # Formatted progress
349
- "pipeline_percentage": "40%", # Percentage as string
350
- "pipeline_bar_width": "40%", # CSS width property
351
- "current_phase": "compute", # Current execution phase (or "" if not running)
352
- "stages": [ # List of stages with formatted status
353
- {
354
- "position": 1,
355
- "name": "discover_files",
356
- "display_name": "Discover Files",
357
- "status": "complete",
358
- "status_icon": "check",
359
- "status_class": "stage-complete",
360
- "has_review": False
361
- },
362
- ...
363
- ],
364
- "current_progress": { # Progress of current stage (if running)
365
- "fraction_complete": 0.73,
366
- "percentage_display": "73%",
367
- "progress_bar_width": "73%",
368
- "status_message": "Processing photos",
369
- "items_display": "11,123 / 15,234",
370
- "rate_display": "1,250 items/sec",
371
- "eta_display": "3 seconds",
372
- "stage_display": "Compute Identical"
373
- } or None
374
- }
375
- """
376
- progress = self.get_current_progress()
377
-
378
- # Calculate overall pipeline progress
379
- # Number of completed stages = current_stage_index - 1 (since index points to running stage)
380
- # Exception: when pipeline complete, current_stage_index = total + 1, so we clamp to total
381
- total = self._get_total_stages()
382
- completed = min(max(0, self.current_stage_index - 1), total)
383
- pipeline_percentage = int((completed / total) * 100) if total > 0 else 0
384
-
385
- return {
386
- "state": self._get_state(), # "ready", "running", "complete", "failed"
387
- "state_display": self._format_state_display(), # "Running...", "Complete ✓"
388
- # Overall pipeline progress (formatted)
389
- "pipeline_progress": f"{completed} / {total} stages",
390
- "pipeline_percentage": f"{pipeline_percentage}%",
391
- "pipeline_bar_width": f"{pipeline_percentage}%",
392
- # Current phase (cache_load, prepare, compute, finalise, save)
393
- "current_phase": self.current_phase,
394
- # Stage list with status and statistics
395
- "stages": self._format_stage_list(),
396
- # Current stage progress (if running)
397
- "current_progress": progress.__dict__ if progress else None,
398
- }
399
-
400
- def get_current_progress(self) -> ProgressInfo | None:
401
- """Get progress of currently running stage.
402
-
403
- Returns:
404
- ProgressInfo with formatted progress data, or None if no stage is running
405
-
406
- Note:
407
- Requires stages to implement get_progress() method (Phase 1.6)
408
- """
409
- if self._is_pipeline_not_started():
410
- return None # Pipeline not started
411
-
412
- if self._is_pipeline_complete():
413
- return None # All stages complete
414
-
415
- stages = self.graph.get_stages_in_order()
416
-
417
- # Find stage by matching stage_id (1-based) with current_stage_index
418
- current_stage = next(
419
- (s for s in stages if s.stage_id == self.current_stage_index),
420
- None,
421
- )
422
-
423
- if current_stage is None:
424
- return None # Stage not found
425
-
426
- return current_stage.get_progress()
427
-
428
- def _get_state(self) -> str:
429
- """Get current pipeline state.
430
-
431
- Returns:
432
- One of: "ready", "running", "complete", "failed"
433
- """
434
- if self.failed:
435
- return "failed"
436
-
437
- if self._is_pipeline_complete():
438
- return "complete"
439
-
440
- if not self._is_pipeline_not_started() or self.current_phase:
441
- return "running"
442
-
443
- return "ready"
444
-
445
- def _format_state_display(self) -> str:
446
- """Format state for display.
447
-
448
- Returns:
449
- Human-readable state with emoji/symbol:
450
- - "Ready to execute"
451
- - "Running..."
452
- - "Complete ✓"
453
- - "Failed ✗"
454
- """
455
- state_display = {
456
- "ready": "Ready to execute",
457
- "running": "Running...",
458
- "complete": "Complete ✓",
459
- "failed": "Failed ✗",
460
- }
461
- return state_display[self._get_state()]
462
-
463
- def _format_stage_list(
464
- self,
465
- ) -> list[dict[str, str | int | bool | list[str] | None]]:
466
- """Format stage list with status indicators and statistics.
467
-
468
- Returns list of stages in execution order with:
469
- - Position (1-indexed)
470
- - Name and display name
471
- - Status (pending/running/complete/failed)
472
- - Status icon and CSS class
473
- - Whether stage has review data
474
- - Statistics (photo/sequence counts and reduction percentages)
475
-
476
- Returns:
477
- List of formatted stage dictionaries for UI rendering
478
- """
479
- stages = self.graph.get_stages_in_order()
480
- result: list[dict[str, str | int | bool | list[str] | None]] = []
481
-
482
- # Track previous stage's final counts for statistics calculation
483
- prev_photos_final: int | None = None
484
- prev_seqs_final: int | None = None
485
-
486
- for stage in stages:
487
- # Use stage's 1-based ID for status comparison
488
- assert stage.stage_id is not None, f"Stage {stage.stage_name} missing stage_id"
489
- stage_id = stage.stage_id
490
-
491
- # Determine status using centralized helper methods
492
- status: str
493
- icon: str
494
- css_class: str
495
-
496
- if self.failed and self._is_stage_running(stage_id):
497
- # Current stage failed
498
- status, icon, css_class = "failed", "X", "stage-failed"
499
- elif self._is_stage_complete(stage_id):
500
- # Stage completed
501
- status, icon, css_class = "complete", "check", "stage-complete"
502
- elif self._is_stage_running(stage_id):
503
- # Stage currently running
504
- status, icon, css_class = "running", "play", "stage-running"
505
- else:
506
- # Stage pending (not started yet)
507
- status, icon, css_class = "pending", " ", "stage-pending"
508
-
509
- # Build statistics strings for completed stages
510
- stats_parts: list[str] = []
511
-
512
- if self._is_stage_complete(stage_id):
513
- # Stage completed - calculate statistics
514
- # For photos
515
- if stage.ref_photos_final is not None:
516
- photo_str = f"{stage.ref_photos_final:,} photos"
517
- if prev_photos_final is not None and prev_photos_final > 0:
518
- reduction = ((prev_photos_final - stage.ref_photos_final) / prev_photos_final) * 100
519
- photo_str += f" ({reduction:.1f}% reduction)"
520
- stats_parts.append(photo_str)
521
-
522
- # For sequences (omit if None or 0)
523
- if stage.ref_seqs_final is not None and stage.ref_seqs_final > 0:
524
- seq_str = f"{stage.ref_seqs_final:,} sequences"
525
- if prev_seqs_final is not None and prev_seqs_final > 0:
526
- reduction = ((prev_seqs_final - stage.ref_seqs_final) / prev_seqs_final) * 100
527
- seq_str += f" ({reduction:.1f}% reduction)"
528
- stats_parts.append(seq_str)
529
-
530
- # Add performance metrics (elapsed time and throughput)
531
- if stage.elapsed_seconds is not None:
532
- minutes = int(stage.elapsed_seconds // 60)
533
- seconds = int(stage.elapsed_seconds % 60)
534
- time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"
535
- stats_parts.append(time_str)
536
-
537
- if stage.throughput is not None:
538
- stats_parts.append(f"{stage.throughput:.0f} items/sec")
539
-
540
- # Update tracking for next stage
541
- prev_photos_final = stage.ref_photos_final
542
- prev_seqs_final = stage.ref_seqs_final
543
-
544
- result.append(
545
- {
546
- "position": stage_id, # Already 1-based from graph
547
- "name": stage.stage_name,
548
- "display_name": stage.stage_name, # Frontend expects display_name
549
- "status": status,
550
- "status_icon": icon,
551
- "status_class": css_class,
552
- # Only show review data if stage completed (stage_id < current_stage_index)
553
- "has_review": stage_id < self.current_stage_index and stage.has_review_data(),
554
- # Statistics (formatted string, empty for pending/running stages)
555
- "statistics": " | ".join(stats_parts) if stats_parts else None,
556
- }
557
- )
558
-
559
- return result
560
-
561
- # ========================================================================
562
- # Utility Methods (for Web UI)
563
- # ========================================================================
564
-
565
- def has_review_data(self, stage_name: str) -> bool:
566
- """Check if stage has review data in current run.
567
-
568
- Args:
569
- stage_name: Name of the stage
570
-
571
- Returns:
572
- True if stage completed in current run AND has reviewable data available
573
-
574
- Raises:
575
- KeyError: If stage name not found in graph
576
- """
577
- # Find stage by name
578
- stages = self.graph.get_stages_in_order()
579
- found_stage = next((s for s in stages if s.stage_name == stage_name), None)
580
-
581
- if found_stage is None:
582
- raise KeyError(f"Stage '{stage_name}' not found in graph")
583
-
584
- assert found_stage.stage_id is not None, f"Stage {stage_name} missing stage_id"
585
-
586
- # Must have completed in current run (stage_id < current_stage_index)
587
- if found_stage.stage_id >= self.current_stage_index:
588
- return False
589
-
590
- return found_stage.has_review_data()
591
-
592
- def get_stage(self, stage_name: str) -> BasePipelineStage:
593
- """Get stage instance (for review UI).
594
-
595
- Args:
596
- stage_name: Name of the stage
597
-
598
- Returns:
599
- BasePipelineStage instance
600
-
601
- Raises:
602
- KeyError: If stage name not found in graph
603
- """
604
- return self.graph.get_all_stages()[stage_name]
1
+ """Pipeline orchestrator for executing validated stage graphs.
2
+
3
+ The orchestrator is a minimal coordinator that:
4
+ - Executes stages in topological order (computed by PipelineGraph)
5
+ - Tracks execution state (ready, running, complete, failed)
6
+ - Provides formatted status for UI polling (backend-heavy pattern)
7
+ - Halts immediately on first stage failure (fail-fast)
8
+ - Has zero domain knowledge (just calls stage.run())
9
+
10
+ All formatting happens in the backend - the UI just displays values.
11
+
12
+ Architecture:
13
+ - Orchestrator receives validated graph from PipelineBuilder
14
+ - Calls stage.run() for each stage in dependency order
15
+ - Tracks current/completed/failed stages for status queries
16
+ - Formats all data for UI consumption (numbers, icons, CSS classes)
17
+ - Never modifies graph or stages (read-only after construction)
18
+
19
+ Usage:
20
+ # Created automatically by PipelineBuilder.__exit__
21
+ orchestrator = builder.orchestrator
22
+
23
+ # Execute pipeline
24
+ orchestrator.execute()
25
+
26
+ # Query status (for UI polling)
27
+ status = orchestrator.get_execution_status()
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import time
33
+ import traceback
34
+ from collections.abc import Callable
35
+
36
+ from utils import BasePipelineStage, PhotoFile, PipelineGraph, ProgressInfo, format_seconds_weighted, get_logger
37
+
38
+ logger = get_logger()
39
+
40
+
41
+ class PipelineOrchestrator:
42
+ """Minimal coordinator that executes stages in topological order.
43
+
44
+ The orchestrator has zero domain knowledge about stages - it just
45
+ executes them in dependency order and tracks execution state.
46
+
47
+ All formatting for UI display happens in the orchestrator (backend-heavy
48
+ pattern) - the frontend just renders the pre-formatted strings.
49
+
50
+ Attributes:
51
+ graph: The validated PipelineGraph containing all stages
52
+ current_stage_index: Index of currently executing stage (1-based, 0 = not started)
53
+ current_phase: Current execution phase (cache_load, prepare, compute, finalise, save)
54
+ failed: True if a stage has failed
55
+ previous_photos_final: Running count of photos from previous stage (for statistics)
56
+ previous_seqs_final: Running count of sequences from previous stage (for statistics)
57
+ """
58
+
59
+ get_photofiles: Callable[[], dict[int, PhotoFile]]
60
+
61
+ def __init__(self, graph: PipelineGraph, should_stop: Callable[[], bool] | None = None) -> None:
62
+ """Initialize orchestrator with validated graph.
63
+
64
+ Args:
65
+ graph: Validated PipelineGraph from PipelineBuilder
66
+ should_stop: Optional callback that returns True when execution should stop
67
+
68
+ Note:
69
+ The graph must have been validated (cycles checked, ports connected)
70
+ and execution order must have been computed before passing to orchestrator.
71
+ """
72
+ self.graph = graph
73
+ self.current_stage_index: int = 0
74
+ self.current_phase: str = ""
75
+ self.failed: bool = False
76
+ self.should_stop = should_stop
77
+
78
+ # Track running counts from previous stage to support statistics display
79
+ # for cached stages (which don't have init counts populated)
80
+ self.previous_photos_final: int | None = None
81
+ self.previous_seqs_final: int | None = None
82
+
83
+ # Stage Index Helpers - Centralized 1-based indexing logic
84
+ # =========================================================
85
+ # Stage IDs are 1-based (1, 2, 3, ..., N)
86
+ # current_stage_index semantics:
87
+ # 0 = pipeline not started
88
+ # 1..N = stage with that ID is currently running
89
+ # N+1 = all stages complete
90
+
91
+ def _get_total_stages(self) -> int:
92
+ """Get total number of stages in pipeline.
93
+
94
+ Returns:
95
+ Number of stages (same as max stage_id)
96
+ """
97
+ return len(self.graph.nodes)
98
+
99
+ def _is_pipeline_not_started(self) -> bool:
100
+ """Check if pipeline has not started yet.
101
+
102
+ Returns:
103
+ True if no stages have started (current_stage_index == 0)
104
+ """
105
+ return self.current_stage_index == 0
106
+
107
+ def _is_pipeline_complete(self) -> bool:
108
+ """Check if all pipeline stages have completed.
109
+
110
+ Returns:
111
+ True if current_stage_index > total stages (last stage incremented index)
112
+ """
113
+ return self.current_stage_index > self._get_total_stages()
114
+
115
+ def _is_stage_complete(self, stage_id: int) -> bool:
116
+ """Check if a stage has completed execution.
117
+
118
+ Args:
119
+ stage_id: 1-based stage ID to check
120
+
121
+ Returns:
122
+ True if stage_id < current_stage_index (stage finished and index advanced)
123
+ """
124
+ return stage_id < self.current_stage_index
125
+
126
+ def _is_stage_running(self, stage_id: int) -> bool:
127
+ """Check if a stage is currently executing.
128
+
129
+ Args:
130
+ stage_id: 1-based stage ID to check
131
+
132
+ Returns:
133
+ True if stage_id == current_stage_index (stage started but not finished)
134
+ """
135
+ return stage_id == self.current_stage_index
136
+
137
+ def _is_stage_pending(self, stage_id: int) -> bool:
138
+ """Check if a stage has not started yet.
139
+
140
+ Args:
141
+ stage_id: 1-based stage ID to check
142
+
143
+ Returns:
144
+ True if stage_id > current_stage_index (stage hasn't started)
145
+ """
146
+ return stage_id > self.current_stage_index
147
+
148
+ # Stage Index Mutations - Centralized modification operations
149
+ # ============================================================
150
+
151
def _mark_stage_started(self, stage_id: int) -> None:
    """Record that a stage has begun by pointing current_stage_index at it.

    Args:
        stage_id: 1-based ID of the stage that is about to execute
    """
    previous_index = self.current_stage_index
    logger.info(f"Stage {stage_id} starting (current_stage_index: {previous_index} -> {stage_id})")
    self.current_stage_index = stage_id
159
+
160
def _mark_stage_completed(self, stage_id: int) -> None:
    """Record that a stage has finished by moving the index one past its ID.

    Args:
        stage_id: 1-based ID of the stage that just finished executing
    """
    previous_index = self.current_stage_index
    next_index = stage_id + 1
    logger.info(f"Stage {stage_id} completed (current_stage_index: {previous_index} -> {next_index})")
    self.current_stage_index = next_index
169
+
170
def execute(self) -> None:
    """Execute all stages in dependency order with graceful cancellation support.

    Each stage is executed by calling stage.run(). Stages run sequentially
    in the order returned by graph.get_stages_in_order() (dependencies
    before dependents).

    Cancellation is cooperative: the should_stop callback (if provided) is
    polled before each stage starts and again after stage.run() returns.
    When it reports True the method logs the cancellation and returns
    without starting further stages. A stage whose run() finished but was
    followed by a stop request is NOT marked completed.

    The orchestrator is fail-fast: the first exception raised by a stage
    sets `self.failed`, is logged with its traceback, and is re-raised.
    Only `Exception` subclasses are handled here; KeyboardInterrupt derives
    from BaseException and therefore propagates without setting
    `self.failed`.

    Raises:
        Exception: Any exception raised by a stage (after logging)

    Note:
        Stages must implement their own caching and dependency checking.
        The orchestrator just calls run() and doesn't manage caching.
    """
    # Get stages in topological order (dependencies first)
    stages = self.graph.get_stages_in_order()

    for stage in stages:
        # Check for stop request BEFORE starting each stage
        if self.should_stop and self.should_stop():
            logger.info("Pipeline execution cancelled by user request")
            return

        # Mark stage as started (sets current_stage_index to stage's 1-based ID)
        assert stage.stage_id is not None, f"Stage {stage.stage_name} missing stage_id"
        self._mark_stage_started(stage.stage_id)
        self.current_phase = ""

        try:
            # Set phase callback to track execution phase
            stage._phase_callback = self._update_phase

            # Call stage's run method
            # Note: Stages handle their own caching and dependency checking
            #
            # KNOWN ISSUE (will be fixed in Phase 2):
            # Current PipelineStage.run() signature requires (prep, args) arguments.
            # The new orchestrator architecture is designed for port-based stages
            # where run() takes no arguments (inputs come from ports).
            # This mypy error will be resolved in Phase 2 when all stages are migrated
            # to the new port-based interface.
            stage.run()

            # Check for stop request AFTER stage completes
            # (allows current stage to finish, but prevents starting next stage)
            if self.should_stop and self.should_stop():
                logger.info("Pipeline execution cancelled by user request")
                return

            # Mark stage as completed (moves current_stage_index past this stage)
            self._mark_stage_completed(stage.stage_id)
            self.current_phase = ""

            # Log completion BEFORE clearing the progress tracker:
            # _log_stage_completion reads stage._progress_tracker to report the
            # elapsed time, so clearing it first would always drop that figure.
            self._log_stage_completion(stage)

            # Clear progress tracker only after the stage is marked complete
            # and summarised (keeps progress visible through finalise() and save)
            stage._progress_tracker = None

        except Exception as e:
            # Record failure and halt execution (fail-fast)
            self.failed = True
            logger.error(f"Stage '{stage.stage_name}' failed: {e} + {traceback.format_exc()}")
            raise  # Re-raise to halt pipeline
241
+
242
+ def _update_phase(self, phase: str) -> None:
243
+ """Update current execution phase (called by stage via phase callback).
244
+
245
+ Args:
246
+ phase: Current phase name (cache_load, prepare, compute, finalise, save)
247
+ """
248
+ self.current_phase = phase
249
+
250
def _log_stage_completion(self, stage: BasePipelineStage) -> None:
    """Emit a one-line summary for a stage that has just finished.

    The summary is a " | "-joined list of fragments: a header, the elapsed
    time (only when the stage still carries a progress tracker), the final
    photo count, the final sequence count (skipped when None or 0), a
    review-group count, and the cache file size when the cache file exists.

    Reduction percentages are computed against the counts this orchestrator
    remembered from the previously logged stage, not against the stage's own
    init counts. Those remembered counts are then overwritten with this
    stage's final counts.

    Args:
        stage: The stage that just completed
    """

    def _with_reduction(label: str, before: int | None, after: int) -> str:
        # Append the percentage drop relative to the previous stage's count,
        # but only when that earlier count is known and non-zero.
        if before is not None and before > 0:
            reduction = ((before - after) / before) * 100
            label += f" ({reduction:.1f}% reduction)"
        return label

    fragments = [f"Stage '{stage.stage_name}' complete:"]

    # A surviving progress tracker supplies the start time for the elapsed figure
    tracker = getattr(stage, "_progress_tracker", None)
    if tracker is not None:
        elapsed = time.time() - tracker.start_time
        fragments.append(f"{format_seconds_weighted(elapsed)}")

    # Photo count, with reduction when a previous count was tracked
    final_photos = stage.ref_photos_final
    if final_photos is not None:
        fragments.append(_with_reduction(f"{final_photos} photos", self.previous_photos_final, final_photos))

    # Sequence count; early stages report None or 0 and are left out
    final_seqs = stage.ref_seqs_final
    if final_seqs is not None and final_seqs > 0:
        fragments.append(_with_reduction(f"{final_seqs} sequences", self.previous_seqs_final, final_seqs))

    # Remember this stage's results as the baseline for the next stage
    self.previous_photos_final = stage.ref_photos_final
    self.previous_seqs_final = stage.ref_seqs_final

    # Review summary, depending on which kind of review the stage asks for
    review_kind = stage.needs_review()
    if review_kind == "photos":
        group_count = len(stage.identical_review_result)
        if group_count > 0:
            fragments.append(f"Found {group_count} identical groups")
    elif review_kind == "sequences":
        group_count = len(stage.sequence_review_result)
        if group_count > 0:
            fragments.append(f"Found {group_count} sequence groups")

    # Cache size in MB, only when the cache file is present on disk
    cache_path = stage.path
    if cache_path.exists():
        cache_size_mb = cache_path.stat().st_size / (1024 * 1024)
        fragments.append(f"Cache saved ({cache_size_mb:.1f} MB)")

    logger.info(" | ".join(fragments))
314
+
315
+ # ========================================================================
316
+ # UI Query Methods (Backend-Heavy Pattern)
317
+ # ========================================================================
318
+ #
319
+ # All methods below return fully formatted data ready for UI display.
320
+ # The UI just renders the strings, numbers, and CSS classes - no
321
+ # formatting logic in JavaScript.
322
+
323
+ # FIXME: Use a pydantic structure rather than a complicated dict.
324
def get_execution_status(
    self,
) -> dict[
    str,
    str | dict[str, str] | list[dict[str, str | int | bool | list[str] | None]] | None,
]:
    """Get complete execution state with all formatting done (for UI polling).

    Returns fully formatted execution status including:
    - Pipeline state (ready/running/complete/failed)
    - Overall progress (stages completed, percentage)
    - Current execution phase (cache_load, prepare, compute, finalise, save)
    - Per-stage status (pending/running/complete/failed with icons)
    - Current stage progress (if running)

    All strings, numbers, and CSS classes are formatted in the backend.
    The UI just renders the values without any formatting logic.

    Returns:
        Dictionary with fully formatted execution status:
        {
            "state": "running",                   # "ready" | "running" | "complete" | "failed"
            "state_display": "Running...",        # Human-readable state
            "pipeline_progress": "2 / 5 stages",  # Formatted progress
            "pipeline_percentage": "40%",         # Whole-number percentage (floored)
            "pipeline_bar_width": "40%",          # CSS width property
            "current_phase": "compute",           # Current phase (or "" if not running)
            "stages": [...],                      # Output of _format_stage_list()
            "current_progress": {...} or None     # Attributes of the current ProgressInfo
        }
    """
    progress = self.get_current_progress()

    # Completed stages = current_stage_index - 1 (the index points at the
    # running stage). When the pipeline is complete the index is total + 1,
    # and before it starts the index is 0, hence the clamp to [0, total].
    total = self._get_total_stages()
    completed = min(max(0, self.current_stage_index - 1), total)

    # Integer arithmetic keeps the floor exact. The float form
    # int((completed / total) * 100) can round just below a whole number
    # (e.g. 29 / 100 -> 28.999999999999996) and under-report by one percent.
    pipeline_percentage = (completed * 100) // total if total > 0 else 0
    percentage_text = f"{pipeline_percentage}%"

    return {
        "state": self._get_state(),  # "ready", "running", "complete", "failed"
        "state_display": self._format_state_display(),  # "Running...", "Complete ✓"
        # Overall pipeline progress (formatted)
        "pipeline_progress": f"{completed} / {total} stages",
        "pipeline_percentage": percentage_text,
        "pipeline_bar_width": percentage_text,
        # Current phase (cache_load, prepare, compute, finalise, save)
        "current_phase": self.current_phase,
        # Stage list with status and statistics
        "stages": self._format_stage_list(),
        # Current stage progress (if running)
        "current_progress": progress.__dict__ if progress else None,
    }
399
+
400
def get_current_progress(self) -> ProgressInfo | None:
    """Fetch the progress report of the stage that is executing right now.

    Returns:
        Whatever the running stage's get_progress() returns, or None when
        the pipeline has not started, has already finished, or no stage's
        stage_id matches current_stage_index

    Note:
        Requires stages to implement get_progress() method (Phase 1.6)
    """
    # Nothing is running before the first stage or after the last one
    if self._is_pipeline_not_started() or self._is_pipeline_complete():
        return None

    # Locate the stage whose 1-based ID equals current_stage_index
    for candidate in self.graph.get_stages_in_order():
        if candidate.stage_id == self.current_stage_index:
            return candidate.get_progress()

    return None  # Stage not found
427
+
428
+ def _get_state(self) -> str:
429
+ """Get current pipeline state.
430
+
431
+ Returns:
432
+ One of: "ready", "running", "complete", "failed"
433
+ """
434
+ if self.failed:
435
+ return "failed"
436
+
437
+ if self._is_pipeline_complete():
438
+ return "complete"
439
+
440
+ if not self._is_pipeline_not_started() or self.current_phase:
441
+ return "running"
442
+
443
+ return "ready"
444
+
445
+ def _format_state_display(self) -> str:
446
+ """Format state for display.
447
+
448
+ Returns:
449
+ Human-readable state with emoji/symbol:
450
+ - "Ready to execute"
451
+ - "Running..."
452
+ - "Complete ✓"
453
+ - "Failed ✗"
454
+ """
455
+ state_display = {
456
+ "ready": "Ready to execute",
457
+ "running": "Running...",
458
+ "complete": "Complete ✓",
459
+ "failed": "Failed ✗",
460
+ }
461
+ return state_display[self._get_state()]
462
+
463
+ def _format_stage_list(
464
+ self,
465
+ ) -> list[dict[str, str | int | bool | list[str] | None]]:
466
+ """Format stage list with status indicators and statistics.
467
+
468
+ Returns list of stages in execution order with:
469
+ - Position (1-indexed)
470
+ - Name and display name
471
+ - Status (pending/running/complete/failed)
472
+ - Status icon and CSS class
473
+ - Whether stage has review data
474
+ - Statistics (photo/sequence counts and reduction percentages)
475
+
476
+ Returns:
477
+ List of formatted stage dictionaries for UI rendering
478
+ """
479
+ stages = self.graph.get_stages_in_order()
480
+ result: list[dict[str, str | int | bool | list[str] | None]] = []
481
+
482
+ # Track previous stage's final counts for statistics calculation
483
+ prev_photos_final: int | None = None
484
+ prev_seqs_final: int | None = None
485
+
486
+ for stage in stages:
487
+ # Use stage's 1-based ID for status comparison
488
+ assert stage.stage_id is not None, f"Stage {stage.stage_name} missing stage_id"
489
+ stage_id = stage.stage_id
490
+
491
+ # Determine status using centralized helper methods
492
+ status: str
493
+ icon: str
494
+ css_class: str
495
+
496
+ if self.failed and self._is_stage_running(stage_id):
497
+ # Current stage failed
498
+ status, icon, css_class = "failed", "X", "stage-failed"
499
+ elif self._is_stage_complete(stage_id):
500
+ # Stage completed
501
+ status, icon, css_class = "complete", "check", "stage-complete"
502
+ elif self._is_stage_running(stage_id):
503
+ # Stage currently running
504
+ status, icon, css_class = "running", "play", "stage-running"
505
+ else:
506
+ # Stage pending (not started yet)
507
+ status, icon, css_class = "pending", " ", "stage-pending"
508
+
509
+ # Build statistics strings for completed stages
510
+ stats_parts: list[str] = []
511
+
512
+ if self._is_stage_complete(stage_id):
513
+ # Stage completed - calculate statistics
514
+ # For photos
515
+ if stage.ref_photos_final is not None:
516
+ photo_str = f"{stage.ref_photos_final:,} photos"
517
+ if prev_photos_final is not None and prev_photos_final > 0:
518
+ reduction = ((prev_photos_final - stage.ref_photos_final) / prev_photos_final) * 100
519
+ photo_str += f" ({reduction:.1f}% reduction)"
520
+ stats_parts.append(photo_str)
521
+
522
+ # For sequences (omit if None or 0)
523
+ if stage.ref_seqs_final is not None and stage.ref_seqs_final > 0:
524
+ seq_str = f"{stage.ref_seqs_final:,} sequences"
525
+ if prev_seqs_final is not None and prev_seqs_final > 0:
526
+ reduction = ((prev_seqs_final - stage.ref_seqs_final) / prev_seqs_final) * 100
527
+ seq_str += f" ({reduction:.1f}% reduction)"
528
+ stats_parts.append(seq_str)
529
+
530
+ # Add performance metrics (elapsed time and throughput)
531
+ if stage.elapsed_seconds is not None:
532
+ minutes = int(stage.elapsed_seconds // 60)
533
+ seconds = int(stage.elapsed_seconds % 60)
534
+ time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"
535
+ stats_parts.append(time_str)
536
+
537
+ if stage.throughput is not None:
538
+ stats_parts.append(f"{stage.throughput:.0f} items/sec")
539
+
540
+ # Update tracking for next stage
541
+ prev_photos_final = stage.ref_photos_final
542
+ prev_seqs_final = stage.ref_seqs_final
543
+
544
+ result.append(
545
+ {
546
+ "position": stage_id, # Already 1-based from graph
547
+ "name": stage.stage_name,
548
+ "display_name": stage.stage_name, # Frontend expects display_name
549
+ "status": status,
550
+ "status_icon": icon,
551
+ "status_class": css_class,
552
+ # Only show review data if stage completed (stage_id < current_stage_index)
553
+ "has_review": stage_id < self.current_stage_index and stage.has_review_data(),
554
+ # Statistics (formatted string, empty for pending/running stages)
555
+ "statistics": " | ".join(stats_parts) if stats_parts else None,
556
+ }
557
+ )
558
+
559
+ return result
560
+
561
+ # ========================================================================
562
+ # Utility Methods (for Web UI)
563
+ # ========================================================================
564
+
565
def has_review_data(self, stage_name: str) -> bool:
    """Report whether a named stage finished in this run and has review data.

    Args:
        stage_name: Name of the stage

    Returns:
        False when the stage has not completed in the current run
        (stage_id >= current_stage_index); otherwise the result of the
        stage's own has_review_data()

    Raises:
        KeyError: If stage name not found in graph
    """
    # Look the stage up by name among the ordered stages
    matches = [s for s in self.graph.get_stages_in_order() if s.stage_name == stage_name]
    if not matches:
        raise KeyError(f"Stage '{stage_name}' not found in graph")
    target = matches[0]

    assert target.stage_id is not None, f"Stage {stage_name} missing stage_id"

    # Only stages the index has already moved past count as completed
    finished_this_run = target.stage_id < self.current_stage_index
    if not finished_this_run:
        return False

    return target.has_review_data()
591
+
592
def get_stage(self, stage_name: str) -> BasePipelineStage:
    """Look up a stage instance by name (for review UI).

    Args:
        stage_name: Name of the stage

    Returns:
        The entry stored under stage_name in graph.get_all_stages()

    Raises:
        KeyError: If stage name not found in graph
    """
    stages_by_name = self.graph.get_all_stages()
    return stages_by_name[stage_name]