photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/pipeline_stage.py CHANGED
@@ -1,408 +1,408 @@
"""Base class for all pipeline stages."""

from __future__ import annotations

import os
import pickle
import tempfile
from abc import abstractmethod
from collections.abc import Iterable, Iterator, Sized
from pathlib import Path
from typing import cast, final

from joblib import Parallel, delayed

from .base_pipeline_stage import BasePipelineStage
from .config import CONFIG
from .graph_context import get_active_graph
from .logger import get_logger
from .models import IdenticalGroup, ReviewType, SequenceGroup
from .progress import ProgressTracker

# Type aliases for pipeline stage method signatures (must be defined outside class)
type PrepareResult[S, R] = tuple[Iterable[S], R]
type WorkerResult[T] = tuple[list[IdenticalGroup], list[SequenceGroup], T]


class PipelineStage[S, T, R](BasePipelineStage):
    """Abstract base class for pipeline stages with parallel processing support.

    Stages define three abstract methods for parallel execution:
    - prepare(): Set up work items and result accumulator
    - stage_worker(): Process individual work items (runs in parallel)
    - accumulate_results(): Merge worker results into final output

    Type safety is provided by port declarations (InputPort[T], OutputPort[T])
    rather than class-level generic parameters.

    Review Data Architecture:
    - result: Working data that flows through pipeline (may be nested)
    - review_result: Pre-computed review data for UI (always flat, per-stage)
    """

    result: R

    def __init__(
        self,
        path: Path,
        stage_name: str,
    ):
        """Initialize pipeline stage with output path and name.

        Args:
            path: Path where stage results will be cached
            stage_name: Human-readable name for progress tracking
        """
        super().__init__(path, stage_name)

        # Auto-register with active graph if within PipelineBuilder context
        active_graph = get_active_graph()
        if active_graph is not None:
            active_graph.add_node(self)

    @abstractmethod
    def prepare(self) -> PrepareResult[S, R]:
        """Prepare input for parallel processing by reading from input ports.

        This method reads data from input ports and prepares work items for
        parallel processing.

        Returns:
            Tuple of (work_items, accumulator) where work_items will be processed
            in parallel and results accumulated into accumulator
        """
        ...

    @staticmethod
    @abstractmethod
    def stage_worker(job: S, args: str) -> WorkerResult[T]:
        """This method performs the core, isolated, and concurrent work."""
        ...

    @abstractmethod
    def accumulate_results(self, result: R, job: T) -> None:
        """Accumulate worker result into final output.

        Args:
            result: Accumulator to update (returned from prepare)
            job: Result from stage_worker to incorporate
        """
        ...

    # === Review Interface (for orchestrator discovery) ===

    def needs_review(self) -> ReviewType:
        """Discover what type of review this stage produces.

        This allows the orchestrator to dynamically discover which stages
        produce reviewable output without hard-coding stage names.

        Returns:
            - "none": No reviewable output (default)
            - "photos": Produces photo groups (byte-identical duplicates)
            - "sequences": Produces sequence groups (similar sequences)
        """
        return "none"

    def _unpack_cache(
        self,
        loaded_cache: (
            tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None]
            | tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None]
        ),
    ) -> None:
        """Unpack cache tuple with backward compatibility.

        Handles both the old 5-element format (before performance metrics) and
        the new 7-element format.
        """
        if len(loaded_cache) == 5:
            # Old format: (result, seq_review, id_review, ref_photos, ref_seqs)
            (
                self.result,
                self.sequence_review_result,
                self.identical_review_result,
                self.ref_photos_final,
                self.ref_seqs_final,
            ) = loaded_cache
            self.elapsed_seconds = None
            self.throughput = None
        else:
            # New format: (result, seq_review, id_review, ref_photos, ref_seqs, elapsed, throughput)
            (
                self.result,
                self.sequence_review_result,
                self.identical_review_result,
                self.ref_photos_final,
                self.ref_seqs_final,
                self.elapsed_seconds,
                self.throughput,
            ) = loaded_cache

    def has_review_data(self) -> bool:
        """Check if review data is available for this stage.

        Returns:
            True if stage has completed and has reviewable data available
        """
        return self.needs_review() != "none"

    def batch_compute(self, work: Iterable[S], args: str) -> Iterator[WorkerResult[T]]:
        """Orchestrates parallel or sequential processing with graceful shutdown support.

        This implementation uses Joblib for efficient parallel processing with automatic
        load balancing. The batch_size="auto" allows Joblib to optimize batching based
        on measured task duration, which works well with sorted work items.

        IMPORTANT: This function preserves lazy evaluation of iterators. If work is a
        generator/iterator, it will be consumed lazily without materializing the entire
        sequence in memory. Progress tracking will show indeterminate progress (no total).

        Graceful Shutdown: When SIGINT (Ctrl+C) is received, joblib workers are terminated
        and this method catches KeyboardInterrupt, logging the cancellation and returning
        cleanly. This prevents ShutdownExecutorError from occurring.

        Args:
            work: An iterable of work items (e.g., file paths) to be processed.
            args: Arguments to pass to the stage worker function.

        Yields:
            WorkerResult tuple containing (identical_groups, sequence_groups, work_data)
        """
        # Check if work supports len() without materializing
        # Use hasattr to avoid forcing evaluation of generators
        total_count: int | None
        try:
            # Cast to Sized since we know it has __len__ (hasattr check)
            total_count = len(cast(Sized, work)) if hasattr(work, "__len__") else None
        # This is a best effort so ok to handle any exceptions
        except:  # noqa: E722
            # Some iterables don't support len() even with __len__ attribute
            total_count = None

        with ProgressTracker(self.stage_name, total=total_count) as progress:
            # Expose progress tracker for UI polling (see get_progress())
            self._progress_tracker = progress

            try:
                if CONFIG.processing.DEBUG_MODE:
                    # Sequential processing for debugging
                    for j in work:
                        r = self.__class__.stage_worker(j, args)
                        yield r
                        progress.update()
                else:
                    # Log worker configuration
                    logger = get_logger()
                    logger.info(f"{self.stage_name}: Using {CONFIG.processing.MAX_WORKERS} parallel workers")

                    # Parallel processing with Joblib
                    # batch_size="auto" lets Joblib optimize batching automatically
                    # return_as="generator_unordered" provides streaming results
                    # IMPORTANT: Pass work directly (don't materialize) - Joblib handles iterators
                    results = Parallel(
                        n_jobs=CONFIG.processing.MAX_WORKERS,
                        backend="loky",  # Robust process-based backend
                        prefer="processes",  # Good for jobs with variable time requirements
                        batch_size="auto",  # Automatic batch size optimization
                        return_as="generator_unordered",  # Stream results as they complete
                    )(delayed(self.__class__.stage_worker)(item, args) for item in work)

                    # Yield results as they complete with progress tracking
                    for result in results:
                        yield result
                        progress.update()

            except KeyboardInterrupt:
                # SIGINT received (Ctrl+C or shutdown endpoint called)
                # Joblib workers have been terminated by the signal
                # Exit cleanly without trying to dispatch more work
                logger = get_logger()
                logger.info(f"{self.stage_name}: Received shutdown signal, stopping batch processing")
                return  # Exit generator cleanly (don't re-raise)

            finally:
                # Don't clear tracker here - let orchestrator clear after marking complete
                # This keeps progress visible during finalise() and cache save
                pass

    # ========================================================================
    # Port-Based Connectivity (Phase 1: Infrastructure - Optional for now)
    # ========================================================================
    #
    # Stages declare typed ports explicitly in __init__ method:
    # - Input ports: InputPort[Type] for receiving data from upstream stages
    # - Output ports: OutputPort[Type] for sending data to downstream stages
    #
    # Graph builder wires ports explicitly with compile-time type checking
    # by binding consumer input ports to producer output ports.
    #
    # No dynamic discovery needed - static types ensure correctness.

    # ============================================================================
    # DO NOT REMOVE @final DECORATOR - PREVENTS ARCHITECTURAL VIOLATIONS
    # ============================================================================
    # The @final decorator prevents subclasses from overriding run().
    # This is CRITICAL to maintain separation of concerns:
    #
    # - run() handles: caching, batch processing, logging, result storage
    # - prepare() handles: reading inputs from ports, stage-specific logic
    #
    # Previously, stages overrode run() which led to:
    # - Redundant _execute_impl() pattern (removed in refactor)
    # - Duplicate code across stages
    # - Broken caching when stages forgot to call parent
    # - 149 lines of unnecessary complexity
    #
    # If you need stage-specific behavior, override prepare().
    # NEVER remove @final and override run() - this violates the pattern.
    # ============================================================================
    @final
    def run(self) -> None:
        """Execute pipeline stage with dependency-aware caching support.

        This method is final and cannot be overridden by subclasses.

        If cached results exist and are newer than all dependencies, loads them
        into the instance. Otherwise, prepares work (reading from input ports),
        processes it in parallel, accumulates results, and saves them to cache.

        Stages must store their worker arguments in self.args during __init__.

        NEW: Also builds and caches review data alongside working data for stages
        that produce reviews (needs_review() != "none").

        Phase callbacks notify orchestrator of current execution phase:
        - cache_load: Loading results from cache
        - prepare: Reading inputs and setting up work
        - compute: Processing work items in parallel
        - finalise: Computing final statistics and validating results
        - save: Writing results to cache

        Results are stored on the instance rather than returned: self.result
        holds the accumulated working data for pipeline flow.
        """
        if self._cache_is_valid():
            # Notify phase: loading from cache
            if self._phase_callback:
                self._phase_callback("cache_load")

            # Load from cache and store in instance (tuple unpacking with type annotation)
            # Cache contains FINAL counts and performance metrics (what this stage produced)
            loaded_cache: (
                tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None]
                | tuple[
                    R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None
                ]
            ) = atomic_pickle_load(self.path)
            self._unpack_cache(loaded_cache)

            return

        # Not cached - compute result
        # Notify phase: preparing work
        if self._phase_callback:
            self._phase_callback("prepare")

        # prepare() reads from input ports and stores inputs as instance vars
        work: Iterable[S]
        work, result = self.prepare()

        # Notify phase: computing (parallel processing)
        if self._phase_callback:
            self._phase_callback("compute")

        # Process work items in parallel
        for r in self.batch_compute(work, self.stage_name):
            # Extract review and work data from worker result
            identical_review, sequence_review, work_item = r

            # Generic review accumulation (just extend lists, update dicts)
            self.identical_review_result.extend(identical_review)
            self.sequence_review_result.extend(sequence_review)

            # Stage-specific work accumulation
            self.accumulate_results(result, work_item)

        # Capture performance metrics from progress tracker (after context manager exits)
        if self._progress_tracker:
            self.elapsed_seconds = self._progress_tracker.elapsed_seconds
            self.throughput = self._progress_tracker.final_rate

        # Store result in instance BEFORE finalise (stages need to access self.result)
        self.result = result

        # Notify phase: finalizing results
        if self._phase_callback:
            self._phase_callback("finalise")

        # Update progress status for finalization
        if self._progress_tracker:
            self._progress_tracker.set_status("Finalizing results...")

        self.finalise()

        # Notify phase: saving to cache
        if self._phase_callback:
            self._phase_callback("save")

        # Update progress status for cache save
        if self._progress_tracker:
            self._progress_tracker.set_status("Saving to cache...")

        # Save working data, review data, ref counts, and performance metrics to cache (as typed tuple)
        # Save FINAL counts and metrics (set by finalise()) so downstream stages know what we produced
        saved_cache: tuple[
            R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None
        ] = (
            self.result,  # Use self.result (already assigned above)
            self.sequence_review_result,
            self.identical_review_result,
            self.ref_photos_final,  # Final photo count (what THIS stage produced)
            self.ref_seqs_final,  # Final sequence count (what THIS stage produced)
            self.elapsed_seconds,  # Stage execution time in seconds
            self.throughput,  # Items per second
        )

        atomic_pickle_dump(saved_cache, self.path)


def atomic_pickle_load[T](
    path: Path,
    expected_type: type[T] | None = None,  # noqa: ARG001
) -> T:
    """Load object from pickle file with optional type hint.

    Args:
        path: Path to pickle file
        expected_type: Optional type parameter for explicit type checking (unused at runtime,
            exists only to satisfy mypy's requirement that TypeVar appears in parameters)

    Returns:
        Unpickled object of type T

    Note:
        Type safety comes from explicit annotations at call sites:
        loaded: tuple[R, ...] = atomic_pickle_load(path)
    """
    with path.open("rb") as f:
        result: T = pickle.load(f)
    return result


def atomic_pickle_dump[T](obj: T, path: Path) -> None:
    """Write pickle file atomically using temp file + os.replace.

    Args:
        obj: Object to pickle
        path: Destination file path
    """
    # Write to temp file in same directory as target
    temp_fd, temp_path = tempfile.mkstemp(dir=path.parent, prefix=f".{path.name}.", suffix=".tmp")
    # os.fdopen returns BufferedWriter which is compatible with SupportsWrite[bytes]
    # Type checker is being overly strict here
    # Must use os.fdopen rather than Path.open otherwise file handles trip over each other
    with os.fdopen(temp_fd, "wb") as f:
        # noinspection PyTypeChecker
        pickle.dump(obj, f)
    # Atomic replace
    Path(temp_path).replace(path)
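
To make the three-method contract concrete, here is a minimal, hypothetical subclass sketch. The stage name, type parameters, and fixed work items are invented for illustration (real stages in this package read their work from input ports); it reuses the module's Path, PrepareResult, and WorkerResult names:

class WordCountStage(PipelineStage[Path, int, list[int]]):
    """Toy stage: S=Path (work item), T=int (worker output), R=list[int] (accumulator)."""

    def prepare(self) -> PrepareResult[Path, list[int]]:
        # A real stage would read these from an InputPort; hard-coded to stay self-contained
        work = [Path("a.txt"), Path("b.txt")]
        return work, []  # (work_items, accumulator)

    @staticmethod
    def stage_worker(job: Path, args: str) -> WorkerResult[int]:
        # The first two elements are review groups (none here); the third is the work data
        return [], [], len(job.read_text().split())

    def accumulate_results(self, result: list[int], job: int) -> None:
        result.append(job)  # merge one worker's output into the accumulator

# Hypothetical usage: the inherited, final run() drives
# prepare -> batch_compute -> accumulate_results -> finalise,
# then caches the result tuple via atomic_pickle_dump.
# WordCountStage(Path("wordcount.pkl"), "Word count").run()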
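
The atomic_pickle_dump / atomic_pickle_load pair implements the usual atomic-write pattern: write to a temporary file in the destination directory, then swap it into place. A round-trip sketch, assuming the wheel's top-level utils package is importable (the cache filename is made up):

from pathlib import Path

from utils.pipeline_stage import atomic_pickle_dump, atomic_pickle_load

cache = Path("stage.pkl")  # hypothetical cache location
atomic_pickle_dump({"photos": 3, "sequences": 1}, cache)  # readers never observe a partial file
loaded: dict[str, int] = atomic_pickle_load(cache)  # type comes from the call-site annotation
assert loaded == {"photos": 3, "sequences": 1}

Because Path.replace wraps os.replace, the final swap is atomic as long as the temporary file and the target live on the same filesystem, which is why mkstemp is pointed at path.parent.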