photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/pipeline_stage.py
CHANGED
@@ -1,408 +1,408 @@
"""Base class for all pipeline stages."""

from __future__ import annotations

import os
import pickle
import tempfile
from abc import abstractmethod
from collections.abc import Iterable, Iterator, Sized
from pathlib import Path
from typing import cast, final

from joblib import Parallel, delayed

from .base_pipeline_stage import BasePipelineStage
from .config import CONFIG
from .graph_context import get_active_graph
from .logger import get_logger
from .models import IdenticalGroup, ReviewType, SequenceGroup
from .progress import ProgressTracker

# Type aliases for pipeline stage method signatures (must be defined outside class)
type PrepareResult[S, R] = tuple[Iterable[S], R]
type WorkerResult[T] = tuple[list[IdenticalGroup], list[SequenceGroup], T]


class PipelineStage[S, T, R](BasePipelineStage):
    """Abstract base class for pipeline stages with parallel processing support.

    Stages define three abstract methods for parallel execution:
    - prepare(): Set up work items and result accumulator
    - stage_worker(): Process individual work items (runs in parallel)
    - accumulate_results(): Merge worker results into final output

    Type safety is provided by port declarations (InputPort[T], OutputPort[T])
    rather than class-level generic parameters.

    Review Data Architecture:
    - result: Working data that flows through pipeline (may be nested)
    - review_result: Pre-computed review data for UI (always flat, per-stage)
    """

    result: R

    def __init__(
        self,
        path: Path,
        stage_name: str,
    ):
        """Initialize pipeline stage with output path and name.

        Args:
            path: Path where stage results will be cached
            stage_name: Human-readable name for progress tracking
        """
        super().__init__(path, stage_name)

        # Auto-register with active graph if within PipelineBuilder context
        active_graph = get_active_graph()
        if active_graph is not None:
            active_graph.add_node(self)

    @abstractmethod
    def prepare(self) -> PrepareResult[S, R]:
        """Prepare input for parallel processing by reading from input ports.

        This method reads data from input ports and prepares work items for
        parallel processing.

        Returns:
            Tuple of (work_items, accumulator) where work_items will be processed
            in parallel and results accumulated into accumulator
        """
        ...

    @staticmethod
    @abstractmethod
    def stage_worker(job: S, args: str) -> WorkerResult[T]:
        """Perform the core, isolated, concurrent work for a single item."""
        ...

    @abstractmethod
    def accumulate_results(self, result: R, job: T) -> None:
        """Accumulate worker result into final output.

        Args:
            result: Accumulator to update (returned from prepare)
            job: Result from stage_worker to incorporate
        """
        ...

    # === Review Interface (for orchestrator discovery) ===

    def needs_review(self) -> ReviewType:
        """Discover what type of review this stage produces.

        This allows the orchestrator to dynamically discover which stages
        produce reviewable output without hard-coding stage names.

        Returns:
            - "none": No reviewable output (default)
            - "photos": Produces photo groups (byte-identical duplicates)
            - "sequences": Produces sequence groups (similar sequences)
        """
        return "none"

    def _unpack_cache(
        self,
        loaded_cache: (
            tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None]
            | tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None]
        ),
    ) -> None:
        """Unpack cache tuple with backward compatibility.

        Handles both the old 5-element format (before performance metrics) and
        the new 7-element format.
        """
        if len(loaded_cache) == 5:
            # Old format: (result, seq_review, id_review, ref_photos, ref_seqs)
            (
                self.result,
                self.sequence_review_result,
                self.identical_review_result,
                self.ref_photos_final,
                self.ref_seqs_final,
            ) = loaded_cache
            self.elapsed_seconds = None
            self.throughput = None
        else:
            # New format: (result, seq_review, id_review, ref_photos, ref_seqs, elapsed, throughput)
            (
                self.result,
                self.sequence_review_result,
                self.identical_review_result,
                self.ref_photos_final,
                self.ref_seqs_final,
                self.elapsed_seconds,
                self.throughput,
            ) = loaded_cache

    def has_review_data(self) -> bool:
        """Check if review data is available for this stage.

        Returns:
            True if stage has completed and has reviewable data available
        """
        return self.needs_review() != "none"

    def batch_compute(self, work: Iterable[S], args: str) -> Iterator[WorkerResult[T]]:
        """Orchestrate parallel or sequential processing with graceful shutdown support.

        This implementation uses Joblib for efficient parallel processing with automatic
        load balancing. The batch_size="auto" setting allows Joblib to optimize batching
        based on measured task duration, which works well with sorted work items.

        IMPORTANT: This function preserves lazy evaluation of iterators. If work is a
        generator/iterator, it will be consumed lazily without materializing the entire
        sequence in memory. Progress tracking will show indeterminate progress (no total).

        Graceful Shutdown: When SIGINT (Ctrl+C) is received, joblib workers are terminated
        and this method catches KeyboardInterrupt, logging the cancellation and returning
        cleanly. This prevents ShutdownExecutorError from occurring.

        Args:
            work: An iterable of work items (e.g., file paths) to be processed.
            args: Arguments to pass to the stage worker function.

        Yields:
            WorkerResult tuple containing (identical_groups, sequence_groups, work_data)
        """
        # Check if work supports len() without materializing
        # Use hasattr to avoid forcing evaluation of generators
        total_count: int | None
        try:
            # Cast to Sized since we know it has __len__ (hasattr check)
            total_count = len(cast(Sized, work)) if hasattr(work, "__len__") else None
        # This is a best effort, so it is ok to handle any exception
        except:  # noqa: E722
            # Some iterables don't support len() even with an __len__ attribute
            total_count = None

        with ProgressTracker(self.stage_name, total=total_count) as progress:
            # Expose progress tracker for UI polling (see get_progress())
            self._progress_tracker = progress

            try:
                if CONFIG.processing.DEBUG_MODE:
                    # Sequential processing for debugging
                    for j in work:
                        r = self.__class__.stage_worker(j, args)
                        yield r
                        progress.update()
                else:
                    # Log worker configuration
                    logger = get_logger()
                    logger.info(f"{self.stage_name}: Using {CONFIG.processing.MAX_WORKERS} parallel workers")

                    # Parallel processing with Joblib
                    # batch_size="auto" lets Joblib optimize batching automatically
                    # return_as="generator_unordered" streams results as workers finish
                    # IMPORTANT: Pass work directly (don't materialize) - Joblib handles iterators
                    results = Parallel(
                        n_jobs=CONFIG.processing.MAX_WORKERS,
                        backend="loky",  # Robust process-based backend
                        prefer="processes",  # Good for jobs with variable time requirements
                        batch_size="auto",  # Automatic batch size optimization
                        return_as="generator_unordered",  # Stream results as they complete
                    )(delayed(self.__class__.stage_worker)(item, args) for item in work)

                    # Yield results as they complete with progress tracking
                    for result in results:
                        yield result
                        progress.update()

            except KeyboardInterrupt:
                # SIGINT received (Ctrl+C or shutdown endpoint called)
                # Joblib workers have been terminated by the signal
                # Exit cleanly without trying to dispatch more work
                logger = get_logger()
                logger.info(f"{self.stage_name}: Received shutdown signal, stopping batch processing")
                return  # Exit generator cleanly (don't re-raise)

            finally:
                # Don't clear tracker here - let orchestrator clear after marking complete
                # This keeps progress visible during finalise() and cache save
                pass

    # ========================================================================
    # Port-Based Connectivity (Phase 1: Infrastructure - Optional for now)
    # ========================================================================
    #
    # Stages declare typed ports explicitly in __init__ method:
    # - Input ports: InputPort[Type] for receiving data from upstream stages
    # - Output ports: OutputPort[Type] for sending data to downstream stages
    #
    # Graph builder wires ports explicitly with compile-time type checking
    # by binding consumer input ports to producer output ports.
    #
    # No dynamic discovery needed - static types ensure correctness.

    # ============================================================================
    # DO NOT REMOVE @final DECORATOR - PREVENTS ARCHITECTURAL VIOLATIONS
    # ============================================================================
    # The @final decorator prevents subclasses from overriding run().
    # This is CRITICAL to maintain separation of concerns:
    #
    # - run() handles: caching, batch processing, logging, result storage
    # - prepare() handles: reading inputs from ports, stage-specific logic
    #
    # Previously, stages overrode run() which led to:
    # - Redundant _execute_impl() pattern (removed in refactor)
    # - Duplicate code across stages
    # - Broken caching when stages forgot to call parent
    # - 149 lines of unnecessary complexity
    #
    # If you need stage-specific behavior, override prepare().
    # NEVER remove @final and override run() - this violates the pattern.
    # ============================================================================
    @final
    def run(self) -> None:
        """Execute pipeline stage with dependency-aware caching support.

        This method is final and cannot be overridden by subclasses.

        If cached results exist and are newer than all dependencies, loads them.
        Otherwise, prepares work (reading from input ports), processes in parallel,
        accumulates results, saves to cache, and stores the final result on the
        instance (self.result) for pipeline flow.

        Stages must store their worker arguments in self.args during __init__.

        NEW: Also builds and caches review data alongside working data for stages
        that produce reviews (needs_review() != "none").

        Phase callbacks notify orchestrator of current execution phase:
        - cache_load: Loading results from cache
        - prepare: Reading inputs and setting up work
        - compute: Processing work items in parallel
        - finalise: Computing final statistics and validating results
        - save: Writing results to cache
        """
        if self._cache_is_valid():
            # Notify phase: loading from cache
            if self._phase_callback:
                self._phase_callback("cache_load")

            # Load from cache and store in instance (tuple unpacking with type annotation)
            # Cache contains FINAL counts and performance metrics (what this stage produced)
            loaded_cache: (
                tuple[R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None]
                | tuple[
                    R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None
                ]
            ) = atomic_pickle_load(self.path)
            self._unpack_cache(loaded_cache)

            return

        # Not cached - compute result
        # Notify phase: preparing work
        if self._phase_callback:
            self._phase_callback("prepare")

        # prepare() reads from input ports and stores inputs as instance vars
        work: Iterable[S]
        work, result = self.prepare()

        # Notify phase: computing (parallel processing)
        if self._phase_callback:
            self._phase_callback("compute")

        # Process work items in parallel
        for r in self.batch_compute(work, self.stage_name):
            # Extract review and work data from worker result
            identical_review, sequence_review, work_item = r

            # Generic review accumulation (just extend lists, update dicts)
            self.identical_review_result.extend(identical_review)
            self.sequence_review_result.extend(sequence_review)

            # Stage-specific work accumulation
            self.accumulate_results(result, work_item)

        # Capture performance metrics from progress tracker (after context manager exits)
        if self._progress_tracker:
            self.elapsed_seconds = self._progress_tracker.elapsed_seconds
            self.throughput = self._progress_tracker.final_rate

        # Store result in instance BEFORE finalise (stages need to access self.result)
        self.result = result

        # Notify phase: finalizing results
        if self._phase_callback:
            self._phase_callback("finalise")

        # Update progress status for finalization
        if self._progress_tracker:
            self._progress_tracker.set_status("Finalizing results...")

        self.finalise()

        # Notify phase: saving to cache
        if self._phase_callback:
            self._phase_callback("save")

        # Update progress status for cache save
        if self._progress_tracker:
            self._progress_tracker.set_status("Saving to cache...")

        # Save working data, review data, ref counts, and performance metrics to cache (as typed tuple)
        # Save FINAL counts and metrics (set by finalise()) so downstream stages know what we produced
        saved_cache: tuple[
            R, list[SequenceGroup], list[IdenticalGroup], int | None, int | None, float | None, float | None
        ] = (
            self.result,  # Use self.result (already assigned above)
            self.sequence_review_result,
            self.identical_review_result,
            self.ref_photos_final,  # Final photo count (what THIS stage produced)
            self.ref_seqs_final,  # Final sequence count (what THIS stage produced)
            self.elapsed_seconds,  # Stage execution time in seconds
            self.throughput,  # Items per second
        )

        atomic_pickle_dump(saved_cache, self.path)


def atomic_pickle_load[T](
    path: Path,
    expected_type: type[T] | None = None,  # noqa: ARG001
) -> T:
    """Load object from pickle file with optional type hint.

    Args:
        path: Path to pickle file
        expected_type: Optional type parameter for explicit type checking (unused at runtime,
            exists only to satisfy mypy's requirement that the TypeVar appears in parameters)

    Returns:
        Unpickled object of type T

    Note:
        Type safety comes from explicit annotations at call sites:
        loaded: tuple[R, ...] = atomic_pickle_load(path)
    """
    with path.open("rb") as f:
        result: T = pickle.load(f)
    return result


def atomic_pickle_dump[T](obj: T, path: Path) -> None:
    """Write pickle file atomically using temp file + os.replace.

    Args:
        obj: Object to pickle
        path: Destination file path
    """
    # Write to temp file in same directory as target
    temp_fd, temp_path = tempfile.mkstemp(dir=path.parent, prefix=f".{path.name}.", suffix=".tmp")
    # os.fdopen returns BufferedWriter, which is compatible with SupportsWrite[bytes];
    # the type checker is being overly strict here.
    # Must use os.fdopen rather than Path.open, otherwise file handles trip over each other.
    with os.fdopen(temp_fd, "wb") as f:
        # noinspection PyTypeChecker
        pickle.dump(obj, f)
    # Atomic replace
    Path(temp_path).replace(path)
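
To make the three-hook contract concrete, here is a minimal, hypothetical stage sketch. It assumes the wheel is installed so that utils.pipeline_stage is importable; the class name, glob pattern, and list accumulator are illustrative only, and a real stage would read its work items from input ports and implement any further BasePipelineStage hooks (such as finalise()) not shown here.

from pathlib import Path

from utils.pipeline_stage import PipelineStage, PrepareResult, WorkerResult


class FileSizeStage(PipelineStage[Path, int, list[int]]):
    """Hypothetical stage: measure file sizes in parallel (illustrative only)."""

    def prepare(self) -> PrepareResult[Path, list[int]]:
        # Work items plus an empty accumulator; a real stage reads its input ports.
        return sorted(Path("photos").glob("*.jpg")), []

    @staticmethod
    def stage_worker(job: Path, args: str) -> WorkerResult[int]:
        # Runs in a worker process; this stage produces no review groups.
        return [], [], job.stat().st_size

    def accumulate_results(self, result: list[int], job: int) -> None:
        result.append(job)

Keeping stage_worker a @staticmethod is what lets joblib pickle it and dispatch it to loky worker processes without shipping the whole stage instance across process boundaries; the inherited @final run() then drives prepare, batch_compute, and accumulate_results in order.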
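
The two module-level helpers pair into a simple cache round trip. A quick sketch, again assuming the utils.pipeline_stage import path; the filename and payload here are arbitrary:

from pathlib import Path

from utils.pipeline_stage import atomic_pickle_dump, atomic_pickle_load

cache = Path("stage_cache.pkl")
atomic_pickle_dump({"photos": 42, "sequences": 7}, cache)

# Type safety is by call-site annotation; the loader performs no runtime checks.
counts: dict[str, int] = atomic_pickle_load(cache)
assert counts == {"photos": 42, "sequences": 7}

Creating the temp file in path.parent is the load-bearing detail: Path.replace (os.replace) is atomic only when source and destination sit on the same filesystem, so a crash mid-write leaves either the previous cache or the complete new one, never a torn file.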