photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_benchmarks.py
@@ -1,355 +1,355 @@
- """PipelineStage for computing benchmark results."""
-
- from __future__ import annotations
-
- import json
- import time
- from collections.abc import Mapping
- from typing import Any, cast, get_args
-
- import pandas as pd
- import psutil
-
- from photo_compare import ComparisonMethodName, create_comparison_method
-
- from .benchmark_utils import cluster_pairs_for_scoring, generate_benchmark_pairs, post_analysis
- from .config import CONFIG
- from .photo_file import PhotoFile, load_normalized_pixels
- from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
- from .ports import InputPort
- from .sequence import PhotoSequence
-
- # Type definitions for this specific stage:
- type Pair = tuple[int, int]
- type Score = float
- type TimingStats = dict[str, float]  # {"prep_time": float, "compare_time": float, "prep_count": float}
- # S: Work Item (Method to run + Cluster of pairs + Photo paths for worker file access)
- # T: Worker Result Data (List of calculated scores + timing stats)
- # R: Accumulator (Mapping MethodName -> Pair -> Score + timing stats)
- type _S = tuple[ComparisonMethodName, list[Pair], dict[int, str]]
- type _T = tuple[list[tuple[ComparisonMethodName, Pair, Score]], TimingStats]
- type _R = tuple[
-     dict[ComparisonMethodName, dict[Pair, Score]],  # scores
-     dict[ComparisonMethodName, TimingStats],  # timing per method
- ]
-
- # Extract list of comparison methods from the Literal type
- COMPARISON_METHODS: list[ComparisonMethodName] = list(get_args(ComparisonMethodName))
-
-
- def get_available_memory() -> int:
-     """Get available system memory in bytes using psutil.
-
-     Returns:
-         Available memory in bytes
-     """
-     return psutil.virtual_memory().available
-
-
- def calculate_max_cluster_size(
-     num_workers: int | None = None,
-     max_prep_size: int = 8 * 1024 * 1024,  # 8 MB per photo
-     memory_fraction: float = 0.8,
- ) -> int:
-     """Calculate maximum photos per cluster based on available memory.
-
-     Formula: photos_per_cluster * prep_size * num_workers < available_memory
-
-     Args:
-         num_workers: Number of parallel worker processes (defaults to CONFIG.processing.MAX_WORKERS)
-         max_prep_size: Maximum memory per prepared photo (bytes)
-         memory_fraction: Fraction of available memory to use (0.0-1.0)
-
-     Returns:
-         Maximum number of photos allowed in a single cluster
-     """
-     if num_workers is None:
-         num_workers = CONFIG.processing.MAX_WORKERS
-
-     available_memory = get_available_memory()
-     usable_memory = int(available_memory * memory_fraction)
-     max_photos = usable_memory // (max_prep_size * num_workers)
-     return max(max_photos, 10)  # Minimum 10 photos per cluster
-
-
- class ComputeBenchmarks(PipelineStage[_S, _T, _R]):
-     """Pipeline stage for generating photo benchmark pairs, clustering them into work units.
-
-     Calculates scores using various comparison methods in parallel,
-     and performs a final analysis.
-     """
-
-     # --- Port Declarations ---
-     # Class attributes for InputPorts and OutputPorts
-     forest_i: InputPort[list[PhotoSequence]]
-     photofiles_i: InputPort[Mapping[int, PhotoFile]]
-
-     # --- Data Storage for Finalise ---
-     positive_pairs: list[Pair]
-     different_pairs: list[Pair]
-
-     def __init__(self) -> None:
-         """Initialize the benchmark stage."""
-         super().__init__(
-             path=CONFIG.paths.benchmark_scores_pkl,
-             stage_name="Photo Comparison Benchmark",
-         )
-
-         # Initialize instance attributes
-         self.positive_pairs = []
-         self.different_pairs = []
-         # Result is a tuple: (scores_dict, timing_dict)
-         self.result: _R = ({}, {})
-         self.args = ""  # Not strictly necessary for this stage
-
-         # Define input ports (InputPort only needs a name)
-         self.forest_i = InputPort("forest_data")
-         self.photofiles_i = InputPort("photofiles_map")
-
-     def prepare(self) -> PrepareResult[_S, _R]:
-         """Generate benchmark pairs, cluster them, and create work units.
-
-         1. Generate Similar/Dissimilar pairs from the forest.
-         2. Cluster the pairs into connected components of limited size.
-         3. Create work units: (ComparisonMethodName, ClusterOfPairs, PhotoPaths).
-
-         Returns:
-             Tuple of (work_units, initial_accumulator).
-         """
-         # --- 1. Load Inputs ---
-         # InputPort.read() is used to read data from upstream stages
-         forest: list[PhotoSequence] = self.forest_i.read()
-         photofiles: Mapping[int, PhotoFile] = self.photofiles_i.read()
-
-         # Get reference counts from upstream for UI statistics tracking
-         self.ref_photos_init = self.forest_i.get_ref_photo_count()
-         self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
-
-         # --- 2. Generate Pairs ---
-         n_different = CONFIG.benchmark.N_DIFFERENT_PAIRS
-         seed = CONFIG.processing.DEFAULT_RANDOM_SEED
-
-         # Calculate max cluster size based on available memory
-         # Uses CONFIG.processing.MAX_WORKERS
-         max_cluster_size = calculate_max_cluster_size(
-             max_prep_size=CONFIG.benchmark.MAX_PREP_SIZE_BYTES,
-             memory_fraction=CONFIG.benchmark.MEMORY_FRACTION,
-         )
-
-         self.positive_pairs, self.different_pairs, unique_ids = generate_benchmark_pairs(
-             forest=forest,
-             n_different=n_different,
-             seed=seed,
-         )
-
-         # Store for use in finalise()
-         self.positive_pairs: list[Pair] = self.positive_pairs
-         self.different_pairs: list[Pair] = self.different_pairs
-
-         all_pairs: list[Pair] = self.positive_pairs + self.different_pairs
-
-         # --- 3. Cluster Pairs ---
-         cluster_list = cluster_pairs_for_scoring(
-             pairs=all_pairs,
-             max_cluster_size=max_cluster_size,
-         )
-
-         # --- 4. Build photo_paths dict for worker file access ---
-         photo_paths: dict[int, str] = {photo_id: str(photofiles[photo_id].path) for photo_id in unique_ids}
-
-         # --- 5. Create Work Units ---
-         work_units: list[_S] = []
-         for method in COMPARISON_METHODS:
-             for _, cluster_pairs in cluster_list:
-                 work_units.append((method, cluster_pairs, photo_paths))
-
-         # --- 6. Initialize Accumulator ---
-         # Accumulator is a tuple: (scores_dict, timing_dict)
-         scores_dict: dict[ComparisonMethodName, dict[Pair, Score]] = {}
-         timing_dict: dict[ComparisonMethodName, TimingStats] = {}
-
-         for method in COMPARISON_METHODS:
-             scores_dict[method] = {}
-             timing_dict[method] = {
-                 "prep_time": 0.0,
-                 "compare_time": 0.0,
-                 "prep_count": 0.0,
-             }
-
-         initial_accum: _R = (scores_dict, timing_dict)
-
-         return work_units, initial_accum
-
-     @classmethod
-     def stage_worker(
-         cls,
-         job: _S,
-         _args: str,
-     ) -> WorkerResult[_T]:
-         """Process an individual work unit: calculate scores for all pairs in a cluster.
-
-         Uses a single comparison method with lazy preparation and local caching:
-         - Photos are prepared on demand (first use)
-         - Prepared data is cached locally for reuse
-         - Minimizes redundant file I/O and preparation
-         - Measures preparation and comparison timing
-
-         Args:
-             job: 3-tuple (method_name, cluster_pairs, photo_paths)
-             _args: Unused worker arguments
-
-         Returns:
-             WorkerResult with calculated scores and timing statistics
-
-         Raises:
-             FileNotFoundError: If a photo file is missing (critical error, must surface)
-         """
-         method_name, cluster_pairs, photo_paths = job
-
-         # Create comparison method instance
-         method = create_comparison_method(method_name)
-
-         # Local cache for prepared photo data (photo_id -> prepared_data)
-         local_cache: dict[int, Any] = {}
-
-         calculated_scores: list[tuple[ComparisonMethodName, Pair, Score]] = []
-
-         # Track timing statistics
-         prep_time = 0.0
-         compare_time = 0.0
-         prep_count = 0
-
-         for a_id, b_id in cluster_pairs:
-             # Lazy preparation: prepare a photo only if it is not already cached
-             if a_id not in local_cache:
-                 t0 = time.perf_counter()
-                 pixels = load_normalized_pixels(photo_paths[a_id])
-                 local_cache[a_id] = method.prepare(pixels)
-                 prep_time += time.perf_counter() - t0
-                 prep_count += 1
-
-             if b_id not in local_cache:
-                 t0 = time.perf_counter()
-                 pixels = load_normalized_pixels(photo_paths[b_id])
-                 local_cache[b_id] = method.prepare(pixels)
-                 prep_time += time.perf_counter() - t0
-                 prep_count += 1
-
-             # Measure comparison time
-             t0 = time.perf_counter()
-             score: float = method.compare(local_cache[a_id], local_cache[b_id])
-             compare_time += time.perf_counter() - t0
-
-             calculated_scores.append((method_name, (a_id, b_id), score))
-
-         timing_stats: TimingStats = {
-             "prep_time": prep_time,
-             "compare_time": compare_time,
-             "prep_count": float(prep_count),
-         }
-
-         return [], [], (calculated_scores, timing_stats)
-
-     def accumulate_results(
-         self,
-         accum: _R,
-         job_result: _T,
-     ) -> None:
-         """Merge worker results into the main accumulator dictionary.
-
-         Accumulates both scores and timing statistics per comparison method.
-
-         Args:
-             accum: Tuple of (scores_dict, timing_dict)
-             job_result: Tuple of (scores_list, timing_stats)
-         """
-         scores_list, timing_stats = job_result
-         scores_dict, timing_dict = accum
-
-         # Early return if there are no scores to accumulate
-         if not scores_list:
-             return
-
-         # Accumulate scores
-         for method_name, pair, score in scores_list:
-             scores_dict[method_name][pair] = score
-
-         # Accumulate timing stats (sum across workers for each method);
-         # all scores in a work unit are for the same method
-         method_name = scores_list[0][0]
-         if method_name not in timing_dict:
-             timing_dict[method_name] = {
-                 "prep_time": 0.0,
-                 "compare_time": 0.0,
-                 "prep_count": 0.0,
-             }
-         timing_dict[method_name]["prep_time"] += timing_stats["prep_time"]
-         timing_dict[method_name]["compare_time"] += timing_stats["compare_time"]
-         timing_dict[method_name]["prep_count"] += timing_stats["prep_count"]
-
-     def finalise(self) -> None:
-         """Perform post-analysis and save results.
-
-         Includes calculating metrics, generating reports, and saving outputs,
-         as per the original benchmarks.py script. Also saves timing statistics.
-         """
-         # Extract scores and timing from the accumulator
-         scores_dict, timing_dict = self.result
-
-         # Update status with initial info
-         total_pairs = len(self.positive_pairs) + len(self.different_pairs)
-         if self._progress_tracker:
-             self._progress_tracker.set_status(f"Analyzing {total_pairs:,} pairs across {len(scores_dict)} methods...")
-
-         # Calculate derived timing metrics for each method
-         timing_summary = {}
-         for method_name, stats in timing_dict.items():
-             prep_time = stats["prep_time"]
-             compare_time = stats["compare_time"]
-             prep_count = int(stats["prep_count"])
-             num_pairs = len(scores_dict.get(method_name, {}))
-
-             # Calculate derived metrics
-             timing_summary[method_name] = {
-                 "prep_time_seconds": prep_time,
-                 "compare_time_seconds": compare_time,
-                 "total_time_seconds": prep_time + compare_time,
-                 "prep_count": prep_count,
-                 "num_pairs": num_pairs,
-                 "prep_time_per_photo_ms": (prep_time / prep_count * 1000) if prep_count > 0 else 0.0,
-                 "compare_time_per_pair_us": (compare_time / num_pairs * 1_000_000) if num_pairs > 0 else 0.0,
-                 "photos_per_second": prep_count / prep_time if prep_time > 0 else 0.0,
-                 "comparisons_per_second": num_pairs / compare_time if compare_time > 0 else 0.0,
-             }
-
-         # Save timing data to JSON
-         timing_output_path = CONFIG.paths.work_dir / "benchmark_timing.json"
-         timing_output_path.parent.mkdir(parents=True, exist_ok=True)
-         with timing_output_path.open("w", encoding="utf-8") as f:
-             json.dump(timing_summary, f, indent=2)
-
-         # Perform the full analysis (pass only scores_dict)
-         # Cast to the type expected by post_analysis (dict[str, dict[Pair, Score]])
-         post_analysis(
-             final_scores=cast(dict[str, dict[Pair, Score]], scores_dict),
-             positive_pairs=self.positive_pairs,  # Stored in prepare()
-             different_pairs=self.different_pairs,  # Stored in prepare()
-             output_dir=CONFIG.paths.work_dir,
-         )
-
-         # Read back the best results and update status with findings
-         metrics_path = CONFIG.paths.work_dir / "method_metrics.csv"
-         if metrics_path.exists():
-             df_metrics = pd.read_csv(metrics_path, index_col=0)
-             best_method = df_metrics["f1"].idxmax()
-             best_f1 = df_metrics.loc[best_method, "f1"]
-             best_auc = df_metrics.loc[best_method, "auc"]
-
-             if self._progress_tracker:
-                 self._progress_tracker.set_status(
-                     f"Best: {best_method} (F1={best_f1:.4f}, AUC={best_auc:.4f}) | {total_pairs:,} pairs tested"
-                 )
-
-         # Update stage statistics (required by BasePipelineStage)
-         self.ref_photos_final = self.ref_photos_init
-         self.ref_seqs_final = self.ref_seqs_init
+ """PipelineStage for computing benchmark results."""
+
+ from __future__ import annotations
+
+ import json
+ import time
+ from collections.abc import Mapping
+ from typing import Any, cast, get_args
+
+ import pandas as pd
+ import psutil
+
+ from photo_compare import ComparisonMethodName, create_comparison_method
+
+ from .benchmark_utils import cluster_pairs_for_scoring, generate_benchmark_pairs, post_analysis
+ from .config import CONFIG
+ from .photo_file import PhotoFile, load_normalized_pixels
+ from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
+ from .ports import InputPort
+ from .sequence import PhotoSequence
+
+ # Type definitions for this specific stage:
+ type Pair = tuple[int, int]
+ type Score = float
+ type TimingStats = dict[str, float]  # {"prep_time": float, "compare_time": float, "prep_count": float}
+ # S: Work Item (Method to run + Cluster of pairs + Photo paths for worker file access)
+ # T: Worker Result Data (List of calculated scores + timing stats)
+ # R: Accumulator (Mapping MethodName -> Pair -> Score + timing stats)
+ type _S = tuple[ComparisonMethodName, list[Pair], dict[int, str]]
+ type _T = tuple[list[tuple[ComparisonMethodName, Pair, Score]], TimingStats]
+ type _R = tuple[
+     dict[ComparisonMethodName, dict[Pair, Score]],  # scores
+     dict[ComparisonMethodName, TimingStats],  # timing per method
+ ]
+
+ # Extract list of comparison methods from the Literal type
+ COMPARISON_METHODS: list[ComparisonMethodName] = list(get_args(ComparisonMethodName))
+
+
+ def get_available_memory() -> int:
+     """Get available system memory in bytes using psutil.
+
+     Returns:
+         Available memory in bytes
+     """
+     return psutil.virtual_memory().available
+
+
+ def calculate_max_cluster_size(
+     num_workers: int | None = None,
+     max_prep_size: int = 8 * 1024 * 1024,  # 8 MB per photo
+     memory_fraction: float = 0.8,
+ ) -> int:
+     """Calculate maximum photos per cluster based on available memory.
+
+     Formula: photos_per_cluster * prep_size * num_workers < available_memory
+
+     Args:
+         num_workers: Number of parallel worker processes (defaults to CONFIG.processing.MAX_WORKERS)
+         max_prep_size: Maximum memory per prepared photo (bytes)
+         memory_fraction: Fraction of available memory to use (0.0-1.0)
+
+     Returns:
+         Maximum number of photos allowed in a single cluster
+     """
+     if num_workers is None:
+         num_workers = CONFIG.processing.MAX_WORKERS
+
+     available_memory = get_available_memory()
+     usable_memory = int(available_memory * memory_fraction)
+     max_photos = usable_memory // (max_prep_size * num_workers)
+     return max(max_photos, 10)  # Minimum 10 photos per cluster
+
+
+ class ComputeBenchmarks(PipelineStage[_S, _T, _R]):
+     """Pipeline stage for generating photo benchmark pairs, clustering them into work units.
+
+     Calculates scores using various comparison methods in parallel,
+     and performs a final analysis.
+     """
+
+     # --- Port Declarations ---
+     # Class attributes for InputPorts and OutputPorts
+     forest_i: InputPort[list[PhotoSequence]]
+     photofiles_i: InputPort[Mapping[int, PhotoFile]]
+
+     # --- Data Storage for Finalise ---
+     positive_pairs: list[Pair]
+     different_pairs: list[Pair]
+
+     def __init__(self) -> None:
+         """Initialize the benchmark stage."""
+         super().__init__(
+             path=CONFIG.paths.benchmark_scores_pkl,
+             stage_name="Photo Comparison Benchmark",
+         )
+
+         # Initialize instance attributes
+         self.positive_pairs = []
+         self.different_pairs = []
+         # Result is a tuple: (scores_dict, timing_dict)
+         self.result: _R = ({}, {})
+         self.args = ""  # Not strictly necessary for this stage
+
+         # Define input ports (InputPort only needs a name)
+         self.forest_i = InputPort("forest_data")
+         self.photofiles_i = InputPort("photofiles_map")
+
+     def prepare(self) -> PrepareResult[_S, _R]:
+         """Generate benchmark pairs, cluster them, and create work units.
+
+         1. Generate Similar/Dissimilar pairs from the forest.
+         2. Cluster the pairs into connected components of limited size.
+         3. Create work units: (ComparisonMethodName, ClusterOfPairs, PhotoPaths).
+
+         Returns:
+             Tuple of (work_units, initial_accumulator).
+         """
+         # --- 1. Load Inputs ---
+         # InputPort.read() is used to read data from upstream stages
+         forest: list[PhotoSequence] = self.forest_i.read()
+         photofiles: Mapping[int, PhotoFile] = self.photofiles_i.read()
+
+         # Get reference counts from upstream for UI statistics tracking
+         self.ref_photos_init = self.forest_i.get_ref_photo_count()
+         self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
+
+         # --- 2. Generate Pairs ---
+         n_different = CONFIG.benchmark.N_DIFFERENT_PAIRS
+         seed = CONFIG.processing.DEFAULT_RANDOM_SEED
+
+         # Calculate max cluster size based on available memory
+         # Uses CONFIG.processing.MAX_WORKERS
+         max_cluster_size = calculate_max_cluster_size(
+             max_prep_size=CONFIG.benchmark.MAX_PREP_SIZE_BYTES,
+             memory_fraction=CONFIG.benchmark.MEMORY_FRACTION,
+         )
+
+         self.positive_pairs, self.different_pairs, unique_ids = generate_benchmark_pairs(
+             forest=forest,
+             n_different=n_different,
+             seed=seed,
+         )
+
+         # Store for use in finalise()
+         self.positive_pairs: list[Pair] = self.positive_pairs
+         self.different_pairs: list[Pair] = self.different_pairs
+
+         all_pairs: list[Pair] = self.positive_pairs + self.different_pairs
+
+         # --- 3. Cluster Pairs ---
+         cluster_list = cluster_pairs_for_scoring(
+             pairs=all_pairs,
+             max_cluster_size=max_cluster_size,
+         )
+
+         # --- 4. Build photo_paths dict for worker file access ---
+         photo_paths: dict[int, str] = {photo_id: str(photofiles[photo_id].path) for photo_id in unique_ids}
+
+         # --- 5. Create Work Units ---
+         work_units: list[_S] = []
+         for method in COMPARISON_METHODS:
+             for _, cluster_pairs in cluster_list:
+                 work_units.append((method, cluster_pairs, photo_paths))
+
+         # --- 6. Initialize Accumulator ---
+         # Accumulator is a tuple: (scores_dict, timing_dict)
+         scores_dict: dict[ComparisonMethodName, dict[Pair, Score]] = {}
+         timing_dict: dict[ComparisonMethodName, TimingStats] = {}
+
+         for method in COMPARISON_METHODS:
+             scores_dict[method] = {}
+             timing_dict[method] = {
+                 "prep_time": 0.0,
+                 "compare_time": 0.0,
+                 "prep_count": 0.0,
+             }
+
+         initial_accum: _R = (scores_dict, timing_dict)
+
+         return work_units, initial_accum
+
+     @classmethod
+     def stage_worker(
+         cls,
+         job: _S,
+         _args: str,
+     ) -> WorkerResult[_T]:
+         """Process an individual work unit: calculate scores for all pairs in a cluster.
+
+         Uses a single comparison method with lazy preparation and local caching:
+         - Photos are prepared on demand (first use)
+         - Prepared data is cached locally for reuse
+         - Minimizes redundant file I/O and preparation
+         - Measures preparation and comparison timing
+
+         Args:
+             job: 3-tuple (method_name, cluster_pairs, photo_paths)
+             _args: Unused worker arguments
+
+         Returns:
+             WorkerResult with calculated scores and timing statistics
+
+         Raises:
+             FileNotFoundError: If a photo file is missing (critical error, must surface)
+         """
+         method_name, cluster_pairs, photo_paths = job
+
+         # Create comparison method instance
+         method = create_comparison_method(method_name)
+
+         # Local cache for prepared photo data (photo_id -> prepared_data)
+         local_cache: dict[int, Any] = {}
+
+         calculated_scores: list[tuple[ComparisonMethodName, Pair, Score]] = []
+
+         # Track timing statistics
+         prep_time = 0.0
+         compare_time = 0.0
+         prep_count = 0
+
+         for a_id, b_id in cluster_pairs:
+             # Lazy preparation: prepare a photo only if it is not already cached
+             if a_id not in local_cache:
+                 t0 = time.perf_counter()
+                 pixels = load_normalized_pixels(photo_paths[a_id])
+                 local_cache[a_id] = method.prepare(pixels)
+                 prep_time += time.perf_counter() - t0
+                 prep_count += 1
+
+             if b_id not in local_cache:
+                 t0 = time.perf_counter()
+                 pixels = load_normalized_pixels(photo_paths[b_id])
+                 local_cache[b_id] = method.prepare(pixels)
+                 prep_time += time.perf_counter() - t0
+                 prep_count += 1
+
+             # Measure comparison time
+             t0 = time.perf_counter()
+             score: float = method.compare(local_cache[a_id], local_cache[b_id])
+             compare_time += time.perf_counter() - t0
+
+             calculated_scores.append((method_name, (a_id, b_id), score))
+
+         timing_stats: TimingStats = {
+             "prep_time": prep_time,
+             "compare_time": compare_time,
+             "prep_count": float(prep_count),
+         }
+
+         return [], [], (calculated_scores, timing_stats)
+
+     def accumulate_results(
+         self,
+         accum: _R,
+         job_result: _T,
+     ) -> None:
+         """Merge worker results into the main accumulator dictionary.
+
+         Accumulates both scores and timing statistics per comparison method.
+
+         Args:
+             accum: Tuple of (scores_dict, timing_dict)
+             job_result: Tuple of (scores_list, timing_stats)
+         """
+         scores_list, timing_stats = job_result
+         scores_dict, timing_dict = accum
+
+         # Early return if there are no scores to accumulate
+         if not scores_list:
+             return
+
+         # Accumulate scores
+         for method_name, pair, score in scores_list:
+             scores_dict[method_name][pair] = score
+
+         # Accumulate timing stats (sum across workers for each method);
+         # all scores in a work unit are for the same method
+         method_name = scores_list[0][0]
+         if method_name not in timing_dict:
+             timing_dict[method_name] = {
+                 "prep_time": 0.0,
+                 "compare_time": 0.0,
+                 "prep_count": 0.0,
+             }
+         timing_dict[method_name]["prep_time"] += timing_stats["prep_time"]
+         timing_dict[method_name]["compare_time"] += timing_stats["compare_time"]
+         timing_dict[method_name]["prep_count"] += timing_stats["prep_count"]
+
+     def finalise(self) -> None:
+         """Perform post-analysis and save results.
+
+         Includes calculating metrics, generating reports, and saving outputs,
+         as per the original benchmarks.py script. Also saves timing statistics.
+         """
+         # Extract scores and timing from the accumulator
+         scores_dict, timing_dict = self.result
+
+         # Update status with initial info
+         total_pairs = len(self.positive_pairs) + len(self.different_pairs)
+         if self._progress_tracker:
+             self._progress_tracker.set_status(f"Analyzing {total_pairs:,} pairs across {len(scores_dict)} methods...")
+
+         # Calculate derived timing metrics for each method
+         timing_summary = {}
+         for method_name, stats in timing_dict.items():
+             prep_time = stats["prep_time"]
+             compare_time = stats["compare_time"]
+             prep_count = int(stats["prep_count"])
+             num_pairs = len(scores_dict.get(method_name, {}))
+
+             # Calculate derived metrics
+             timing_summary[method_name] = {
+                 "prep_time_seconds": prep_time,
+                 "compare_time_seconds": compare_time,
+                 "total_time_seconds": prep_time + compare_time,
+                 "prep_count": prep_count,
+                 "num_pairs": num_pairs,
+                 "prep_time_per_photo_ms": (prep_time / prep_count * 1000) if prep_count > 0 else 0.0,
+                 "compare_time_per_pair_us": (compare_time / num_pairs * 1_000_000) if num_pairs > 0 else 0.0,
+                 "photos_per_second": prep_count / prep_time if prep_time > 0 else 0.0,
+                 "comparisons_per_second": num_pairs / compare_time if compare_time > 0 else 0.0,
+             }
+
+         # Save timing data to JSON
+         timing_output_path = CONFIG.paths.work_dir / "benchmark_timing.json"
+         timing_output_path.parent.mkdir(parents=True, exist_ok=True)
+         with timing_output_path.open("w", encoding="utf-8") as f:
+             json.dump(timing_summary, f, indent=2)
+
+         # Perform the full analysis (pass only scores_dict)
+         # Cast to the type expected by post_analysis (dict[str, dict[Pair, Score]])
+         post_analysis(
+             final_scores=cast(dict[str, dict[Pair, Score]], scores_dict),
+             positive_pairs=self.positive_pairs,  # Stored in prepare()
+             different_pairs=self.different_pairs,  # Stored in prepare()
+             output_dir=CONFIG.paths.work_dir,
+         )
+
+         # Read back the best results and update status with findings
+         metrics_path = CONFIG.paths.work_dir / "method_metrics.csv"
+         if metrics_path.exists():
+             df_metrics = pd.read_csv(metrics_path, index_col=0)
+             best_method = df_metrics["f1"].idxmax()
+             best_f1 = df_metrics.loc[best_method, "f1"]
+             best_auc = df_metrics.loc[best_method, "auc"]
+
+             if self._progress_tracker:
+                 self._progress_tracker.set_status(
+                     f"Best: {best_method} (F1={best_f1:.4f}, AUC={best_auc:.4f}) | {total_pairs:,} pairs tested"
+                 )
+
+         # Update stage statistics (required by BasePipelineStage)
+         self.ref_photos_final = self.ref_photos_init
+         self.ref_seqs_final = self.ref_seqs_init
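
A note on the memory-based sizing in calculate_max_cluster_size above: it bounds photos_per_cluster * max_prep_size * num_workers by a fraction of currently available memory. The sketch below re-derives the same arithmetic in isolation so the numbers are easy to check; it assumes only that psutil is installed, and the 8 MB prep size and 0.8 fraction mirror the defaults shown in the diff.

import psutil

def sketch_max_cluster_size(num_workers: int = 8,
                            max_prep_size: int = 8 * 1024 * 1024,  # 8 MB per prepared photo
                            memory_fraction: float = 0.8) -> int:
    # Worst case: every worker holds one full cluster of prepared photos at once,
    # so photos_per_cluster * max_prep_size * num_workers must fit in the budget.
    usable = int(psutil.virtual_memory().available * memory_fraction)
    return max(usable // (max_prep_size * num_workers), 10)  # floor of 10, as above

# With ~16 GiB available and 8 workers: 0.8 * 16 GiB ≈ 13.7 GB,
# and 13.7 GB / (8 MiB * 8 workers) ≈ 204 photos per cluster.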
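
The worker loop's lazy prepare-and-cache pattern also generalizes beyond this stage: each photo in a cluster is prepared at most once, no matter how many pairs reference it. A minimal sketch, with method and load_pixels as hypothetical stand-ins for the package's comparison-method objects and load_normalized_pixels:

import time
from typing import Any, Callable

def score_cluster(method: Any,
                  pairs: list[tuple[int, int]],
                  paths: dict[int, str],
                  load_pixels: Callable[[str], Any]) -> tuple[dict[tuple[int, int], float], dict[str, float]]:
    cache: dict[int, Any] = {}  # photo_id -> prepared representation
    prep_time = compare_time = 0.0
    scores: dict[tuple[int, int], float] = {}
    for a, b in pairs:
        for pid in (a, b):
            if pid not in cache:  # prepare each photo at most once per cluster
                t0 = time.perf_counter()
                cache[pid] = method.prepare(load_pixels(paths[pid]))
                prep_time += time.perf_counter() - t0
        t0 = time.perf_counter()
        scores[(a, b)] = method.compare(cache[a], cache[b])
        compare_time += time.perf_counter() - t0
    return scores, {"prep_time": prep_time, "compare_time": compare_time}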
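
Finally, the derived rates written to benchmark_timing.json follow directly from the accumulated totals. A hedged restatement of the arithmetic in finalise(), with the same zero-division guards as the diff:

def derive_rates(prep_time: float, compare_time: float,
                 prep_count: int, num_pairs: int) -> dict[str, float]:
    # Totals in, per-item rates out; guards return 0.0 when a denominator is zero.
    return {
        "prep_time_per_photo_ms": prep_time / prep_count * 1e3 if prep_count > 0 else 0.0,
        "compare_time_per_pair_us": compare_time / num_pairs * 1e6 if num_pairs > 0 else 0.0,
        "photos_per_second": prep_count / prep_time if prep_time > 0 else 0.0,
        "comparisons_per_second": num_pairs / compare_time if compare_time > 0 else 0.0,
    }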