photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_benchmarks.py
CHANGED
@@ -1,355 +1,355 @@

The single hunk removes and re-adds all 355 lines of the file; as rendered, the removed and added contents are identical, so the file is listed once:

```python
"""PipelineStage for computing benchmark results."""

from __future__ import annotations

import json
import time
from collections.abc import Mapping
from typing import Any, cast, get_args

import pandas as pd
import psutil

from photo_compare import ComparisonMethodName, create_comparison_method

from .benchmark_utils import cluster_pairs_for_scoring, generate_benchmark_pairs, post_analysis
from .config import CONFIG
from .photo_file import PhotoFile, load_normalized_pixels
from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
from .ports import InputPort
from .sequence import PhotoSequence

# Type definitions for this specific stage:
type Pair = tuple[int, int]
type Score = float
type TimingStats = dict[str, float]  # {"prep_time": float, "compare_time": float, "prep_count": int}
# S: Work Item (Method to run + Cluster of pairs + Photo paths for worker file access)
# T: Worker Result Data (List of calculated scores + timing stats)
# R: Accumulator (Mapping MethodName -> Pair -> Score + timing stats)
type _S = tuple[ComparisonMethodName, list[Pair], dict[int, str]]
type _T = tuple[list[tuple[ComparisonMethodName, Pair, Score]], TimingStats]
type _R = tuple[
    dict[ComparisonMethodName, dict[Pair, Score]],  # scores
    dict[ComparisonMethodName, TimingStats],  # timing per method
]

# Extract list of comparison methods from the Literal type
COMPARISON_METHODS: list[ComparisonMethodName] = list(get_args(ComparisonMethodName))


def get_available_memory() -> int:
    """Get available system memory in bytes using psutil.

    Returns:
        Available memory in bytes
    """
    return psutil.virtual_memory().available


def calculate_max_cluster_size(
    num_workers: int | None = None,
    max_prep_size: int = 8 * 1024 * 1024,  # 8 MB per photo
    memory_fraction: float = 0.8,
) -> int:
    """Calculate maximum photos per cluster based on available memory.

    Formula: photos_per_cluster * prep_size * num_workers < available_memory

    Args:
        num_workers: Number of parallel worker processes (defaults to CONFIG.processing.MAX_WORKERS)
        max_prep_size: Maximum memory per prepared photo (bytes)
        memory_fraction: Fraction of available memory to use (0.0-1.0)

    Returns:
        Maximum number of photos allowed in a single cluster
    """
    if num_workers is None:
        num_workers = CONFIG.processing.MAX_WORKERS

    available_memory = get_available_memory()
    usable_memory = int(available_memory * memory_fraction)
    max_photos = usable_memory // (max_prep_size * num_workers)
    return max(max_photos, 10)  # Minimum 10 photos per cluster


class ComputeBenchmarks(PipelineStage[_S, _T, _R]):
    """Pipeline stage for generating photo benchmark pairs, clustering them into work units.

    Calculates scores using various comparison methods in parallel,
    and performs a final analysis.
    """

    # --- Port Declarations ---
    # Class attributes for InputPorts and OutputPorts
    forest_i: InputPort[list[PhotoSequence]]
    photofiles_i: InputPort[Mapping[int, PhotoFile]]

    # --- Data Storage for Finalise ---
    positive_pairs: list[Pair]
    different_pairs: list[Pair]

    def __init__(self) -> None:
        """Initialize the benchmark stage."""
        super().__init__(
            path=CONFIG.paths.benchmark_scores_pkl,
            stage_name="Photo Comparison Benchmark",
        )

        # Initialize instance attributes
        self.positive_pairs = []
        self.different_pairs = []
        # Result is a tuple: (scores_dict, timing_dict)
        self.result: _R = ({}, {})
        self.args = ""  # Not strictly necessary for this stage

        # Define input ports (InputPort only needs a name)
        self.forest_i = InputPort("forest_data")
        self.photofiles_i = InputPort("photofiles_map")

    def prepare(self) -> PrepareResult[_S, _R]:
        """Generate benchmark pairs, cluster them, and create work units.

        1. Generate Similar/Dissimilar pairs from the forest.
        2. Cluster the pairs into connected components of limited size.
        3. Create work units: (ComparisonMethodName, ClusterOfPairs, PhotoPaths).

        Returns:
            Tuple of (work_units, initial_accumulator).
        """
        # --- 1. Load Inputs ---
        # InputPort.read() is used to read data from upstream stages
        forest: list[PhotoSequence] = self.forest_i.read()
        photofiles: Mapping[int, PhotoFile] = self.photofiles_i.read()

        # Get reference counts from upstream for UI statistics tracking
        self.ref_photos_init = self.forest_i.get_ref_photo_count()
        self.ref_seqs_init = self.forest_i.get_ref_sequence_count()

        # --- 2. Generate Pairs ---
        n_different = CONFIG.benchmark.N_DIFFERENT_PAIRS
        seed = CONFIG.processing.DEFAULT_RANDOM_SEED

        # Calculate max cluster size based on available memory
        # Uses CONFIG.processing.MAX_WORKERS
        max_cluster_size = calculate_max_cluster_size(
            max_prep_size=CONFIG.benchmark.MAX_PREP_SIZE_BYTES,
            memory_fraction=CONFIG.benchmark.MEMORY_FRACTION,
        )

        self.positive_pairs, self.different_pairs, unique_ids = generate_benchmark_pairs(
            forest=forest,
            n_different=n_different,
            seed=seed,
        )

        # Store for use in finalise()
        self.positive_pairs: list[Pair] = self.positive_pairs
        self.different_pairs: list[Pair] = self.different_pairs

        all_pairs: list[Pair] = self.positive_pairs + self.different_pairs

        # --- 3. Cluster Pairs ---
        cluster_list = cluster_pairs_for_scoring(
            pairs=all_pairs,
            max_cluster_size=max_cluster_size,
        )

        # --- 4. Build photo_paths dict for worker file access ---
        photo_paths: dict[int, str] = {photo_id: str(photofiles[photo_id].path) for photo_id in unique_ids}

        # --- 5. Create Work Units ---
        work_units: list[_S] = []
        for method in COMPARISON_METHODS:
            for _, cluster_pairs in cluster_list:
                work_units.append((method, cluster_pairs, photo_paths))

        # --- 6. Initialize Accumulator ---
        # Accumulator is a tuple: (scores_dict, timing_dict)
        scores_dict: dict[ComparisonMethodName, dict[Pair, Score]] = {}
        timing_dict: dict[ComparisonMethodName, TimingStats] = {}

        for method in COMPARISON_METHODS:
            scores_dict[method] = {}
            timing_dict[method] = {
                "prep_time": 0.0,
                "compare_time": 0.0,
                "prep_count": 0.0,
            }

        initial_accum: _R = (scores_dict, timing_dict)

        return work_units, initial_accum

    @classmethod
    def stage_worker(
        cls,
        job: _S,
        _args: str,
    ) -> WorkerResult[_T]:
        """Process an individual work unit: calculate scores for all pairs in a cluster.

        Uses a single comparison method with lazy preparation and local caching:
        - Photos are prepared on-demand (first use)
        - Prepared data is cached locally for reuse
        - Minimizes redundant file I/O and preparation
        - Measures preparation and comparison timing

        Args:
            job: 3-tuple (method_name, cluster_pairs, photo_paths)
            _args: Unused worker arguments

        Returns:
            WorkerResult with calculated scores and timing statistics

        Raises:
            FileNotFoundError: If photo file is missing (critical error, must surface)
        """
        method_name, cluster_pairs, photo_paths = job

        # Create comparison method instance
        method = create_comparison_method(method_name)

        # Local cache for prepared photo data (photo_id -> prepared_data)
        local_cache: dict[int, Any] = {}

        calculated_scores: list[tuple[ComparisonMethodName, Pair, Score]] = []

        # Track timing statistics
        prep_time = 0.0
        compare_time = 0.0
        prep_count = 0

        for a_id, b_id in cluster_pairs:
            # Lazy preparation: prepare photo only if not in cache
            if a_id not in local_cache:
                t0 = time.perf_counter()
                pixels = load_normalized_pixels(photo_paths[a_id])
                local_cache[a_id] = method.prepare(pixels)
                prep_time += time.perf_counter() - t0
                prep_count += 1

            if b_id not in local_cache:
                t0 = time.perf_counter()
                pixels = load_normalized_pixels(photo_paths[b_id])
                local_cache[b_id] = method.prepare(pixels)
                prep_time += time.perf_counter() - t0
                prep_count += 1

            # Measure comparison time
            t0 = time.perf_counter()
            score: float = method.compare(local_cache[a_id], local_cache[b_id])
            compare_time += time.perf_counter() - t0

            calculated_scores.append((method_name, (a_id, b_id), score))

        timing_stats: TimingStats = {
            "prep_time": prep_time,
            "compare_time": compare_time,
            "prep_count": float(prep_count),
        }

        return [], [], (calculated_scores, timing_stats)

    def accumulate_results(
        self,
        accum: _R,
        job_result: _T,
    ) -> None:
        """Merges worker results into the main accumulator dictionary.

        Accumulates both scores and timing statistics per comparison method.

        Args:
            accum: Tuple of (scores_dict, timing_dict)
            job_result: Tuple of (scores_list, timing_stats)
        """
        scores_list, timing_stats = job_result
        scores_dict, timing_dict = accum

        # Early return if no scores to accumulate
        if not scores_list:
            return

        # Accumulate scores
        for method_name, pair, score in scores_list:
            scores_dict[method_name][pair] = score

        # Accumulate timing stats (sum across workers for each method)
        # All scores in a work unit are for the same method
        method_name = scores_list[0][0]
        if method_name not in timing_dict:
            timing_dict[method_name] = {
                "prep_time": 0.0,
                "compare_time": 0.0,
                "prep_count": 0.0,
            }
        timing_dict[method_name]["prep_time"] += timing_stats["prep_time"]
        timing_dict[method_name]["compare_time"] += timing_stats["compare_time"]
        timing_dict[method_name]["prep_count"] += timing_stats["prep_count"]

    def finalise(self) -> None:
        """Perform post-analysis and save results.

        Includes calculating metrics, generating reports, and saving outputs,
        as per the original benchmarks.py script. Also saves timing statistics.
        """
        # Extract scores and timing from accumulator
        scores_dict, timing_dict = self.result

        # Update status with initial info
        total_pairs = len(self.positive_pairs) + len(self.different_pairs)
        if self._progress_tracker:
            self._progress_tracker.set_status(f"Analyzing {total_pairs:,} pairs across {len(scores_dict)} methods...")

        # Calculate derived timing metrics for each method
        timing_summary = {}
        for method_name, stats in timing_dict.items():
            prep_time = stats["prep_time"]
            compare_time = stats["compare_time"]
            prep_count = int(stats["prep_count"])
            num_pairs = len(scores_dict.get(method_name, {}))

            # Calculate derived metrics
            timing_summary[method_name] = {
                "prep_time_seconds": prep_time,
                "compare_time_seconds": compare_time,
                "total_time_seconds": prep_time + compare_time,
                "prep_count": prep_count,
                "num_pairs": num_pairs,
                "prep_time_per_photo_ms": (prep_time / prep_count * 1000) if prep_count > 0 else 0.0,
                "compare_time_per_pair_us": (compare_time / num_pairs * 1_000_000) if num_pairs > 0 else 0.0,
                "photos_per_second": prep_count / prep_time if prep_time > 0 else 0.0,
                "comparisons_per_second": num_pairs / compare_time if compare_time > 0 else 0.0,
            }

        # Save timing data to JSON
        timing_output_path = CONFIG.paths.work_dir / "benchmark_timing.json"
        timing_output_path.parent.mkdir(parents=True, exist_ok=True)
        with timing_output_path.open("w", encoding="utf-8") as f:
            json.dump(timing_summary, f, indent=2)

        # Perform the full analysis (pass only scores_dict)
        # Cast to expected type for post_analysis (dict[str, dict[Pair, Score]])
        post_analysis(
            final_scores=cast(dict[str, dict[Pair, Score]], scores_dict),
            positive_pairs=self.positive_pairs,  # Now correctly stored from prepare()
            different_pairs=self.different_pairs,  # Now correctly stored from prepare()
            output_dir=CONFIG.paths.work_dir,
        )

        # Read back the best results and update status with findings
        metrics_path = CONFIG.paths.work_dir / "method_metrics.csv"
        if metrics_path.exists():
            df_metrics = pd.read_csv(metrics_path, index_col=0)
            best_method = df_metrics["f1"].idxmax()
            best_f1 = df_metrics.loc[best_method, "f1"]
            best_auc = df_metrics.loc[best_method, "auc"]

            if self._progress_tracker:
                self._progress_tracker.set_status(
                    f"Best: {best_method} (F1={best_f1:.4f}, AUC={best_auc:.4f}) | {total_pairs:,} pairs tested"
                )

        # Update stage statistics (required by BasePipelineStage)
        self.ref_photos_final = self.ref_photos_init
        self.ref_seqs_final = self.ref_seqs_init
```
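To make the memory budget in `calculate_max_cluster_size` concrete, here is a small worked sketch of the documented formula. The 16 GiB of available memory and 8 workers are illustrative assumptions, not values from the package's CONFIG:

```python
# Illustrative walk-through of the cluster-size formula:
#   photos_per_cluster * prep_size * num_workers < available_memory * memory_fraction
available = 16 * 1024**3       # assume psutil reports 16 GiB available
usable = int(available * 0.8)  # memory_fraction = 0.8 -> ~12.8 GiB budget
prep_size = 8 * 1024 * 1024    # 8 MiB per prepared photo (the default)
workers = 8                    # assumed MAX_WORKERS

max_photos = usable // (prep_size * workers)
print(max_photos)              # 204 photos per cluster under these assumptions
```

Larger worker counts shrink the per-cluster budget, which is why the function divides by `num_workers`: each worker holds its own local cache of prepared photos.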
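`finalise()` writes the per-method timing summary to `benchmark_timing.json` using the keys shown above. A minimal sketch of consuming that file, assuming the stage has already run and that `work_dir/` stands in for CONFIG.paths.work_dir:

```python
import json
from pathlib import Path

# Load the timing summary emitted by ComputeBenchmarks.finalise().
timing = json.loads(Path("work_dir/benchmark_timing.json").read_text(encoding="utf-8"))

# Rank comparison methods by throughput, using keys written in finalise().
ranked = sorted(timing.items(), key=lambda kv: kv[1]["comparisons_per_second"], reverse=True)
for name, stats in ranked:
    print(f"{name:30s} {stats['comparisons_per_second']:12.1f} cmp/s "
          f"{stats['prep_time_per_photo_ms']:8.2f} ms/prep")
```

This pairs naturally with `method_metrics.csv`: the F1/AUC table tells you which method is most accurate, while the timing summary tells you what it costs.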