photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/benchmark_utils.py CHANGED
@@ -1,823 +1,823 @@
"""Benchmark utilities."""

from __future__ import annotations

import logging
import random
import time
from collections import defaultdict
from collections.abc import Iterable, Sequence
from itertools import combinations
from pathlib import Path
from typing import Any, cast

# --- Scientific Libraries ---
import matplotlib

# Use non-interactive backend to avoid GUI threading issues
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from scipy.stats import ttest_ind
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.preprocessing import StandardScaler

# --- External Project Dependencies (MUST EXIST IN PROJECT) ---
# Replace/confirm these imports match your actual file structure
from .config import CONFIG
from .photo_file import PhotoFile
from .plot_helpers import save_correlation_heatmap, save_histogram_comparison, save_pca_scatter
from .report_builder import ReportBuilder
from .sequence import PhotoFileSeries, PhotoSequence

# -----------------------------------------------------------

# Type Aliases
type Pair = tuple[int, int]
type _R = dict[str, dict[Pair, float]]
type _Score = float


# --- Core Utility Functions ---


def _split_large_component(
    comp: set[int], graph: nx.Graph[int], max_size: int, pairs: Sequence[Pair]
) -> list[set[int]]:
    """Splits a large connected component (of pair indices) into smaller pieces using a greedy BFS approach.

    Constrains by unique photo count, not pair count.

    Args:
        comp: Set of pair indices to split
        graph: Graph where nodes are pair indices
        max_size: Maximum unique photos per piece
        pairs: Original pairs array to calculate photo counts

    Returns:
        List of smaller components (sets of pair indices)
    """

    # Helper to count unique photos in a component
    def count_photos(pair_indices: set[int]) -> int:
        return len(unique_ids_from_pairs([pairs[i] for i in pair_indices]))

    if count_photos(comp) <= max_size * 2:
        # Arbitrarily split in half if size is manageable
        comp_list: list[int] = list(comp)
        mid: int = len(comp_list) // 2
        return [set(comp_list[:mid]), set(comp_list[mid:])]

    subgraph: nx.Graph[int] = graph.subgraph(comp)
    pieces: list[set[int]] = []
    remaining: set[int] = set(comp)

    while remaining:
        start_node: int = next(iter(remaining))
        current_piece: set[int] = set()
        queue: list[int] = [start_node]

        # Grow piece up to max_size unique photos using BFS
        while queue and count_photos(current_piece) < max_size:
            node: int = queue.pop(0)
            if node in remaining:
                current_piece.add(node)
                remaining.remove(node)

                # Add neighbors to queue
                neighbor: int
                for neighbor in subgraph.neighbors(node):
                    if neighbor in remaining and neighbor not in queue:
                        queue.append(neighbor)

        pieces.append(current_piece)
    return pieces


def unique_ids_from_pairs(pairs: Iterable[Pair]) -> set[int]:
    """Utility function to collect unique IDs from a list of pairs."""
    u: set[int] = set()
    a: int
    b: int
    for a, b in pairs:
        u.add(a)
        u.add(b)
    return u
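

# Editor's illustration (not part of the released module): a minimal,
# self-checking sketch of unique_ids_from_pairs, which collapses both
# endpoints of every pair into one id set.
def _demo_unique_ids() -> None:
    assert unique_ids_from_pairs([(1, 2), (2, 3)]) == {1, 2, 3}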


def generate_known_different_pairs(
    forest: list[PhotoSequence],
    n_pairs: int,
    seed: int,
) -> list[Pair]:
    """Generate high-quality known-different pairs using forest structure.

    Uses cross-template or distant-sequence sampling.
    """
    known_different: list[Pair] = []

    # 1. Derive template exemplars from the forest
    template_exemplars: dict[str, int] = {}
    for obj in forest:
        # Use hasattr checks to handle polymorphic PhotoSequence objects
        if hasattr(obj, "template_key") and hasattr(obj, "get_reference"):
            template_key = obj.template_key
            reference: PhotoFileSeries = obj.get_reference()

            if template_key and template_key not in template_exemplars and reference:
                first_key = next(iter(reference.keys()), None)
                if first_key is not None:
                    # PhotoFile is expected to have an .id attribute
                    exemplar: PhotoFile = reference[first_key]
                    template_exemplars[template_key] = exemplar.id

    # 2. Group templates by parent directory (assumed from template_key path structure)
    templates_by_parent: defaultdict[Path, list[str]] = defaultdict(list)
    for template_key in template_exemplars:
        parent: Path = Path(template_key).parent
        templates_by_parent[parent].append(template_key)

    # 3. Cross-parent pairing (high confidence negatives)
    for (_p1, t1_list), (_p2, t2_list) in combinations(templates_by_parent.items(), 2):
        for t1 in t1_list[:5]:
            for t2 in t2_list[:5]:
                if t1 in template_exemplars and t2 in template_exemplars:
                    known_different.append((template_exemplars[t1], template_exemplars[t2]))

    # 4. Distant sequence positions (high confidence negatives)
    for obj in forest:
        # Guard against polymorphic objects, as in step 1
        if not hasattr(obj, "get_reference"):
            continue
        seq: PhotoFileSeries = obj.get_reference()
        sorted_indices: list[Any] = sorted(seq.index)
        for i in range(min(10, len(sorted_indices) // 2)):
            if i < len(sorted_indices) and -(i + 1) >= -len(sorted_indices):
                known_different.append((seq[sorted_indices[i]].id, seq[sorted_indices[-(i + 1)]].id))

    rng: random.Random = random.Random(seed)
    rng.shuffle(known_different)
    return known_different[:n_pairs]


def generate_benchmark_pairs(
    forest: list[PhotoSequence],
    n_different: int,
    seed: int,
) -> tuple[list[Pair], list[Pair], list[int]]:
    """Generates similar (positive) and known-different (negative) photo pairs.

    Extracts pairs from the forest structure based on sequence relationships.
    """
    # 1. Generate similar (positive) pairs
    positive_pairs: list[Pair] = []
    for obj in forest:
        if hasattr(obj, "get_reference"):
            reference: PhotoFileSeries = obj.get_reference()
            sequences: list[PhotoSequence] = obj.sequences

            for idx, exemplar in reference.items():
                positive_pairs.extend(
                    [
                        (exemplar.id, seq.get_reference()[idx].id)
                        for seq in sequences
                        if idx in seq.get_reference() and exemplar.id != seq.get_reference()[idx].id
                    ]
                )

    # 2. Generate known-different (negative) pairs, capped at the positive count
    n_diff_limit: int = min(len(positive_pairs), n_different)
    different_pairs: list[Pair] = generate_known_different_pairs(
        forest=forest,
        n_pairs=n_diff_limit,
        seed=seed,
    )

    # 3. Determine unique IDs
    unique_ids: list[int] = sorted(unique_ids_from_pairs(positive_pairs + different_pairs))

    return positive_pairs, different_pairs, unique_ids


def _separate_components_by_size(
    components: list[set[int]],
    max_cluster_size: int,
    pairs: Sequence[Pair],
) -> tuple[list[set[int]], list[set[int]], list[set[int]]]:
    """Separate components into small, medium, and large based on photo count.

    Args:
        components: Connected components (sets of pair indices)
        max_cluster_size: Maximum photos per cluster
        pairs: Original pairs list

    Returns:
        Tuple of (small_components, medium_components, large_components)
        - small: photo_count <= max_cluster_size
        - medium: max_cluster_size < photo_count <= max_cluster_size * 2
        - large: photo_count > max_cluster_size * 2
    """
    small_components: list[set[int]] = []
    medium_components: list[set[int]] = []
    large_components: list[set[int]] = []

    for comp in components:
        photo_count = len(unique_ids_from_pairs([pairs[i] for i in comp]))
        if photo_count <= max_cluster_size:
            small_components.append(comp)
        elif photo_count > max_cluster_size * 2:
            large_components.append(comp)
        else:
            medium_components.append(comp)

    return small_components, medium_components, large_components


def _combine_small_components(
    components: list[set[int]],
    max_cluster_size: int,
    pairs: Sequence[Pair],
) -> list[set[int]]:
    """Greedily combine small components to maximize cluster utilization.

    Uses a two-pointer approach: large components on the left, small on the right.
    Fills each cluster starting with the largest component, then adds smaller ones.

    Args:
        components: Components to combine (assumed pre-sorted by size descending)
        max_cluster_size: Maximum photos per cluster
        pairs: Original pairs list

    Returns:
        List of combined clusters (as sets of pair indices)
    """
    if not components:
        return []

    pair_clusters: list[set[int]] = []
    left_idx: int = 0
    right_idx: int = len(components) - 1

    while left_idx <= right_idx:
        current_cluster: set[int] = set(components[left_idx])
        current_photo_count = len(unique_ids_from_pairs([pairs[i] for i in current_cluster]))
        left_idx += 1

        while right_idx >= left_idx:
            candidate_photo_count = len(unique_ids_from_pairs([pairs[i] for i in components[right_idx]]))
            if current_photo_count + candidate_photo_count <= max_cluster_size:
                current_cluster.update(components[right_idx])
                current_photo_count += candidate_photo_count
                right_idx -= 1
            else:
                break

        pair_clusters.append(current_cluster)

    return pair_clusters
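

# Editor's illustration (not part of the released module): a tiny sketch of
# the two-pointer packing. Three singleton components of two photos each,
# under a four-photo budget, pack into exactly two clusters.
def _demo_combine_small_components() -> None:
    demo_pairs: list[Pair] = [(1, 2), (3, 4), (5, 6)]
    demo_components = [{0}, {1}, {2}]  # pair indices; two unique photos each
    clusters = _combine_small_components(demo_components, max_cluster_size=4, pairs=demo_pairs)
    assert len(clusters) == 2  # {0, 2} is packed together, {1} stands alone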


def cluster_pairs_for_scoring(pairs: Sequence[Pair], max_cluster_size: int) -> list[tuple[set[int], list[Pair]]]:
    """Clusters a list of pairs into connected components of limited size.

    Constrains by unique photo count, not pair count, to respect memory limits.

    Args:
        pairs: List of photo ID pairs to cluster
        max_cluster_size: Maximum unique photos per cluster (memory constraint)

    Returns:
        List of (cluster_photos, cluster_pairs) tuples
    """
    # 1. Build graph where nodes are pair indices
    graph: nx.Graph[int] = nx.Graph()
    graph.add_nodes_from(range(len(pairs)))

    # Connect nodes (pairs) that share a photo ID
    for i in range(len(pairs)):
        for j in range(i + 1, len(pairs)):
            if set(pairs[i]) & set(pairs[j]):
                graph.add_edge(i, j)

    # 2. Get initial connected components
    components: list[set[int]] = list(nx.connected_components(graph))

    # 3. Separate and split components by PHOTO count (not pair count)
    small_components, medium_components, large_components = _separate_components_by_size(
        components, max_cluster_size, pairs
    )

    # Split large components into manageable pieces
    split_pieces: list[set[int]] = []
    for comp in large_components:
        pieces: list[set[int]] = _split_large_component(comp, graph, max_cluster_size, pairs)
        split_pieces.extend(pieces)

    # Merge all processable components
    small_components.extend(split_pieces)
    small_components.extend(medium_components)

    # 4. Sort by photo count descending for optimal packing
    small_components.sort(
        key=lambda c: len(unique_ids_from_pairs([pairs[i] for i in c])),
        reverse=True,
    )

    # 5. Greedy combination for final clusters (by PHOTO count)
    pair_clusters = _combine_small_components(small_components, max_cluster_size, pairs)

    # 6. Convert indices to pairs
    result: list[tuple[set[int], list[Pair]]] = []
    for pair_indices in pair_clusters:
        cluster_pairs: list[Pair] = [pairs[i] for i in pair_indices]
        cluster_photos: set[int] = unique_ids_from_pairs(cluster_pairs)
        result.append((cluster_photos, cluster_pairs))

    return result
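

# Editor's illustration (not part of the released module): clustering three
# toy pairs. (1, 2) and (2, 3) share photo 2 and form one component; the
# disjoint (10, 11) still fits the five-photo budget, so the greedy packing
# merges everything into a single cluster.
def _demo_cluster_pairs_for_scoring() -> None:
    demo = [(1, 2), (2, 3), (10, 11)]
    clusters = cluster_pairs_for_scoring(demo, max_cluster_size=5)
    assert len(clusters) == 1
    photos, cluster_pairs = clusters[0]
    assert photos == {1, 2, 3, 10, 11} and len(cluster_pairs) == 3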


# --- Analysis Functions ---


def calculate_metrics_at_best_threshold(y_true: NDArray[Any], y_scores: NDArray[Any]) -> dict[str, float]:
    """Calculates metrics by finding the optimal threshold closest to a target FPR.

    Target False Positive Rate (FPR) is 0.01.
    """
    target_fpr = 0.01

    # roc_curve expects positive class to have higher score, which is true for similarity
    fpr, _tpr, thresholds = roc_curve(y_true, y_scores)

    # Find the threshold closest to the target FPR
    diff = fpr - target_fpr
    optimal_idx = np.argmin(np.abs(diff))
    best_threshold = thresholds[optimal_idx]

    y_pred = (y_scores >= best_threshold).astype(int)

    auc = roc_auc_score(y_true, y_scores)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Confusion matrix values
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    return {
        "auc": float(auc),
        "threshold": float(best_threshold),
        "f1": float(f1),
        "precision": float(precision),
        "recall": float(recall),
        "tpr_at_threshold": float(tp / (tp + fn)) if (tp + fn) > 0 else 0.0,
        "tp": int(tp),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "fpr_at_threshold": float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0,
    }
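

# Editor's illustration (not part of the released module): synthetic,
# mostly-separated similarity scores. The threshold is chosen at the ROC
# point closest to the 1% FPR target, so it lands near the top of the
# negative score distribution on this toy draw.
def _demo_threshold_metrics() -> None:
    rng = np.random.default_rng(0)
    y_true = np.concatenate([np.ones(100, dtype=int), np.zeros(100, dtype=int)])
    y_scores = np.concatenate([rng.normal(0.75, 0.1, 100), rng.normal(0.25, 0.1, 100)])
    metrics = calculate_metrics_at_best_threshold(y_true, y_scores)
    logging.info("threshold=%.3f f1=%.3f auc=%.3f", metrics["threshold"], metrics["f1"], metrics["auc"])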


def evaluate_consensus_strategy(df_scores: pd.DataFrame, y_true: NDArray[Any]) -> tuple[str, dict[str, float]]:
    """Evaluates the median consensus strategy."""
    median_scores = cast(NDArray[Any], df_scores.median(axis=1).values)
    metrics = calculate_metrics_at_best_threshold(y_true, median_scores)
    return "Median", metrics


def evaluate_voting_strategy(df_scores: pd.DataFrame, y_true: NDArray[Any]) -> tuple[str, dict[str, float]]:
    """Evaluates a balanced voting strategy using majority vote.

    Finds the best single-method thresholds and applies majority voting.
    """
    method_thresholds = {}
    for method in df_scores.columns:
        method_values = cast(NDArray[Any], df_scores[method].values)
        metrics = calculate_metrics_at_best_threshold(y_true, method_values)
        method_thresholds[method] = metrics["threshold"]

    # Vote for similarity if score >= individual method's optimal threshold
    votes_df = pd.DataFrame()
    for method, threshold in method_thresholds.items():
        method_values = cast(NDArray[Any], df_scores[method].values)
        votes_df[method] = (method_values >= threshold).astype(int)

    # Final score is the mean vote (a value between 0.0 and 1.0)
    voting_scores = cast(NDArray[Any], votes_df.mean(axis=1).values)

    # Use a fixed threshold of 0.5 for the voting mean to get final prediction
    y_pred = (voting_scores >= 0.5).astype(int)

    auc = roc_auc_score(y_true, voting_scores)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Guard the FPR denominator, matching the zero-division handling above
    n_neg = int(np.sum(y_true == 0))
    return "Majority Vote", {
        "auc": float(auc),
        "threshold": 0.5,
        "f1": float(f1),
        "precision": float(precision),
        "recall": float(recall),
        "tpr_at_threshold": float(recall),
        "tp": int(np.sum((y_true == 1) & (y_pred == 1))),
        "tn": int(np.sum((y_true == 0) & (y_pred == 0))),
        "fp": int(np.sum((y_true == 0) & (y_pred == 1))),
        "fn": int(np.sum((y_true == 1) & (y_pred == 0))),
        "fpr_at_threshold": float(np.sum((y_true == 0) & (y_pred == 1)) / n_neg) if n_neg > 0 else 0.0,
    }
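

# Editor's illustration (not part of the released module): two synthetic
# methods vote with their individually tuned thresholds; the mean vote is
# then cut at the fixed 0.5 majority threshold.
def _demo_voting_strategy() -> None:
    rng = np.random.default_rng(1)
    y_true = np.concatenate([np.ones(100, dtype=int), np.zeros(100, dtype=int)])
    df = pd.DataFrame(
        {
            "m1": np.concatenate([rng.normal(0.8, 0.1, 100), rng.normal(0.2, 0.1, 100)]),
            "m2": np.concatenate([rng.normal(0.7, 0.15, 100), rng.normal(0.3, 0.15, 100)]),
        }
    )
    name, metrics = evaluate_voting_strategy(df, y_true)
    logging.info("%s: f1=%.3f precision=%.3f", name, metrics["f1"], metrics["precision"])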


def _prepare_benchmark_data(
    final_scores: _R,
    positive_pairs: list[Pair],
    different_pairs: list[Pair],
) -> tuple[pd.DataFrame, NDArray[np.int_]]:
    """Prepare score DataFrame and ground truth labels.

    Args:
        final_scores: Dict mapping method names to pair scores
        positive_pairs: List of similar photo pairs
        different_pairs: List of dissimilar photo pairs

    Returns:
        Tuple of (score_dataframe, y_true_labels)

    Raises:
        ValueError: If no valid pairs remain after dropping NaNs
    """
    all_pairs: list[Pair] = positive_pairs + different_pairs

    score_data = {method: [final_scores[method][pair] for pair in all_pairs] for method in final_scores}
    df_scores = pd.DataFrame(score_data, index=pd.MultiIndex.from_tuples(all_pairs, names=["photo_a", "photo_b"]))
    df_scores = df_scores.dropna()

    if len(df_scores) == 0:
        raise ValueError("No valid pairs remain after dropping NaNs")

    # Align y_true for the pairs that remain (iterate in order to preserve alignment).
    # A set makes the membership test O(1) per pair instead of O(len(positive_pairs)).
    positive_set = set(positive_pairs)
    is_positive_mask = [pair in positive_set for pair in df_scores.index]
    y_true = np.array(is_positive_mask, dtype=int)

    return df_scores, y_true


def _calculate_method_metrics(
    df_scores: pd.DataFrame,
    y_true: NDArray[np.int_],
) -> tuple[dict[str, dict[str, float]], tuple[str, dict[str, float]], dict[str, dict[str, NDArray[Any]]]]:
    """Calculate performance metrics for each method.

    Args:
        df_scores: DataFrame with method scores
        y_true: Ground truth labels

    Returns:
        Tuple of (method_metrics, best_single_method, plot_data)
    """
    method_metrics: dict[str, dict[str, float]] = {}
    best_single_method: tuple[str, dict[str, float]] = ("", {"f1": -1.0})
    plot_data: dict[str, dict[str, NDArray[Any]]] = {}

    for method in df_scores.columns:
        y_scores: NDArray[Any] = cast(NDArray[Any], df_scores[method].values)

        metrics = calculate_metrics_at_best_threshold(y_true, y_scores)
        method_metrics[method] = metrics

        if metrics["f1"] > best_single_method[1]["f1"]:
            best_single_method = (method, metrics)

        pos_scores = y_scores[y_true == 1]
        neg_scores = y_scores[y_true == 0]
        plot_data[method] = {"pos": pos_scores, "neg": neg_scores}

        # Calculate Effect Size (Cohen's d) and statistical significance
        cohen_d = (np.mean(pos_scores) - np.mean(neg_scores)) / np.sqrt(
            (np.std(pos_scores, ddof=1) ** 2 + np.std(neg_scores, ddof=1) ** 2) / 2
        )
        ttest_result = ttest_ind(pos_scores, neg_scores, equal_var=False)
        method_metrics[method]["cohen_d"] = float(cohen_d)
        method_metrics[method]["p_value"] = float(ttest_result.pvalue)

    return method_metrics, best_single_method, plot_data


def _analyze_method_correlations(
    df_scores: pd.DataFrame,
) -> tuple[pd.DataFrame, dict[str, float]]:
    """Analyze correlations between methods.

    Args:
        df_scores: DataFrame with method scores

    Returns:
        Tuple of (correlation_matrix, independence_scores)
    """
    corr_matrix = df_scores.corr(method="pearson")

    # Remove diagonal (self-correlation)
    for method in corr_matrix.columns:
        corr_matrix.loc[method, method] = np.nan

    # Calculate independence (lower correlation = more independent)
    independent_methods = corr_matrix.abs().mean(axis=1).sort_values().to_dict()

    return corr_matrix, independent_methods
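

# Editor's illustration (not part of the released module): columns "a" and
# "b" are nearly identical while "c" is independent noise, so "c" should
# rank first in the independence ordering (lowest average absolute r).
def _demo_method_correlations() -> None:
    rng = np.random.default_rng(2)
    base = rng.random(200)
    df = pd.DataFrame({"a": base, "b": base + rng.normal(0.0, 0.05, 200), "c": rng.random(200)})
    _corr, independence = _analyze_method_correlations(df)
    assert next(iter(independence)) == "c"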


def _generate_visualizations(
    df_scores: pd.DataFrame,
    y_true: NDArray[np.int_],
    method_metrics: dict[str, dict[str, float]],
    plot_data: dict[str, dict[str, NDArray[Any]]],
    corr_matrix: pd.DataFrame,
    output_dir: Path,
) -> None:
    """Generate all benchmark visualization plots.

    Args:
        df_scores: DataFrame with method scores
        y_true: Ground truth labels
        method_metrics: Metrics for each method
        plot_data: Positive/negative score distributions
        corr_matrix: Method correlation matrix
        output_dir: Directory to save plots
    """
    plt.style.use("ggplot")

    # Distribution histograms for each method
    for method in df_scores.columns:
        save_histogram_comparison(
            pos_data=plot_data[method]["pos"],
            neg_data=plot_data[method]["neg"],
            threshold=method_metrics[method]["threshold"],
            method_name=method,
            output_path=output_dir / f"distribution_{method}.png",
        )

    # Correlation heatmap
    save_correlation_heatmap(
        corr_matrix=corr_matrix.fillna(0),
        output_path=output_dir / "correlation_heatmap.png",
    )

    # PCA dimensionality reduction (with graceful fallback)
    try:
        scaler = StandardScaler()
        x_scaled = scaler.fit_transform(df_scores.values)
        pca = PCA(n_components=min(len(df_scores.columns), 3))
        x_pca = pca.fit_transform(x_scaled)

        save_pca_scatter(
            x_pca=x_pca,
            y_true=y_true,
            explained_variance=pca.explained_variance_ratio_.tolist(),
            output_path=output_dir / "pca_plot.png",
        )
    except Exception as e:
        logging.warning(f"PCA failed: {e}")


def _evaluate_ensemble_strategies(
    df_scores: pd.DataFrame,
    y_true: NDArray[np.int_],
) -> tuple[tuple[str, dict[str, float]], tuple[str, dict[str, float]]]:
    """Evaluate consensus and voting ensemble strategies.

    Args:
        df_scores: DataFrame with method scores
        y_true: Ground truth labels

    Returns:
        Tuple of (best_consensus, best_voting)
    """
    best_consensus: tuple[str, dict[str, float]] = evaluate_consensus_strategy(df_scores, y_true)
    best_voting: tuple[str, dict[str, float]] = evaluate_voting_strategy(df_scores, y_true)
    return best_consensus, best_voting


def _generate_analysis_report(
    method_metrics: dict[str, dict[str, float]],
    best_single_method: tuple[str, dict[str, float]],
    best_voting: tuple[str, dict[str, float]],
    best_consensus: tuple[str, dict[str, float]],
    independent_methods: dict[str, float],
    n_rows: int,
    y_true: NDArray[np.int_],
) -> str:
    """Generate formatted analysis report using ReportBuilder.

    Args:
        method_metrics: Performance metrics for each method
        best_single_method: Best performing single method
        best_voting: Best voting ensemble strategy
        best_consensus: Best consensus ensemble strategy
        independent_methods: Method independence scores
        n_rows: Total number of pairs scored
        y_true: Ground truth labels

    Returns:
        Formatted report string
    """
    sorted_methods = sorted(method_metrics.items(), key=lambda item: item[1].get("f1", -1.0), reverse=True)

    report = (
        ReportBuilder()
        .add_title("PHOTO BENCHMARK ANALYSIS REPORT")
        .add_text(f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        .add_text(f"Total Pairs Scored: {n_rows} (Similar: {np.sum(y_true == 1)}, Dissimilar: {np.sum(y_true == 0)})")
        .add_section("1. INDIVIDUAL METHOD PERFORMANCE")
    )

    # Add metrics for each method
    for method, metrics in sorted_methods:
        report.add_text(f"Method: {method}")
        report.add_text(f"  AUC: {metrics['auc']:.4f}")
        report.add_text(f"  Optimal Threshold (at 1% FPR target): {metrics['threshold']:.4f}")
        report.add_text(f"  F1 Score: {metrics['f1']:.4f}")
        report.add_text(f"  Precision: {metrics['precision']:.4f}")
        report.add_text(f"  Recall/TPR: {metrics['recall']:.4f}")
        # float("nan") keeps the :.3f format spec valid when cohen_d is absent
        # (a string default such as "N/A" would raise ValueError here)
        report.add_text(f"  Effect Size (Cohen's d): {metrics.get('cohen_d', float('nan')):.3f}")
        report.add_blank_line()

    # Add recommendations
    report.add_section("2. RECOMMENDATIONS")
    report.add_text("Option A: Best Single Method (highest performance)")
    report.add_text(f"  Use {best_single_method[0]} with threshold > {best_single_method[1]['threshold']:.4f}")
    report.add_text(
        f"  Performance: F1={best_single_method[1]['f1']:.4f}, "
        f"Precision={best_single_method[1]['precision']:.4f}, "
        f"Recall={best_single_method[1]['recall']:.4f}"
    )
    report.add_blank_line()

    report.add_text("Option B: Voting Strategy (balanced ensemble)")
    report.add_text(f"  Use {best_voting[0]}")
    report.add_text(
        f"  Performance: F1={best_voting[1]['f1']:.4f}, "
        f"Precision={best_voting[1]['precision']:.4f}, "
        f"Recall={best_voting[1]['recall']:.4f}"
    )
    report.add_blank_line()

    report.add_text("Option C: Consensus Median (most robust ensemble)")
    report.add_text(f"  Use median score > {best_consensus[1]['threshold']:.4f}")
    report.add_text(
        f"  Performance: F1={best_consensus[1]['f1']:.4f}, "
        f"Precision={best_consensus[1]['precision']:.4f}, "
        f"Recall={best_consensus[1]['recall']:.4f}"
    )
    report.add_blank_line()

    # Add independence analysis
    report.add_section("3. METHOD INDEPENDENCE (for ensemble)")
    report.add_text("Most independent methods (lowest average absolute correlation):")
    for method, avg_corr_val in independent_methods.items():
        report.add_text(f"  • {method}: avg |r|={avg_corr_val:.3f}")

    return report.build()


def _analyze_cascades(
    df_scores: pd.DataFrame,
    y_true: NDArray[np.int_],
    method_metrics: dict[str, dict[str, float]],
    output_dir: Path,
) -> None:
    """Test specific cascade combinations and save results.

    Args:
        df_scores: DataFrame with method scores
        y_true: Ground truth labels
        method_metrics: Performance metrics for each method
        output_dir: Directory to save results
    """
    # Only run if all required methods are present
    if not all(method in df_scores.columns for method in ["dhash", "ssim", "sift"]):
        logging.info("Skipping cascade analysis - required methods (dhash, ssim, sift) not all present")
        return

    cascade_results = []

    # zero_division=0 below matches the handling in calculate_metrics_at_best_threshold

    # Test: dhash only
    dhash_only = df_scores["dhash"] > method_metrics["dhash"]["threshold"]
    cascade_results.append(
        {
            "cascade": "dhash_only",
            "f1": f1_score(y_true, dhash_only, zero_division=0),
            "precision": precision_score(y_true, dhash_only, zero_division=0),
            "recall": recall_score(y_true, dhash_only, zero_division=0),
        }
    )

    # Test: dhash → ssim (current cascade)
    dhash_pass = df_scores["dhash"] > method_metrics["dhash"]["threshold"]
    ssim_pass = df_scores["ssim"] > method_metrics["ssim"]["threshold"]
    current_cascade = dhash_pass & ssim_pass
    cascade_results.append(
        {
            "cascade": "dhash_AND_ssim",
            "f1": f1_score(y_true, current_cascade, zero_division=0),
            "precision": precision_score(y_true, current_cascade, zero_division=0),
            "recall": recall_score(y_true, current_cascade, zero_division=0),
        }
    )

    # Test: dhash → sift (independent cascade)
    sift_pass = df_scores["sift"] > method_metrics["sift"]["threshold"]
    independent_cascade = dhash_pass & sift_pass
    cascade_results.append(
        {
            "cascade": "dhash_AND_sift",
            "f1": f1_score(y_true, independent_cascade, zero_division=0),
            "precision": precision_score(y_true, independent_cascade, zero_division=0),
            "recall": recall_score(y_true, independent_cascade, zero_division=0),
        }
    )

    # Save cascade analysis
    df_cascade = pd.DataFrame(cascade_results)
    df_cascade.to_csv(output_dir / CONFIG.paths.CASCADE_COMPARISON, index=False)
    logging.info(f"Cascade comparison saved to {output_dir / CONFIG.paths.CASCADE_COMPARISON}")


def post_analysis(
    final_scores: _R,
    positive_pairs: list[Pair],
    different_pairs: list[Pair],
    output_dir: Path,
) -> None:
    """Performs the full benchmark analysis and generates reports.

    Includes metrics calculation, correlation analysis, ensemble evaluation, plotting, and report generation.
    """
    logging.info("Starting post-analysis phase.")
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Prepare data
    try:
        df_scores, y_true = _prepare_benchmark_data(final_scores, positive_pairs, different_pairs)
    except ValueError as e:
        logging.error(f"Data preparation failed: {e}. Aborting analysis.")
        return

    # Save pair-level scores and ground truth for outlier analysis
    df_scores.to_csv(output_dir / CONFIG.paths.PAIR_SCORES)
    pair_ground_truth = pd.DataFrame(
        {
            "photo_a": [pair[0] for pair in df_scores.index],
            "photo_b": [pair[1] for pair in df_scores.index],
            "ground_truth": ["similar" if is_pos else "dissimilar" for is_pos in (y_true == 1)],
        }
    )
    pair_ground_truth.to_csv(output_dir / CONFIG.paths.PAIR_GROUND_TRUTH, index=False)
    logging.info(
        f"Saved {len(df_scores)} pair-level scores to {CONFIG.paths.PAIR_SCORES} and {CONFIG.paths.PAIR_GROUND_TRUTH}"
    )

    # 2. Calculate individual method metrics
    method_metrics, best_single_method, plot_data = _calculate_method_metrics(df_scores, y_true)

    # 3. Analyze correlations and independence
    corr_matrix, independent_methods = _analyze_method_correlations(df_scores)

    # 4. Evaluate ensemble strategies
    best_consensus, best_voting = _evaluate_ensemble_strategies(df_scores, y_true)

    # 5. Generate visualizations
    _generate_visualizations(df_scores, y_true, method_metrics, plot_data, corr_matrix, output_dir)

    # 6. Generate analysis report
    report_text = _generate_analysis_report(
        method_metrics,
        best_single_method,
        best_voting,
        best_consensus,
        independent_methods,
        len(df_scores),
        y_true,
    )
    report_file = output_dir / CONFIG.paths.ANALYSIS_RECOMMENDATIONS
    report_file.write_text(report_text, encoding="utf-8")

    # 7. Analyze cascade combinations
    _analyze_cascades(df_scores, y_true, method_metrics, output_dir)

    # 8. Save derived data to CSV
    df_metrics = pd.DataFrame(method_metrics).T
    df_metrics.to_csv(output_dir / CONFIG.paths.METHOD_METRICS)
    corr_matrix.to_csv(output_dir / CONFIG.paths.METHOD_CORRELATIONS)

    logging.info(f"Analysis complete. Report written to {report_file}")
1
+ """Benchmark utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import random
7
+ import time
8
+ from collections import defaultdict
9
+ from collections.abc import Iterable, Sequence
10
+ from itertools import combinations
11
+ from pathlib import Path
12
+ from typing import Any, cast
13
+
14
+ # --- Scientific Libraries ---
15
+ import matplotlib
16
+
17
+ # Use non-interactive backend to avoid GUI threading issues
18
+ matplotlib.use("Agg")
19
+ import matplotlib.pyplot as plt
20
+ import networkx as nx
21
+ import numpy as np
22
+ import pandas as pd
23
+ from numpy.typing import NDArray
24
+ from scipy.stats import ttest_ind
25
+ from sklearn.decomposition import PCA
26
+ from sklearn.metrics import (
27
+ confusion_matrix,
28
+ f1_score,
29
+ precision_score,
30
+ recall_score,
31
+ roc_auc_score,
32
+ roc_curve,
33
+ )
34
+ from sklearn.preprocessing import StandardScaler
35
+
36
+ # --- External Project Dependencies (MUST EXIST IN PROJECT) ---
37
+ # Replace/confirm these imports match your actual file structure
38
+ from .config import CONFIG
39
+ from .photo_file import PhotoFile
40
+ from .plot_helpers import save_correlation_heatmap, save_histogram_comparison, save_pca_scatter
41
+ from .report_builder import ReportBuilder
42
+ from .sequence import PhotoFileSeries, PhotoSequence
43
+
44
+ # ----------------------------
45
+ # -----------------------------------------------------------
46
+
47
+ # Type Aliases
48
+ type Pair = tuple[int, int]
49
+ type _R = dict[str, dict[Pair, float]]
50
+ type _Score = float
51
+
52
+
53
+ # --- Core Utility Functions ---
54
+
55
+
56
+ def _split_large_component(
57
+ comp: set[int], graph: nx.Graph[int], max_size: int, pairs: Sequence[Pair]
58
+ ) -> list[set[int]]:
59
+ """Splits a large connected component (of pair indices) into smaller pieces using a greedy BFS approach.
60
+
61
+ Constrains by unique photo count, not pair count.
62
+
63
+ Args:
64
+ comp: Set of pair indices to split
65
+ graph: Graph where nodes are pair indices
66
+ max_size: Maximum unique photos per piece
67
+ pairs: Original pairs array to calculate photo counts
68
+
69
+ Returns:
70
+ List of smaller components (sets of pair indices)
71
+ """
72
+
73
+ # Helper to count unique photos in a component
74
+ def count_photos(pair_indices: set[int]) -> int:
75
+ return len(unique_ids_from_pairs([pairs[i] for i in pair_indices]))
76
+
77
+ if count_photos(comp) <= max_size * 2:
78
+ # Arbitrarily split in half if size is manageable
79
+ comp_list: list[int] = list(comp)
80
+ mid: int = len(comp_list) // 2
81
+ return [set(comp_list[:mid]), set(comp_list[mid:])]
82
+
83
+ subgraph: nx.Graph[int] = graph.subgraph(comp)
84
+ pieces: list[set[int]] = []
85
+ remaining: set[int] = set(comp)
86
+
87
+ while remaining:
88
+ start_node: int = next(iter(remaining))
89
+ current_piece: set[int] = set()
90
+ queue: list[int] = [start_node]
91
+
92
+ # Grow piece up to max_size unique photos using BFS
93
+ while queue and count_photos(current_piece) < max_size:
94
+ node: int = queue.pop(0)
95
+ if node in remaining:
96
+ current_piece.add(node)
97
+ remaining.remove(node)
98
+
99
+ # Add neighbors to queue
100
+ neighbor: int
101
+ for neighbor in subgraph.neighbors(node):
102
+ if neighbor in remaining and neighbor not in queue:
103
+ queue.append(neighbor)
104
+
105
+ pieces.append(current_piece)
106
+ return pieces
107
+
108
+
109
+ def unique_ids_from_pairs(pairs: Iterable[Pair]) -> set[int]:
110
+ """Utility function to collect unique IDs from a list of pairs."""
111
+ u: set[int] = set()
112
+ a: int
113
+ b: int
114
+ for a, b in pairs:
115
+ u.add(a)
116
+ u.add(b)
117
+ return u
118
+
119
+
120
+ def generate_known_different_pairs(
121
+ forest: list[PhotoSequence],
122
+ n_pairs: int,
123
+ seed: int,
124
+ ) -> list[Pair]:
125
+ """Generate high-quality known-different pairs using forest structure.
126
+
127
+ Uses cross-template or distant-sequence sampling.
128
+ """
129
+ known_different: list[Pair] = []
130
+
131
+ # 1. Derive template exemplars from the forest
132
+ template_exemplars: dict[str, int] = {}
133
+ for obj in forest:
134
+ # Use hasattr checks to handle polymorphic PhotoSequence objects
135
+ if hasattr(obj, "template_key") and hasattr(obj, "get_reference"):
136
+ template_key = obj.template_key
137
+ reference: PhotoFileSeries = obj.get_reference()
138
+
139
+ if template_key and template_key not in template_exemplars and reference:
140
+ first_key = next(iter(reference.keys()), None)
141
+ if first_key is not None:
142
+ # PhotoFile is expected to have an .id attribute
143
+ exemplar: PhotoFile = reference[first_key]
144
+ template_exemplars[template_key] = exemplar.id
145
+
146
+ # 2. Group templates by parent directory (assumed from template_key path structure)
147
+ templates_by_parent: defaultdict[Path, list[str]] = defaultdict(list)
148
+ for template_key in template_exemplars:
149
+ parent: Path = Path(template_key).parent
150
+ templates_by_parent[parent].append(template_key)
151
+
152
+ # 3. Cross-parent pairing (high confidence negatives)
153
+ for (_p1, t1_list), (_p2, t2_list) in combinations(templates_by_parent.items(), 2):
154
+ for t1 in t1_list[:5]:
155
+ for t2 in t2_list[:5]:
156
+ if t1 in template_exemplars and t2 in template_exemplars:
157
+ known_different.append((template_exemplars[t1], template_exemplars[t2]))
158
+
159
+ # 4. Distant sequence positions (high confidence negatives)
160
+ for obj in forest:
161
+ seq: PhotoFileSeries = obj.get_reference()
162
+ sorted_indices: list[Any] = sorted(seq.index)
163
+ for i in range(min(10, len(sorted_indices) // 2)):
164
+ if i < len(sorted_indices) and -(i + 1) >= -len(sorted_indices):
165
+ known_different.append((seq[sorted_indices[i]].id, seq[sorted_indices[-(i + 1)]].id))
166
+
167
+ rng: random.Random = random.Random(seed)
168
+ rng.shuffle(known_different)
169
+ return known_different[:n_pairs]
170
+
171
+
172
+ def generate_benchmark_pairs(
173
+ forest: list[PhotoSequence],
174
+ # Included for consistency, though not directly used
175
+ n_different: int,
176
+ seed: int,
177
+ ) -> tuple[list[Pair], list[Pair], list[int]]:
178
+ """Generates similar (positive) and known-different (negative) photo pairs.
179
+
180
+ Extracts pairs from the forest structure based on sequence relationships.
181
+ """
182
+ # 1. Generate similar (positive) pairs
183
+ positive_pairs: list[Pair] = []
184
+ for obj in forest:
185
+ if hasattr(obj, "get_reference"):
186
+ reference: PhotoFileSeries = obj.get_reference()
187
+ sequences: list[PhotoSequence] = obj.sequences
188
+
189
+ for idx, exemplar in reference.items():
190
+ positive_pairs.extend(
191
+ [
192
+ (exemplar.id, seq.get_reference()[idx].id)
193
+ for seq in sequences
194
+ if idx in seq.get_reference() and exemplar.id != seq.get_reference()[idx].id
195
+ ]
196
+ )
197
+
198
+ # 2. Generate known-different (negative) pairs
199
+ n_diff_limit: int = min(len(positive_pairs), n_different)
200
+ different_pairs: list[Pair] = generate_known_different_pairs(
201
+ forest=forest,
202
+ n_pairs=n_diff_limit,
203
+ seed=seed,
204
+ )
205
+
206
+ # 3. Determine unique IDs
207
+ unique_ids: list[int] = sorted(unique_ids_from_pairs(positive_pairs + different_pairs))
208
+
209
+ return positive_pairs, different_pairs, unique_ids
210
+
211
+
212
+ def _separate_components_by_size(
213
+ components: list[set[int]],
214
+ max_cluster_size: int,
215
+ pairs: Sequence[Pair],
216
+ ) -> tuple[list[set[int]], list[set[int]], list[set[int]]]:
217
+ """Separate components into small, medium, and large based on photo count.
218
+
219
+ Args:
220
+ components: Connected components (sets of pair indices)
221
+ max_cluster_size: Maximum photos per cluster
222
+ pairs: Original pairs list
223
+
224
+ Returns:
225
+ Tuple of (small_components, medium_components, large_components)
226
+ - small: photo_count <= max_cluster_size
227
+ - medium: max_cluster_size < photo_count <= max_cluster_size * 2
228
+ - large: photo_count > max_cluster_size * 2
229
+ """
230
+ small_components: list[set[int]] = []
231
+ medium_components: list[set[int]] = []
232
+ large_components: list[set[int]] = []
233
+
234
+ for comp in components:
235
+ photo_count = len(unique_ids_from_pairs([pairs[i] for i in comp]))
236
+ if photo_count <= max_cluster_size:
237
+ small_components.append(comp)
238
+ elif photo_count > max_cluster_size * 2:
239
+ large_components.append(comp)
240
+ else:
241
+ medium_components.append(comp)
242
+
243
+ return small_components, medium_components, large_components
244
+
245
+
246
+ def _combine_small_components(
247
+ components: list[set[int]],
248
+ max_cluster_size: int,
249
+ pairs: Sequence[Pair],
250
+ ) -> list[set[int]]:
251
+ """Greedily combine small components to maximize cluster utilization.
252
+
253
+ Uses a two-pointer approach: large components on the left, small on the right.
254
+ Fills each cluster starting with the largest component, then adds smaller ones.
255
+
256
+ Args:
257
+ components: Components to combine (assumed pre-sorted by size descending)
258
+ max_cluster_size: Maximum photos per cluster
259
+ pairs: Original pairs list
260
+
261
+ Returns:
262
+ List of combined clusters (as sets of pair indices)
263
+ """
264
+ if not components:
265
+ return []
266
+
267
+ pair_clusters: list[set[int]] = []
268
+ left_idx: int = 0
269
+ right_idx: int = len(components) - 1
270
+
271
+ while left_idx <= right_idx:
272
+ current_cluster: set[int] = set(components[left_idx])
273
+ current_photo_count = len(unique_ids_from_pairs([pairs[i] for i in current_cluster]))
274
+ left_idx += 1
275
+
276
+ while right_idx >= left_idx:
277
+ candidate_photo_count = len(unique_ids_from_pairs([pairs[i] for i in components[right_idx]]))
278
+ if current_photo_count + candidate_photo_count <= max_cluster_size:
279
+ current_cluster.update(components[right_idx])
280
+ current_photo_count += candidate_photo_count
281
+ right_idx -= 1
282
+ else:
283
+ break
284
+
285
+ pair_clusters.append(current_cluster)
286
+
287
+ return pair_clusters
288
+
289
+
290
+ def cluster_pairs_for_scoring(pairs: Sequence[Pair], max_cluster_size: int) -> list[tuple[set[int], list[Pair]]]:
291
+ """Clusters a list of pairs into connected components of limited size.
292
+
293
+ Constrains by unique photo count, not pair count, to respect memory limits.
294
+
295
+ Args:
296
+ pairs: List of photo ID pairs to cluster
297
+ max_cluster_size: Maximum unique photos per cluster (memory constraint)
298
+
299
+ Returns:
300
+ List of (cluster_photos, cluster_pairs) tuples
301
+ """
302
+ # 1. Build graph where nodes are pair indices
303
+ graph: nx.Graph[int] = nx.Graph()
304
+ graph.add_nodes_from(range(len(pairs)))
305
+
306
+ # Connect nodes (pairs) that share a photo ID
307
+ for i in range(len(pairs)):
308
+ for j in range(i + 1, len(pairs)):
309
+ if set(pairs[i]) & set(pairs[j]):
310
+ graph.add_edge(i, j)
311
+
312
+ # 2. Get initial connected components
313
+ components: list[set[int]] = list(nx.connected_components(graph))
314
+
315
+ # 3. Separate and split components by PHOTO count (not pair count)
316
+ small_components, medium_components, large_components = _separate_components_by_size(
317
+ components, max_cluster_size, pairs
318
+ )
319
+
320
+ # Split large components into manageable pieces
321
+ split_pieces: list[set[int]] = []
322
+ for comp in large_components:
323
+ pieces: list[set[int]] = _split_large_component(comp, graph, max_cluster_size, pairs)
324
+ split_pieces.extend(pieces)
325
+
326
+ # Merge all processable components
327
+ small_components.extend(split_pieces)
328
+ small_components.extend(medium_components)
329
+
330
+ # 4. Sort by photo count descending for optimal packing
331
+ small_components.sort(
332
+ key=lambda c: len(unique_ids_from_pairs([pairs[i] for i in c])),
333
+ reverse=True,
334
+ )
335
+
336
+ # 5. Greedy combination for final clusters (by PHOTO count)
337
+ pair_clusters = _combine_small_components(small_components, max_cluster_size, pairs)
338
+
339
+ # 6. Convert indices to pairs
340
+ result: list[tuple[set[int], list[Pair]]] = []
341
+ for pair_indices in pair_clusters:
342
+ cluster_pairs: list[Pair] = [pairs[i] for i in pair_indices]
343
+ cluster_photos: set[int] = unique_ids_from_pairs(cluster_pairs)
344
+ result.append((cluster_photos, cluster_pairs))
345
+
346
+ return result
347
+
348
+
349
+ # --- Analysis Functions ---
350
+
351
+
352
+ def calculate_metrics_at_best_threshold(y_true: NDArray[Any], y_scores: NDArray[Any]) -> dict[str, float]:
353
+ """Calculates metrics by finding the optimal threshold closest to a target FPR.
354
+
355
+ Target False Positive Rate (FPR) is 0.01.
356
+ """
357
+ target_fpr = 0.01
358
+
359
+ # roc_curve expects positive class to have higher score, which is true for similarity
360
+ fpr, _tpr, thresholds = roc_curve(y_true, y_scores)
361
+
362
+ # Find the threshold closest to the target FPR
363
+ diff = fpr - target_fpr
364
+ optimal_idx = np.argmin(np.abs(diff))
365
+ best_threshold = thresholds[optimal_idx]
366
+
367
+ y_pred = (y_scores >= best_threshold).astype(int)
368
+
369
+ auc = roc_auc_score(y_true, y_scores)
370
+ f1 = f1_score(y_true, y_pred, zero_division=0)
371
+ precision = precision_score(y_true, y_pred, zero_division=0)
372
+ recall = recall_score(y_true, y_pred, zero_division=0)
373
+
374
+ # Confusion matrix values
375
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
376
+
377
+ return {
378
+ "auc": float(auc),
379
+ "threshold": float(best_threshold),
380
+ "f1": float(f1),
381
+ "precision": float(precision),
382
+ "recall": float(recall),
383
+ "tpr_at_threshold": float(tp / (tp + fn)) if (tp + fn) > 0 else 0.0,
384
+ "tp": int(tp),
385
+ "tn": int(tn),
386
+ "fp": int(fp),
387
+ "fn": int(fn),
388
+ "fpr_at_threshold": float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0,
389
+ }
390
+
391
+
392
+ def evaluate_consensus_strategy(df_scores: pd.DataFrame, y_true: NDArray[Any]) -> tuple[str, dict[str, float]]:
393
+ """Evaluates the median consensus strategy."""
394
+ median_scores = cast(NDArray[Any], df_scores.median(axis=1).values)
395
+ metrics = calculate_metrics_at_best_threshold(y_true, median_scores)
396
+ return "Median", metrics
397
+
398
+
399
+ def evaluate_voting_strategy(df_scores: pd.DataFrame, y_true: NDArray[Any]) -> tuple[str, dict[str, float]]:
400
+ """Evaluates a balanced voting strategy using majority vote.
401
+
402
+ Finds the best single-method thresholds and applies majority voting.
403
+ """
404
+ method_thresholds = {}
405
+ for method in df_scores.columns:
406
+ method_values = cast(NDArray[Any], df_scores[method].values)
407
+ metrics = calculate_metrics_at_best_threshold(y_true, method_values)
408
+ method_thresholds[method] = metrics["threshold"]
409
+
410
+ # Vote for similarity if score >= individual method's optimal threshold
411
+ votes_df = pd.DataFrame()
412
+ for method, threshold in method_thresholds.items():
413
+ method_values = cast(NDArray[Any], df_scores[method].values)
414
+ votes_df[method] = (method_values >= threshold).astype(int)
415
+
416
+ # Final score is the mean vote (a value between 0.0 and 1.0)
417
+ voting_scores = cast(NDArray[Any], votes_df.mean(axis=1).values)
418
+
419
+ # Use a fixed threshold of 0.5 for the voting mean to get final prediction
420
+ y_pred = (voting_scores >= 0.5).astype(int)
421
+
422
+ auc = roc_auc_score(y_true, voting_scores)
423
+ f1 = f1_score(y_true, y_pred, zero_division=0)
424
+ precision = precision_score(y_true, y_pred, zero_division=0)
425
+ recall = recall_score(y_true, y_pred, zero_division=0)
426
+
427
+ return "Majority Vote", {
428
+ "auc": float(auc),
429
+ "threshold": 0.5,
430
+ "f1": float(f1),
431
+ "precision": float(precision),
432
+ "recall": float(recall),
433
+ "tpr_at_threshold": float(recall),
434
+ "tp": int(np.sum((y_true == 1) & (y_pred == 1))),
435
+ "tn": int(np.sum((y_true == 0) & (y_pred == 0))),
436
+ "fp": int(np.sum((y_true == 0) & (y_pred == 1))),
437
+ "fn": int(np.sum((y_true == 1) & (y_pred == 0))),
438
+ "fpr_at_threshold": float(np.sum((y_true == 0) & (y_pred == 1)) / np.sum(y_true == 0)),
439
+ }
440
+
441
+
442
+ def _prepare_benchmark_data(
443
+ final_scores: _R,
444
+ positive_pairs: list[Pair],
445
+ different_pairs: list[Pair],
446
+ ) -> tuple[pd.DataFrame, NDArray[np.int_]]:
447
+ """Prepare score DataFrame and ground truth labels.
448
+
449
+ Args:
450
+ final_scores: Dict mapping method names to pair scores
451
+ positive_pairs: List of similar photo pairs
452
+ different_pairs: List of dissimilar photo pairs
453
+
454
+ Returns:
455
+ Tuple of (score_dataframe, y_true_labels)
456
+
457
+ Raises:
458
+ ValueError: If no valid pairs remain after dropping NaNs
459
+ """
460
+ all_pairs: list[Pair] = positive_pairs + different_pairs
461
+
462
+ score_data = {method: [final_scores[method][pair] for pair in all_pairs] for method in final_scores}
463
+ df_scores = pd.DataFrame(score_data, index=pd.MultiIndex.from_tuples(all_pairs, names=["photo_a", "photo_b"]))
464
+ df_scores = df_scores.dropna()
465
+
466
+ if len(df_scores) == 0:
467
+ raise ValueError("No valid pairs remain after dropping NaNs")
468
+
469
+ # Align y_true for the pairs that remain (iterate in order to preserve alignment)
470
+ is_positive_mask = [pair in positive_pairs for pair in df_scores.index]
471
+ y_true = np.array(is_positive_mask, dtype=int)
472
+
473
+ return df_scores, y_true
474
+
475
+
476
+ def _calculate_method_metrics(
477
+ df_scores: pd.DataFrame,
478
+ y_true: NDArray[np.int_],
479
+ ) -> tuple[dict[str, dict[str, float]], tuple[str, dict[str, float]], dict[str, dict[str, NDArray[Any]]]]:
480
+ """Calculate performance metrics for each method.
481
+
482
+ Args:
483
+ df_scores: DataFrame with method scores
484
+ y_true: Ground truth labels
485
+
486
+ Returns:
487
+ Tuple of (method_metrics, best_single_method, plot_data)
488
+ """
489
+ method_metrics: dict[str, dict[str, float]] = {}
490
+ best_single_method: tuple[str, dict[str, float]] = ("", {"f1": -1.0})
491
+ plot_data: dict[str, dict[str, NDArray[Any]]] = {}
492
+
493
+ for method in df_scores.columns:
494
+ y_scores: NDArray[Any] = cast(NDArray[Any], df_scores[method].values)
495
+
496
+ metrics = calculate_metrics_at_best_threshold(y_true, y_scores)
497
+ method_metrics[method] = metrics
498
+
499
+ if metrics["f1"] > best_single_method[1]["f1"]:
500
+ best_single_method = (method, metrics)
501
+
502
+ pos_scores = y_scores[y_true == 1]
503
+ neg_scores = y_scores[y_true == 0]
504
+ plot_data[method] = {"pos": pos_scores, "neg": neg_scores}
505
+
506
+ # Calculate Effect Size (Cohen's d) and statistical significance
507
+ cohen_d = (np.mean(pos_scores) - np.mean(neg_scores)) / np.sqrt(
508
+ (np.std(pos_scores, ddof=1) ** 2 + np.std(neg_scores, ddof=1) ** 2) / 2
509
+ )
510
+ ttest_result = ttest_ind(pos_scores, neg_scores, equal_var=False)
511
+ method_metrics[method]["cohen_d"] = float(cohen_d)
512
+ method_metrics[method]["p_value"] = float(ttest_result.pvalue)
513
+
514
+ return method_metrics, best_single_method, plot_data
515
+
516
+
517
+ def _analyze_method_correlations(
518
+ df_scores: pd.DataFrame,
519
+ ) -> tuple[pd.DataFrame, dict[str, float]]:
520
+ """Analyze correlations between methods.
521
+
522
+ Args:
523
+ df_scores: DataFrame with method scores
524
+
525
+ Returns:
526
+ Tuple of (correlation_matrix, independence_scores)
527
+ """
528
+ corr_matrix = df_scores.corr(method="pearson")
529
+
530
+ # Remove diagonal (self-correlation)
531
+ for method in corr_matrix.columns:
532
+ corr_matrix.loc[method, method] = np.nan
533
+
534
+ # Calculate independence (lower correlation = more independent)
535
+ independent_methods = corr_matrix.abs().mean(axis=1).sort_values().to_dict()
536
+
537
+ return corr_matrix, independent_methods
538
+
539
+
540
+ def _generate_visualizations(
541
+ df_scores: pd.DataFrame,
542
+ y_true: NDArray[np.int_],
543
+ method_metrics: dict[str, dict[str, float]],
544
+ plot_data: dict[str, dict[str, NDArray[Any]]],
545
+ corr_matrix: pd.DataFrame,
546
+ output_dir: Path,
547
+ ) -> None:
548
+ """Generate all benchmark visualization plots.
549
+
550
+ Args:
551
+ df_scores: DataFrame with method scores
552
+ y_true: Ground truth labels
553
+ method_metrics: Metrics for each method
554
+ plot_data: Positive/negative score distributions
555
+ corr_matrix: Method correlation matrix
556
+ output_dir: Directory to save plots
557
+ """
558
+ plt.style.use("ggplot")
559
+
560
+ # Distribution histograms for each method
561
+ for method in df_scores.columns:
562
+ save_histogram_comparison(
563
+ pos_data=plot_data[method]["pos"],
564
+ neg_data=plot_data[method]["neg"],
565
+ threshold=method_metrics[method]["threshold"],
566
+ method_name=method,
567
+ output_path=output_dir / f"distribution_{method}.png",
568
+ )
569
+
570
+ # Correlation heatmap
571
+ save_correlation_heatmap(
572
+ corr_matrix=corr_matrix.fillna(0),
573
+ output_path=output_dir / "correlation_heatmap.png",
574
+ )
575
+
576
+ # PCA dimensionality reduction (with graceful fallback)
577
+ try:
578
+ scaler = StandardScaler()
579
+ x_scaled = scaler.fit_transform(df_scores.values)
580
+ pca = PCA(n_components=min(len(df_scores.columns), 3))
581
+ x_pca = pca.fit_transform(x_scaled)
582
+
583
+ save_pca_scatter(
584
+ x_pca=x_pca,
585
+ y_true=y_true,
586
+ explained_variance=pca.explained_variance_ratio_.tolist(),
587
+ output_path=output_dir / "pca_plot.png",
588
+ )
589
+ except Exception as e:
590
+ logging.warning(f"PCA failed: {e}")
591
+
592
+
593
+ def _evaluate_ensemble_strategies(
594
+ df_scores: pd.DataFrame,
595
+ y_true: NDArray[np.int_],
596
+ ) -> tuple[tuple[str, dict[str, float]], tuple[str, dict[str, float]]]:
597
+ """Evaluate consensus and voting ensemble strategies.
598
+
599
+ Args:
600
+ df_scores: DataFrame with method scores
601
+ y_true: Ground truth labels
602
+
603
+ Returns:
604
+ Tuple of (best_consensus, best_voting)
605
+ """
606
+ best_consensus: tuple[str, dict[str, float]] = evaluate_consensus_strategy(df_scores, y_true)
607
+ best_voting: tuple[str, dict[str, float]] = evaluate_voting_strategy(df_scores, y_true)
608
+ return best_consensus, best_voting
609
+
610
+
611
+ def _generate_analysis_report(
+     method_metrics: dict[str, dict[str, float]],
+     best_single_method: tuple[str, dict[str, float]],
+     best_voting: tuple[str, dict[str, float]],
+     best_consensus: tuple[str, dict[str, float]],
+     independent_methods: dict[str, float],
+     n_rows: int,
+     y_true: NDArray[np.int_],
+ ) -> str:
+     """Generate a formatted analysis report using ReportBuilder.
+
+     Args:
+         method_metrics: Performance metrics for each method
+         best_single_method: Best performing single method
+         best_voting: Best voting ensemble strategy
+         best_consensus: Best consensus ensemble strategy
+         independent_methods: Method independence scores
+         n_rows: Total number of pairs scored
+         y_true: Ground truth labels
+
+     Returns:
+         Formatted report string
+     """
+     sorted_methods = sorted(method_metrics.items(), key=lambda item: item[1].get("f1", -1.0), reverse=True)
+
+     report = (
+         ReportBuilder()
+         .add_title("PHOTO BENCHMARK ANALYSIS REPORT")
+         .add_text(f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}")
+         .add_text(f"Total Pairs Scored: {n_rows} (Similar: {np.sum(y_true == 1)}, Dissimilar: {np.sum(y_true == 0)})")
+         .add_section("1. INDIVIDUAL METHOD PERFORMANCE")
+     )
+
+     # Add metrics for each method, best F1 first
+     for method, metrics in sorted_methods:
+         report.add_text(f"Method: {method}")
+         report.add_text(f"  AUC: {metrics['auc']:.4f}")
+         report.add_text(f"  Optimal Threshold (at 1% FPR target): {metrics['threshold']:.4f}")
+         report.add_text(f"  F1 Score: {metrics['f1']:.4f}")
+         report.add_text(f"  Precision: {metrics['precision']:.4f}")
+         report.add_text(f"  Recall/TPR: {metrics['recall']:.4f}")
+         # Cohen's d may be absent; formatting a fallback string with :.3f would raise
+         cohen_d = metrics.get("cohen_d")
+         report.add_text(f"  Effect Size (Cohen's d): {cohen_d:.3f}" if cohen_d is not None else "  Effect Size (Cohen's d): N/A")
+         report.add_blank_line()
+
+     # Add recommendations
+     report.add_section("2. RECOMMENDATIONS")
+     report.add_text("Option A: Best Single Method (highest performance)")
+     report.add_text(f"  Use {best_single_method[0]} with threshold > {best_single_method[1]['threshold']:.4f}")
+     report.add_text(
+         f"  Performance: F1={best_single_method[1]['f1']:.4f}, "
+         f"Precision={best_single_method[1]['precision']:.4f}, "
+         f"Recall={best_single_method[1]['recall']:.4f}"
+     )
+     report.add_blank_line()
+
+     report.add_text("Option B: Voting Strategy (balanced ensemble)")
+     report.add_text(f"  Use {best_voting[0]}")
+     report.add_text(
+         f"  Performance: F1={best_voting[1]['f1']:.4f}, "
+         f"Precision={best_voting[1]['precision']:.4f}, "
+         f"Recall={best_voting[1]['recall']:.4f}"
+     )
+     report.add_blank_line()
+
+     report.add_text("Option C: Consensus Median (most robust ensemble)")
+     report.add_text(f"  Use median score > {best_consensus[1]['threshold']:.4f}")
+     report.add_text(
+         f"  Performance: F1={best_consensus[1]['f1']:.4f}, "
+         f"Precision={best_consensus[1]['precision']:.4f}, "
+         f"Recall={best_consensus[1]['recall']:.4f}"
+     )
+     report.add_blank_line()
+
+     # Add independence analysis
+     report.add_section("3. METHOD INDEPENDENCE (for ensemble)")
+     report.add_text("Most independent methods (lowest average absolute correlation):")
+     for method, avg_corr_val in independent_methods.items():
+         report.add_text(f"  • {method}: avg |r|={avg_corr_val:.3f}")
+
+     return report.build()
+
+
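+ # The assembled report body looks roughly like this (values illustrative only; exact
+ # banners and spacing depend on ReportBuilder's rendering):
+ #     PHOTO BENCHMARK ANALYSIS REPORT
+ #     Generated on: 2024-01-01 12:00:00
+ #     Total Pairs Scored: 1000 (Similar: 500, Dissimilar: 500)
+ #     1. INDIVIDUAL METHOD PERFORMANCE
+ #     Method: dhash
+ #       AUC: 0.9812
+ #       ...
+
+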
+ def _analyze_cascades(
+     df_scores: pd.DataFrame,
+     y_true: NDArray[np.int_],
+     method_metrics: dict[str, dict[str, float]],
+     output_dir: Path,
+ ) -> None:
+     """Test specific cascade combinations and save results.
+
+     Args:
+         df_scores: DataFrame with method scores
+         y_true: Ground truth labels
+         method_metrics: Performance metrics for each method
+         output_dir: Directory to save results
+     """
+     # Only run if all required methods are present
+     if not all(method in df_scores.columns for method in ["dhash", "ssim", "sift"]):
+         logging.info("Skipping cascade analysis - required methods (dhash, ssim, sift) not all present")
+         return
+
+     cascade_results = []
+
+     # Test: dhash only
+     dhash_only = df_scores["dhash"] > method_metrics["dhash"]["threshold"]
+     cascade_results.append(
+         {
+             "cascade": "dhash_only",
+             "f1": f1_score(y_true, dhash_only),
+             "precision": precision_score(y_true, dhash_only),
+             "recall": recall_score(y_true, dhash_only),
+         }
+     )
+
+     # Test: dhash → ssim (current cascade)
+     dhash_pass = df_scores["dhash"] > method_metrics["dhash"]["threshold"]
+     ssim_pass = df_scores["ssim"] > method_metrics["ssim"]["threshold"]
+     current_cascade = dhash_pass & ssim_pass
+     cascade_results.append(
+         {
+             "cascade": "dhash_AND_ssim",
+             "f1": f1_score(y_true, current_cascade),
+             "precision": precision_score(y_true, current_cascade),
+             "recall": recall_score(y_true, current_cascade),
+         }
+     )
+
+     # Test: dhash → sift (independent cascade)
+     sift_pass = df_scores["sift"] > method_metrics["sift"]["threshold"]
+     independent_cascade = dhash_pass & sift_pass
+     cascade_results.append(
+         {
+             "cascade": "dhash_AND_sift",
+             "f1": f1_score(y_true, independent_cascade),
+             "precision": precision_score(y_true, independent_cascade),
+             "recall": recall_score(y_true, independent_cascade),
+         }
+     )
+
+     # Save cascade analysis
+     df_cascade = pd.DataFrame(cascade_results)
+     df_cascade.to_csv(output_dir / CONFIG.paths.CASCADE_COMPARISON, index=False)
+     logging.info(f"Cascade comparison saved to {output_dir / CONFIG.paths.CASCADE_COMPARISON}")
+
+
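+ # To pick a cascade after the run, one could read the saved comparison back in
+ # (a sketch; the column names match the dicts appended above):
+ #     df = pd.read_csv(output_dir / CONFIG.paths.CASCADE_COMPARISON)
+ #     best = df.sort_values("f1", ascending=False).iloc[0]
+ #     print(best["cascade"], best["f1"])
+
+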
+ def post_analysis(
+     final_scores: _R,
+     positive_pairs: list[Pair],
+     different_pairs: list[Pair],
+     output_dir: Path,
+ ) -> None:
+     """Perform the full benchmark analysis and generate reports.
+
+     Includes metrics calculation, correlation analysis, ensemble evaluation, plotting, and report generation.
+     """
+     logging.info("Starting post-analysis phase.")
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # 1. Prepare data
+     try:
+         df_scores, y_true = _prepare_benchmark_data(final_scores, positive_pairs, different_pairs)
+     except ValueError as e:
+         logging.error(f"Data preparation failed: {e}. Aborting analysis.")
+         return
+
+     # Save pair-level scores and ground truth for outlier analysis
+     df_scores.to_csv(output_dir / CONFIG.paths.PAIR_SCORES)
+     pair_ground_truth = pd.DataFrame(
+         {
+             "photo_a": [pair[0] for pair in df_scores.index],
+             "photo_b": [pair[1] for pair in df_scores.index],
+             "ground_truth": ["similar" if is_pos else "dissimilar" for is_pos in (y_true == 1)],
+         }
+     )
+     pair_ground_truth.to_csv(output_dir / CONFIG.paths.PAIR_GROUND_TRUTH, index=False)
+     logging.info(
+         f"Saved {len(df_scores)} pair-level scores to {CONFIG.paths.PAIR_SCORES} and {CONFIG.paths.PAIR_GROUND_TRUTH}"
+     )
+
+     # 2. Calculate individual method metrics
+     method_metrics, best_single_method, plot_data = _calculate_method_metrics(df_scores, y_true)
+
+     # 3. Analyze correlations and independence
+     corr_matrix, independent_methods = _analyze_method_correlations(df_scores)
+
+     # 4. Evaluate ensemble strategies
+     best_consensus, best_voting = _evaluate_ensemble_strategies(df_scores, y_true)
+
+     # 5. Generate visualizations
+     _generate_visualizations(df_scores, y_true, method_metrics, plot_data, corr_matrix, output_dir)
+
+     # 6. Generate analysis report
+     report_text = _generate_analysis_report(
+         method_metrics,
+         best_single_method,
+         best_voting,
+         best_consensus,
+         independent_methods,
+         len(df_scores),
+         y_true,
+     )
+     report_file = output_dir / CONFIG.paths.ANALYSIS_RECOMMENDATIONS
+     report_file.write_text(report_text, encoding="utf-8")
+
+     # 7. Analyze cascade combinations
+     _analyze_cascades(df_scores, y_true, method_metrics, output_dir)
+
+     # 8. Save derived data to CSV
+     df_metrics = pd.DataFrame(method_metrics).T
+     df_metrics.to_csv(output_dir / CONFIG.paths.METHOD_METRICS)
+     corr_matrix.to_csv(output_dir / CONFIG.paths.METHOD_CORRELATIONS)
+
+     logging.info(f"Analysis complete. Report written to {report_file}")