photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_versions.py CHANGED
@@ -1,483 +1,483 @@
1
- """Classes and helper functions for processing a bin of photos which have equivalent filename templates.
2
-
3
- Bins according to template core (ie the middle section of the filename which varies between files).
4
- """
5
-
6
- from __future__ import annotations
7
-
8
- from collections import defaultdict
9
- from collections.abc import Callable
10
- from typing import Any, cast
11
-
12
- import pandas as pd
13
-
14
- from .comparison_gates import GateName, GateSequence
15
- from .config import CONFIG
16
- from .logger import get_logger
17
- from .models import ReviewType
18
- from .photo_file import ImageData, PhotoFile, pick_exemplar_from_class
19
- from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
20
- from .ports import InputPort, OutputPort
21
- from .review_utils import build_sequence_group
22
- from .sequence import (
23
- INDEX_T,
24
- PhotoFileSeries,
25
- PhotoSequence,
26
- count_forest_ref_photos,
27
- count_forest_total_photos,
28
- )
29
- from .template import partial_format
30
-
31
-
32
- class ComputeVersions(
33
- PipelineStage[
34
- tuple[str, list[tuple[INDEX_T, PhotoFile]]], # S: template bin
35
- PhotoSequence, # T: work data
36
- tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]], # R: accumulator
37
- ]
38
- ):
39
- """From a dict of photos binned by filename template, produce a sequence which has detected any part of the filename that is a version."""
40
-
41
- def __init__(self) -> None:
42
- """Initialize ComputeVersions pipeline stage.
43
-
44
- Configures stage to save results to forest_versions_pkl and identifies
45
- as "Version Detection" in logs and metadata.
46
-
47
- """
48
- super().__init__(
49
- path=CONFIG.paths.forest_versions_pkl,
50
- stage_name="Version Detection",
51
- )
52
-
53
- # Store worker argument for port-based execution
54
- self.args = self.stage_name
55
-
56
- # Create input port for template bins (from ComputeTemplates)
57
- self.template_bins_i: InputPort[dict[str, list[tuple[INDEX_T, PhotoFile]]]] = InputPort("template_bins")
58
-
59
- # Create output ports - separate ports per downstream consumer (Decision 6)
60
- # Full tuple output (for backward compatibility or review)
61
- self.forest_template_bins_o: OutputPort[
62
- tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]]
63
- ] = OutputPort(self, getter=lambda: self.result)
64
-
65
- # Template remainder bins output (for ComputeTemplateSimilarity)
66
- self.template_remainder_bins_o: OutputPort[dict[str, list[tuple[PhotoSequence, str]]]] = OutputPort(
67
- self, getter=lambda: self.result[1]
68
- )
69
-
70
- # Debug counter for tracking accumulate_results calls
71
- self.accumulate_count = 0
72
-
73
- def prepare(
74
- self,
75
- ) -> PrepareResult[
76
- tuple[str, list[tuple[INDEX_T, PhotoFile]]],
77
- tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]],
78
- ]:
79
- """Prepare template bins for parallel processing.
80
-
81
- Sorts bins by size (descending) to process largest bins first, improving
82
- load balancing across worker processes.
83
-
84
- Template bins are read from the input port.
85
-
86
- Returns:
87
- Tuple of (work_items, accumulator) where:
88
- - work_items: List of (template_key, photos) tuples sorted by photo count
89
- - accumulator: Tuple of (forest, bins) where:
90
- - forest: Empty list for collecting all PhotoSequence objects
91
- - bins: Defaultdict for collecting (PhotoSequence, prefix) pairs grouped by template_remainder
92
- """
93
- # Read from input port to get template bins
94
- bins: dict[str, list[tuple[INDEX_T, PhotoFile]]] = self.template_bins_i.read()
95
- # Get reference counts from upstream (for ungrouped photos, ref == total)
96
- self.ref_photos_init = self.template_bins_i.get_ref_photo_count()
97
- self.ref_seqs_init = self.template_bins_i.get_ref_sequence_count()
98
- # Count total photos for internal invariant checking (should never change)
99
- self.total_photos = sum(len(photo_list) for photo_list in bins.values())
100
-
101
- work: list[tuple[str, list[tuple[INDEX_T, PhotoFile]]]] = sorted(bins.items(), key=lambda p: -len(p[1]))
102
-
103
- # ASSERTION: Verify we have exactly one work item per input sequence
104
- assert len(work) == self.ref_seqs_init, (
105
- f"Work item count mismatch: have {len(work)} work items but expected {self.ref_seqs_init} from upstream"
106
- )
107
-
108
- # ASSERTION: Verify all photos accounted for in work items
109
- photos_in_work = sum(len(photo_list) for _, photo_list in work)
110
- assert photos_in_work == self.total_photos, (
111
- f"Lost photos in prepare: started {self.total_photos}, have {photos_in_work} in work items"
112
- )
113
-
114
- return work, ([], defaultdict(list))
115
-
116
- @classmethod
117
- def _test_field_as_version_dimension(
118
- cls,
119
- df: pd.DataFrame,
120
- field_idx: str,
121
- photo_dict: dict[int, PhotoFile],
122
- compare: Callable[[int, int, ImageData | None, ImageData | None], tuple[bool, float]],
123
- ) -> bool:
124
- """Test if a field represents a version dimension.
125
-
126
- Criteria:
127
- 1. No more than MAX_MISMATCHES vs the reference sequence
128
- 2. More hits than misses (a reference photo is considered a hit)
129
-
130
- Args:
131
- df: DataFrame with photo indices
132
- field_idx: Field column name to test
133
- photo_dict: Mapping from photo ID to PhotoFile
134
- compare: Comparison function accepting optional ImageData for photo similarity
135
-
136
- Returns:
137
- True if field represents a version dimension
138
- """
139
- # OPTIMIZATION: Use dict-based grouping instead of expensive pivot_table
140
- # Group photos by position (all fields except the one being tested)
141
- group_cols = [c for c in df.columns if c not in [field_idx, "Index"]]
142
-
143
- # Build position -> {field_value -> [photo_ids]} mapping
144
- position_photos: dict[tuple[Any, ...], dict[Any, list[int]]] = defaultdict(lambda: defaultdict(list))
145
-
146
- for row in df.itertuples(index=False):
147
- # Position is the tuple of values for all non-test fields
148
- position_key = tuple(getattr(row, col) for col in group_cols) if group_cols else ()
149
- field_value = getattr(row, field_idx)
150
- photo_id = cast(int, row.Index)
151
- position_photos[position_key][field_value].append(photo_id)
152
-
153
- # Find positions with multiple field values (potential versions)
154
- positions_to_check = [
155
- (pos, field_groups) for pos, field_groups in position_photos.items() if len(field_groups) > 1
156
- ]
157
-
158
- if not positions_to_check:
159
- return False
160
-
161
- misses = 0
162
- hits = 0
163
- for _position, field_groups in positions_to_check:
164
- # Collect all photos at this position across all field values
165
- vset = {pid for photo_list in field_groups.values() for pid in photo_list}
166
- # Find the exemplar photo
167
- ex_id = pick_exemplar_from_class(photo_dict, vset)
168
-
169
- # OPTIMIZATION: Create ImageData once for exemplar, reuse across all comparisons
170
- with photo_dict[ex_id].image_data() as ex_img:
171
- # Check whether each photo is similar to the exemplar
172
- # Early break on first mismatch to save comparisons
173
- matches = True
174
- for pid in vset:
175
- if pid != ex_id:
176
- passes, _ = compare(ex_id, pid, ex_img, None)
177
- if not passes:
178
- matches = False
179
- break # Early exit - no need to check remaining photos
180
-
181
- if not matches:
182
- misses += 1
183
- if misses > CONFIG.sequences.MAX_MISMATCHES:
184
- return False
185
- else:
186
- hits += 1
187
-
188
- return misses < hits
189
-
190
- @classmethod
191
- def _create_reference_sequence(
192
- cls,
193
- df: pd.DataFrame,
194
- version_columns: list[str],
195
- photo_dict: dict[int, PhotoFile],
196
- template_key: str,
197
- ) -> tuple[PhotoFileSeries, dict[INDEX_T, PhotoFile]]:
198
- """Create reference sequence by removing version columns.
199
-
200
- Args:
201
- df: DataFrame with photo indices
202
- version_columns: List of column names that are version dimensions
203
- photo_dict: Mapping from photo ID to PhotoFile
204
- template_key: Original template key
205
-
206
- Returns:
207
- Tuple of (reference_series, reference_sequence_dict)
208
- """
209
- # OPTIMIZATION: Use dict-based grouping instead of pivot + reverse_pivot
210
- group_columns = [c for c in df.columns[1:] if c not in version_columns]
211
-
212
- # Group photos by position (non-version fields) and pick exemplar for each position
213
- position_photos: dict[tuple[Any, ...], set[int]] = defaultdict(set)
214
-
215
- for row in df.itertuples(index=False):
216
- # Position is the tuple of values for all non-version fields
217
- position_key = tuple(getattr(row, col) for col in group_columns) if group_columns else ()
218
- photo_id = cast(int, row.Index)
219
- position_photos[position_key].add(photo_id)
220
-
221
- # Pick exemplar for each position
222
- ref_seq: dict[INDEX_T, PhotoFile] = {}
223
- for position, photo_ids in position_photos.items():
224
- exemplar_id = pick_exemplar_from_class(photo_dict, photo_ids)
225
- ref_seq[position] = photo_dict[exemplar_id]
226
-
227
- # Remap remaining fields in template
228
- vcol_remap = {c: f"{{P{i}}}" for i, c in enumerate(group_columns)}
229
- ref_series = PhotoFileSeries(
230
- ref_seq,
231
- name=(partial_format(template_key, dict.fromkeys(version_columns, "V")).format_map(vcol_remap)),
232
- normal=False, # Don't remove fields from the template!
233
- )
234
-
235
- return ref_series, ref_seq
236
-
237
- @classmethod
238
- def _create_version_sequences(
239
- cls,
240
- df: pd.DataFrame,
241
- version_columns: list[str],
242
- ref_seq: dict[INDEX_T, PhotoFile],
243
- photo_dict: dict[int, PhotoFile],
244
- template_key: str,
245
- compare: Callable[[int, int], tuple[bool, float]],
246
- ) -> list[PhotoSequence]:
247
- """Create individual version sequences.
248
-
249
- Args:
250
- df: DataFrame with photo indices
251
- version_columns: List of column names that are version dimensions
252
- ref_seq: Reference sequence dictionary
253
- photo_dict: Mapping from photo ID to PhotoFile
254
- template_key: Original template key
255
- compare: Comparison function for photo similarity
256
-
257
- Returns:
258
- List of PhotoSequence objects for each version
259
- """
260
- # OPTIMIZATION: Use dict-based grouping instead of pivot + recover_rows
261
- group_columns = [c for c in df.columns[1:] if c not in version_columns]
262
- vcol_remap = {c: f"{{P{i}}}" for i, c in enumerate(group_columns)}
263
-
264
- # Group photos by (position, version_values) tuple
265
- version_photos: dict[tuple[Any, ...], dict[INDEX_T, int]] = defaultdict(dict)
266
-
267
- for row in df.itertuples(index=False):
268
- # Position is the tuple of values for all non-version fields
269
- position = tuple(getattr(row, col) for col in group_columns) if group_columns else ()
270
- # Version values tuple
271
- version_values = tuple(getattr(row, col) for col in version_columns)
272
- photo_id = cast(int, row.Index)
273
-
274
- version_photos[version_values][position] = photo_id
275
-
276
- # Create PhotoSequence for each version
277
- version_sequences = []
278
- for version_values, position_photo_ids in version_photos.items():
279
- # Build index_to_photo mapping
280
- index_to_photo: dict[INDEX_T, PhotoFile] = {pos: photo_dict[pid] for pos, pid in position_photo_ids.items()}
281
-
282
- seq_series = PhotoFileSeries(
283
- index_to_photo,
284
- name=partial_format(template_key, dict(zip(version_columns, version_values, strict=False))).format_map(
285
- vcol_remap
286
- ),
287
- normal=False,
288
- )
289
-
290
- # Cache similarity scores
291
- for idx, p in index_to_photo.items():
292
- exemplar_photo = ref_seq[idx]
293
- _passes, similarity = compare(p.id, exemplar_photo.id)
294
- p.cache["SEQUENCE_EXEMPLAR"] = exemplar_photo
295
- p.cache["SEQUENCE_SIMILARITY"] = similarity
296
-
297
- version_sequences.append(PhotoSequence(seq_series))
298
-
299
- return version_sequences
300
-
301
- @classmethod
302
- def stage_worker(
303
- cls, bin_data: tuple[str, list[tuple[INDEX_T, PhotoFile]]], created_by: str
304
- ) -> WorkerResult[PhotoSequence]:
305
- """Analyze a template bin for version patterns.
306
-
307
- Worker function that analyzes one template bin to detect version dimensions.
308
- Tests each field individually to see if it represents a version dimension.
309
-
310
- PhotoSequence construction automatically handles template normalization including:
311
- - Constant substitution
312
- - Common prefix folding
313
- - Variable renumbering
314
- - Index normalization
315
-
316
- Args:
317
- bin_data: Tuple of (template_key, list of PhotoFile objects)
318
- created_by: Process creating this sequence
319
-
320
- Returns:
321
- PhotoSequence with normalized template components stored as attributes
322
- """
323
- template_key, photo_tuples = bin_data
324
- n_photos: int = len(photo_tuples)
325
-
326
- # Extract normalized template and indices
327
- whole_sequence: PhotoFileSeries = PhotoFileSeries(dict(photo_tuples), name=template_key)
328
- template_key = whole_sequence.name
329
- assert len(whole_sequence) == n_photos
330
-
331
- # Handle bins with < 2 photos (no versions possible)
332
- if len(photo_tuples) < 2:
333
- return [], [], PhotoSequence(whole_sequence, created_by=created_by)
334
-
335
- # Setup dataframe and comparison infrastructure
336
- photo_dict: dict[int, PhotoFile] = {p.id: p for _, p in photo_tuples}
337
- df = pd.DataFrame([[p.id, *idx] for idx, p in whole_sequence.items()])
338
- df.columns = ["Index"] + [f"P{i}" for i in range(df.shape[1] - 1)]
339
- assert n_photos == df.shape[0]
340
-
341
- # Create comparison function with caching
342
- gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
343
- simcache: dict[tuple[int, int], tuple[bool, float]] = {}
344
-
345
- def compare(
346
- x: int,
347
- y: int,
348
- x_img: ImageData | None = None,
349
- y_img: ImageData | None = None,
350
- ) -> tuple[bool, float]:
351
- """Memo function to compare two photo ids with optional pre-created ImageData."""
352
- p = (x, y) if x < y else (y, x)
353
- if p not in simcache:
354
- passes, _score, similarity = gates.compare_with_rotation(
355
- photo_dict[x],
356
- photo_dict[y],
357
- ref_img=x_img,
358
- cand_img=y_img,
359
- )
360
- simcache[p] = (passes, similarity)
361
- return simcache[p]
362
-
363
- # Test fields from smallest to largest (by unique value count)
364
- field_sizes: list[tuple[str, int]] = [(field_idx, len(df[field_idx].unique())) for field_idx in df.columns[1:]]
365
- field_sizes.sort(key=lambda x: x[1])
366
-
367
- # Find version columns by testing each field
368
- version_columns: list[str] = []
369
- for field_idx, unique_count in field_sizes:
370
- # Break if too many unique values (list is sorted by count)
371
- if unique_count > CONFIG.sequences.MAX_COMPONENT_SIZE:
372
- break
373
-
374
- if cls._test_field_as_version_dimension(df, field_idx, photo_dict, compare):
375
- version_columns.append(field_idx)
376
-
377
- # No versions detected - return whole sequence
378
- if not version_columns:
379
- return [], [], PhotoSequence(whole_sequence, created_by=created_by)
380
-
381
- # Create reference sequence (removing version dimensions)
382
- ref_series, ref_seq = cls._create_reference_sequence(df, version_columns, photo_dict, template_key)
383
-
384
- # Create individual version sequences
385
- version_sequences = cls._create_version_sequences(
386
- df,
387
- version_columns,
388
- ref_seq,
389
- photo_dict,
390
- template_key,
391
- compare,
392
- )
393
-
394
- # Build final result
395
- result = PhotoSequence(ref_series, version_sequences, created_by=created_by)
396
- assert result.n_photos == n_photos, f"Lost photos: expected={n_photos}, achieved={result.n_photos}"
397
-
398
- return [], [build_sequence_group(result)], result
399
-
400
- def accumulate_results(
401
- self,
402
- accum: tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]],
403
- seq: PhotoSequence,
404
- ) -> None:
405
- """Accumulate PhotoSequence results from worker into forest and template remainder bins.
406
-
407
- Groups sequences by their template_remainder attribute,
408
- and also maintains the complete forest for review.
409
-
410
- Args:
411
- accum: Tuple of (forest, bins) where:
412
- - forest: List collecting all PhotoSequence objects
413
- - bins: Dictionary accumulating (sequence, prefix) pairs by template_remainder
414
- seq: PhotoSequence from stage_worker with normalized template components
415
- """
416
- forest, bins = accum
417
-
418
- # Debug counter
419
- self.accumulate_count += 1
420
-
421
- # Add to forest (deduplicated by object identity happens in review server)
422
- forest.append(seq)
423
-
424
- # Add to template remainder bins
425
- # Extract template components from PhotoSequence attributes
426
- # template_prefix is the constant prefix before first variable
427
- # template_remainder is the variable structure (used as grouping key)
428
- bins[seq.template_remainder].append((seq, seq.reverse_prefix))
429
-
430
- def finalise(self) -> None:
431
- forest = self.result[0]
432
- self.ref_photos_final = count_forest_ref_photos(forest)
433
- # ComputeVersions is the first stage creating PhotoSequences
434
- # Count top-level sequences (one per input template bin), not children
435
- # Children are versions detected WITHIN a bin, not separate input sequences
436
- self.ref_seqs_final = len(forest)
437
-
438
- # ComputeVersions is the FIRST grouping stage - it receives ungrouped photos
439
- # and creates version groups. Must preserve TOTAL photos, not just references.
440
- # Downstream stages receive grouped data and track only references.
441
- photos_final = count_forest_total_photos(forest)
442
- # Verify we have same number of sequences as input template bins
443
- seqs_final = len(forest)
444
-
445
- # Debug logging
446
- get_logger().info(
447
- f"ComputeVersions finalise: accumulate_results called {self.accumulate_count} times, "
448
- f"forest has {seqs_final} sequences, expected {self.ref_seqs_init}"
449
- )
450
-
451
- assert seqs_final == self.ref_seqs_init, (
452
- f"Sequence count mismatch in {self.stage_name}: "
453
- f"started with {self.ref_seqs_init} but ended with {seqs_final} "
454
- f"(accumulate_results was called {self.accumulate_count} times)"
455
- )
456
- assert photos_final == self.total_photos, (
457
- f"Started with {self.total_photos} photos but ended up with {photos_final}"
458
- )
459
-
460
- def needs_review(self) -> ReviewType:
461
- """This stage produces sequence groups (version-detected sequences).
462
-
463
- Returns:
464
- "sequences" to indicate this stage produces reviewable sequence groups
465
- """
466
- return "sequences"
467
-
468
- def has_review_data(self) -> bool:
469
- """Check if there are any version sequence groups to review.
470
-
471
- Returns:
472
- True if forest has classes (multi-sequence groups), False otherwise
473
- """
474
- # Check if stage has run
475
- if not hasattr(self, "result") or self.result is None:
476
- return False
477
-
478
- # Check if there are any classes (multi-sequence groups) in the forest
479
- forest = self.result[0]
480
- return any(seq.is_class() for seq in forest)
481
-
482
- # Typed result field - tuple of (forest, template_bins)
483
- result: tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]]
1
+ """Classes and helper functions for processing a bin of photos which have equivalent filename templates.
2
+
3
+ Bins according to template core (ie the middle section of the filename which varies between files).
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from collections import defaultdict
9
+ from collections.abc import Callable
10
+ from typing import Any, cast
11
+
12
+ import pandas as pd
13
+
14
+ from .comparison_gates import GateName, GateSequence
15
+ from .config import CONFIG
16
+ from .logger import get_logger
17
+ from .models import ReviewType
18
+ from .photo_file import ImageData, PhotoFile, pick_exemplar_from_class
19
+ from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
20
+ from .ports import InputPort, OutputPort
21
+ from .review_utils import build_sequence_group
22
+ from .sequence import (
23
+ INDEX_T,
24
+ PhotoFileSeries,
25
+ PhotoSequence,
26
+ count_forest_ref_photos,
27
+ count_forest_total_photos,
28
+ )
29
+ from .template import partial_format
30
+
31
+
32
+ class ComputeVersions(
33
+ PipelineStage[
34
+ tuple[str, list[tuple[INDEX_T, PhotoFile]]], # S: template bin
35
+ PhotoSequence, # T: work data
36
+ tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]], # R: accumulator
37
+ ]
38
+ ):
39
+ """From a dict of photos binned by filename template, produce a sequence which has detected any part of the filename that is a version."""
40
+
41
+ def __init__(self) -> None:
42
+ """Initialize ComputeVersions pipeline stage.
43
+
44
+ Configures stage to save results to forest_versions_pkl and identifies
45
+ as "Version Detection" in logs and metadata.
46
+
47
+ """
48
+ super().__init__(
49
+ path=CONFIG.paths.forest_versions_pkl,
50
+ stage_name="Version Detection",
51
+ )
52
+
53
+ # Store worker argument for port-based execution
54
+ self.args = self.stage_name
55
+
56
+ # Create input port for template bins (from ComputeTemplates)
57
+ self.template_bins_i: InputPort[dict[str, list[tuple[INDEX_T, PhotoFile]]]] = InputPort("template_bins")
58
+
59
+ # Create output ports - separate ports per downstream consumer (Decision 6)
60
+ # Full tuple output (for backward compatibility or review)
61
+ self.forest_template_bins_o: OutputPort[
62
+ tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]]
63
+ ] = OutputPort(self, getter=lambda: self.result)
64
+
65
+ # Template remainder bins output (for ComputeTemplateSimilarity)
66
+ self.template_remainder_bins_o: OutputPort[dict[str, list[tuple[PhotoSequence, str]]]] = OutputPort(
67
+ self, getter=lambda: self.result[1]
68
+ )
69
+
70
+ # Debug counter for tracking accumulate_results calls
71
+ self.accumulate_count = 0
72
+
73
+ def prepare(
74
+ self,
75
+ ) -> PrepareResult[
76
+ tuple[str, list[tuple[INDEX_T, PhotoFile]]],
77
+ tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]],
78
+ ]:
79
+ """Prepare template bins for parallel processing.
80
+
81
+ Sorts bins by size (descending) to process largest bins first, improving
82
+ load balancing across worker processes.
83
+
84
+ Template bins are read from the input port.
85
+
86
+ Returns:
87
+ Tuple of (work_items, accumulator) where:
88
+ - work_items: List of (template_key, photos) tuples sorted by photo count
89
+ - accumulator: Tuple of (forest, bins) where:
90
+ - forest: Empty list for collecting all PhotoSequence objects
91
+ - bins: Defaultdict for collecting (PhotoSequence, prefix) pairs grouped by template_remainder
92
+ """
93
+ # Read from input port to get template bins
94
+ bins: dict[str, list[tuple[INDEX_T, PhotoFile]]] = self.template_bins_i.read()
95
+ # Get reference counts from upstream (for ungrouped photos, ref == total)
96
+ self.ref_photos_init = self.template_bins_i.get_ref_photo_count()
97
+ self.ref_seqs_init = self.template_bins_i.get_ref_sequence_count()
98
+ # Count total photos for internal invariant checking (should never change)
99
+ self.total_photos = sum(len(photo_list) for photo_list in bins.values())
100
+
101
+ work: list[tuple[str, list[tuple[INDEX_T, PhotoFile]]]] = sorted(bins.items(), key=lambda p: -len(p[1]))
102
+
103
+ # ASSERTION: Verify we have exactly one work item per input sequence
104
+ assert len(work) == self.ref_seqs_init, (
105
+ f"Work item count mismatch: have {len(work)} work items but expected {self.ref_seqs_init} from upstream"
106
+ )
107
+
108
+ # ASSERTION: Verify all photos accounted for in work items
109
+ photos_in_work = sum(len(photo_list) for _, photo_list in work)
110
+ assert photos_in_work == self.total_photos, (
111
+ f"Lost photos in prepare: started {self.total_photos}, have {photos_in_work} in work items"
112
+ )
113
+
114
+ return work, ([], defaultdict(list))
115
+
116
+ @classmethod
117
+ def _test_field_as_version_dimension(
118
+ cls,
119
+ df: pd.DataFrame,
120
+ field_idx: str,
121
+ photo_dict: dict[int, PhotoFile],
122
+ compare: Callable[[int, int, ImageData | None, ImageData | None], tuple[bool, float]],
123
+ ) -> bool:
124
+ """Test if a field represents a version dimension.
125
+
126
+ Criteria:
127
+ 1. No more than MAX_MISMATCHES vs the reference sequence
128
+ 2. More hits than misses (a reference photo is considered a hit)
129
+
130
+ Args:
131
+ df: DataFrame with photo indices
132
+ field_idx: Field column name to test
133
+ photo_dict: Mapping from photo ID to PhotoFile
134
+ compare: Comparison function accepting optional ImageData for photo similarity
135
+
136
+ Returns:
137
+ True if field represents a version dimension
138
+ """
139
+ # OPTIMIZATION: Use dict-based grouping instead of expensive pivot_table
140
+ # Group photos by position (all fields except the one being tested)
141
+ group_cols = [c for c in df.columns if c not in [field_idx, "Index"]]
142
+
143
+ # Build position -> {field_value -> [photo_ids]} mapping
144
+ position_photos: dict[tuple[Any, ...], dict[Any, list[int]]] = defaultdict(lambda: defaultdict(list))
145
+
146
+ for row in df.itertuples(index=False):
147
+ # Position is the tuple of values for all non-test fields
148
+ position_key = tuple(getattr(row, col) for col in group_cols) if group_cols else ()
149
+ field_value = getattr(row, field_idx)
150
+ photo_id = cast(int, row.Index)
151
+ position_photos[position_key][field_value].append(photo_id)
152
+
153
+ # Find positions with multiple field values (potential versions)
154
+ positions_to_check = [
155
+ (pos, field_groups) for pos, field_groups in position_photos.items() if len(field_groups) > 1
156
+ ]
157
+
158
+ if not positions_to_check:
159
+ return False
160
+
161
+ misses = 0
162
+ hits = 0
163
+ for _position, field_groups in positions_to_check:
164
+ # Collect all photos at this position across all field values
165
+ vset = {pid for photo_list in field_groups.values() for pid in photo_list}
166
+ # Find the exemplar photo
167
+ ex_id = pick_exemplar_from_class(photo_dict, vset)
168
+
169
+ # OPTIMIZATION: Create ImageData once for exemplar, reuse across all comparisons
170
+ with photo_dict[ex_id].image_data() as ex_img:
171
+ # Check whether each photo is similar to the exemplar
172
+ # Early break on first mismatch to save comparisons
173
+ matches = True
174
+ for pid in vset:
175
+ if pid != ex_id:
176
+ passes, _ = compare(ex_id, pid, ex_img, None)
177
+ if not passes:
178
+ matches = False
179
+ break # Early exit - no need to check remaining photos
180
+
181
+ if not matches:
182
+ misses += 1
183
+ if misses > CONFIG.sequences.MAX_MISMATCHES:
184
+ return False
185
+ else:
186
+ hits += 1
187
+
188
+ return misses < hits
189
+
190
+ @classmethod
191
+ def _create_reference_sequence(
192
+ cls,
193
+ df: pd.DataFrame,
194
+ version_columns: list[str],
195
+ photo_dict: dict[int, PhotoFile],
196
+ template_key: str,
197
+ ) -> tuple[PhotoFileSeries, dict[INDEX_T, PhotoFile]]:
198
+ """Create reference sequence by removing version columns.
199
+
200
+ Args:
201
+ df: DataFrame with photo indices
202
+ version_columns: List of column names that are version dimensions
203
+ photo_dict: Mapping from photo ID to PhotoFile
204
+ template_key: Original template key
205
+
206
+ Returns:
207
+ Tuple of (reference_series, reference_sequence_dict)
208
+ """
209
+ # OPTIMIZATION: Use dict-based grouping instead of pivot + reverse_pivot
210
+ group_columns = [c for c in df.columns[1:] if c not in version_columns]
211
+
212
+ # Group photos by position (non-version fields) and pick exemplar for each position
213
+ position_photos: dict[tuple[Any, ...], set[int]] = defaultdict(set)
214
+
215
+ for row in df.itertuples(index=False):
216
+ # Position is the tuple of values for all non-version fields
217
+ position_key = tuple(getattr(row, col) for col in group_columns) if group_columns else ()
218
+ photo_id = cast(int, row.Index)
219
+ position_photos[position_key].add(photo_id)
220
+
221
+ # Pick exemplar for each position
222
+ ref_seq: dict[INDEX_T, PhotoFile] = {}
223
+ for position, photo_ids in position_photos.items():
224
+ exemplar_id = pick_exemplar_from_class(photo_dict, photo_ids)
225
+ ref_seq[position] = photo_dict[exemplar_id]
226
+
227
+ # Remap remaining fields in template
228
+ vcol_remap = {c: f"{{P{i}}}" for i, c in enumerate(group_columns)}
229
+ ref_series = PhotoFileSeries(
230
+ ref_seq,
231
+ name=(partial_format(template_key, dict.fromkeys(version_columns, "V")).format_map(vcol_remap)),
232
+ normal=False, # Don't remove fields from the template!
233
+ )
234
+
235
+ return ref_series, ref_seq
236
+
237
+ @classmethod
238
+ def _create_version_sequences(
239
+ cls,
240
+ df: pd.DataFrame,
241
+ version_columns: list[str],
242
+ ref_seq: dict[INDEX_T, PhotoFile],
243
+ photo_dict: dict[int, PhotoFile],
244
+ template_key: str,
245
+ compare: Callable[[int, int], tuple[bool, float]],
246
+ ) -> list[PhotoSequence]:
247
+ """Create individual version sequences.
248
+
249
+ Args:
250
+ df: DataFrame with photo indices
251
+ version_columns: List of column names that are version dimensions
252
+ ref_seq: Reference sequence dictionary
253
+ photo_dict: Mapping from photo ID to PhotoFile
254
+ template_key: Original template key
255
+ compare: Comparison function for photo similarity
256
+
257
+ Returns:
258
+ List of PhotoSequence objects for each version
259
+ """
260
+ # OPTIMIZATION: Use dict-based grouping instead of pivot + recover_rows
261
+ group_columns = [c for c in df.columns[1:] if c not in version_columns]
262
+ vcol_remap = {c: f"{{P{i}}}" for i, c in enumerate(group_columns)}
263
+
264
+ # Group photos by (position, version_values) tuple
265
+ version_photos: dict[tuple[Any, ...], dict[INDEX_T, int]] = defaultdict(dict)
266
+
267
+ for row in df.itertuples(index=False):
268
+ # Position is the tuple of values for all non-version fields
269
+ position = tuple(getattr(row, col) for col in group_columns) if group_columns else ()
270
+ # Version values tuple
271
+ version_values = tuple(getattr(row, col) for col in version_columns)
272
+ photo_id = cast(int, row.Index)
273
+
274
+ version_photos[version_values][position] = photo_id
275
+
276
+ # Create PhotoSequence for each version
277
+ version_sequences = []
278
+ for version_values, position_photo_ids in version_photos.items():
279
+ # Build index_to_photo mapping
280
+ index_to_photo: dict[INDEX_T, PhotoFile] = {pos: photo_dict[pid] for pos, pid in position_photo_ids.items()}
281
+
282
+ seq_series = PhotoFileSeries(
283
+ index_to_photo,
284
+ name=partial_format(template_key, dict(zip(version_columns, version_values, strict=False))).format_map(
285
+ vcol_remap
286
+ ),
287
+ normal=False,
288
+ )
289
+
290
+ # Cache similarity scores
291
+ for idx, p in index_to_photo.items():
292
+ exemplar_photo = ref_seq[idx]
293
+ _passes, similarity = compare(p.id, exemplar_photo.id)
294
+ p.cache["SEQUENCE_EXEMPLAR"] = exemplar_photo
295
+ p.cache["SEQUENCE_SIMILARITY"] = similarity
296
+
297
+ version_sequences.append(PhotoSequence(seq_series))
298
+
299
+ return version_sequences
300
+
301
    @classmethod
    def stage_worker(
        cls, bin_data: tuple[str, list[tuple[INDEX_T, PhotoFile]]], created_by: str
    ) -> WorkerResult[PhotoSequence]:
        """Analyze a template bin for version patterns.

        Worker function that analyzes one template bin to detect version dimensions.
        Tests each field individually to see if it represents a version dimension.

        PhotoSequence construction automatically handles template normalization including:
        - Constant substitution
        - Common prefix folding
        - Variable renumbering
        - Index normalization

        Args:
            bin_data: Tuple of (template_key, list of (index, PhotoFile) tuples)
            created_by: Process creating this sequence

        Returns:
            WorkerResult tuple of (photos, groups, sequence):
            - an empty photo list,
            - sequence groups for review (empty when the bin has fewer than
              two photos or no version dimension was detected),
            - the resulting PhotoSequence (with version children attached
              when versions were found).
        """
        template_key, photo_tuples = bin_data
        n_photos: int = len(photo_tuples)

        # Extract normalized template and indices
        # (PhotoFileSeries may rewrite the name during normalization, so
        # re-read template_key from the series).
        whole_sequence: PhotoFileSeries = PhotoFileSeries(dict(photo_tuples), name=template_key)
        template_key = whole_sequence.name
        assert len(whole_sequence) == n_photos

        # Handle bins with < 2 photos (no versions possible)
        if len(photo_tuples) < 2:
            return [], [], PhotoSequence(whole_sequence, created_by=created_by)

        # Setup dataframe and comparison infrastructure.
        # Column layout: "Index" holds the photo id; P0..Pn hold the
        # template-index fields. Sibling helpers rely on this layout.
        photo_dict: dict[int, PhotoFile] = {p.id: p for _, p in photo_tuples}
        df = pd.DataFrame([[p.id, *idx] for idx, p in whole_sequence.items()])
        df.columns = ["Index"] + [f"P{i}" for i in range(df.shape[1] - 1)]
        assert n_photos == df.shape[0]

        # Create comparison function with caching
        gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
        simcache: dict[tuple[int, int], tuple[bool, float]] = {}

        def compare(
            x: int,
            y: int,
            x_img: ImageData | None = None,
            y_img: ImageData | None = None,
        ) -> tuple[bool, float]:
            """Memo function to compare two photo ids with optional pre-created ImageData."""
            # Order-insensitive cache key so (x, y) and (y, x) share an entry.
            p = (x, y) if x < y else (y, x)
            if p not in simcache:
                passes, _score, similarity = gates.compare_with_rotation(
                    photo_dict[x],
                    photo_dict[y],
                    ref_img=x_img,
                    cand_img=y_img,
                )
                simcache[p] = (passes, similarity)
            return simcache[p]

        # Test fields from smallest to largest (by unique value count)
        field_sizes: list[tuple[str, int]] = [(field_idx, len(df[field_idx].unique())) for field_idx in df.columns[1:]]
        field_sizes.sort(key=lambda x: x[1])

        # Find version columns by testing each field
        version_columns: list[str] = []
        for field_idx, unique_count in field_sizes:
            # Break if too many unique values (list is sorted by count)
            if unique_count > CONFIG.sequences.MAX_COMPONENT_SIZE:
                break

            if cls._test_field_as_version_dimension(df, field_idx, photo_dict, compare):
                version_columns.append(field_idx)

        # No versions detected - return whole sequence
        if not version_columns:
            return [], [], PhotoSequence(whole_sequence, created_by=created_by)

        # Create reference sequence (removing version dimensions)
        ref_series, ref_seq = cls._create_reference_sequence(df, version_columns, photo_dict, template_key)

        # Create individual version sequences
        version_sequences = cls._create_version_sequences(
            df,
            version_columns,
            ref_seq,
            photo_dict,
            template_key,
            compare,
        )

        # Build final result; photo count must be conserved across the split.
        result = PhotoSequence(ref_series, version_sequences, created_by=created_by)
        assert result.n_photos == n_photos, f"Lost photos: expected={n_photos}, achieved={result.n_photos}"

        return [], [build_sequence_group(result)], result
399
+
400
+ def accumulate_results(
401
+ self,
402
+ accum: tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]],
403
+ seq: PhotoSequence,
404
+ ) -> None:
405
+ """Accumulate PhotoSequence results from worker into forest and template remainder bins.
406
+
407
+ Groups sequences by their template_remainder attribute,
408
+ and also maintains the complete forest for review.
409
+
410
+ Args:
411
+ accum: Tuple of (forest, bins) where:
412
+ - forest: List collecting all PhotoSequence objects
413
+ - bins: Dictionary accumulating (sequence, prefix) pairs by template_remainder
414
+ seq: PhotoSequence from stage_worker with normalized template components
415
+ """
416
+ forest, bins = accum
417
+
418
+ # Debug counter
419
+ self.accumulate_count += 1
420
+
421
+ # Add to forest (deduplication by object identity happens in review server)
422
+ forest.append(seq)
423
+
424
+ # Add to template remainder bins
425
+ # Extract template components from PhotoSequence attributes
426
+ # template_prefix is the constant prefix before first variable
427
+ # template_remainder is the variable structure (used as grouping key)
428
+ bins[seq.template_remainder].append((seq, seq.reverse_prefix))
429
+
430
+ def finalise(self) -> None:
431
+ forest = self.result[0]
432
+ self.ref_photos_final = count_forest_ref_photos(forest)
433
+ # ComputeVersions is the first stage creating PhotoSequences
434
+ # Count top-level sequences (one per input template bin), not children
435
+ # Children are versions detected WITHIN a bin, not separate input sequences
436
+ self.ref_seqs_final = len(forest)
437
+
438
+ # ComputeVersions is the FIRST grouping stage - it receives ungrouped photos
439
+ # and creates version groups. Must preserve TOTAL photos, not just references.
440
+ # Downstream stages receive grouped data and track only references.
441
+ photos_final = count_forest_total_photos(forest)
442
+ # Verify we have same number of sequences as input template bins
443
+ seqs_final = len(forest)
444
+
445
+ # Debug logging
446
+ get_logger().info(
447
+ f"ComputeVersions finalise: accumulate_results called {self.accumulate_count} times, "
448
+ f"forest has {seqs_final} sequences, expected {self.ref_seqs_init}"
449
+ )
450
+
451
+ assert seqs_final == self.ref_seqs_init, (
452
+ f"Sequence count mismatch in {self.stage_name}: "
453
+ f"started with {self.ref_seqs_init} but ended with {seqs_final} "
454
+ f"(accumulate_results was called {self.accumulate_count} times)"
455
+ )
456
+ assert photos_final == self.total_photos, (
457
+ f"Started with {self.total_photos} photos but ended up with {photos_final}"
458
+ )
459
+
460
+ def needs_review(self) -> ReviewType:
461
+ """This stage produces sequence groups (version-detected sequences).
462
+
463
+ Returns:
464
+ "sequences" to indicate this stage produces reviewable sequence groups
465
+ """
466
+ return "sequences"
467
+
468
+ def has_review_data(self) -> bool:
469
+ """Check if there are any version sequence groups to review.
470
+
471
+ Returns:
472
+ True if forest has classes (multi-sequence groups), False otherwise
473
+ """
474
+ # Check if stage has run
475
+ if not hasattr(self, "result") or self.result is None:
476
+ return False
477
+
478
+ # Check if there are any classes (multi-sequence groups) in the forest
479
+ forest = self.result[0]
480
+ return any(seq.is_class() for seq in forest)
481
+
482
    # Typed result field - tuple of (forest, template_bins):
    # - forest: every PhotoSequence appended by accumulate_results
    # - template_bins: (sequence, reverse_prefix) pairs grouped by the
    #   sequence's template_remainder key
    result: tuple[list[PhotoSequence], dict[str, list[tuple[PhotoSequence, str]]]]