photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
@@ -6,18 +6,41 @@ import random
6
6
 
7
7
  from .config import CONFIG
8
8
  from .models import IdenticalGroup, ReviewType
9
- from .photo_file import PhotoFile, pick_exemplar_from_class
9
+ from .photo_file import (
10
+ PhotoFile,
11
+ pick_exemplar_from_class,
12
+ )
10
13
  from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
11
14
  from .ports import InputPort, OutputPort
12
15
  from .review_utils import build_identical_group
16
+ from .template_parsing import INDEX_T
13
17
 
14
18
 
15
- class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[PhotoFile]]):
16
- # FIXME : Add docstring
17
- # TODO: Update to incorporate digit parsing and output template bins
19
+ class ComputeIdentical(
20
+ PipelineStage[
21
+ list[PhotoFile], # WorkItem: SHA bins
22
+ list[PhotoFile], # Accumulator: flat list of exemplars during processing
23
+ dict[str, list[tuple[INDEX_T, PhotoFile]]], # Result: template bins after finalise()
24
+ ]
25
+ ):
26
+ """Pipeline stage that detects byte-identical files and outputs template bins.
27
+
28
+ This stage consumes SHA256 bins, identifies byte-identical files within each bin,
29
+ picks exemplars from non-identical files, and bins the exemplars by filename template.
30
+
31
+ Input:
32
+ SHA256 bins (from ComputeSHABins)
33
+
34
+ Output:
35
+ Template bins: dict mapping template patterns to (index, photo) tuples
36
+ Example: {"IMG_{P0}.jpg": [("1234", photo1), ("5678", photo2)]}
37
+
38
+ Review data:
39
+ Identical photo groups for user review
40
+ """
41
+
18
42
  # Typed result field - populated after run() completes
19
- # Full tuple: (identical_classes, nonidentical_exemplars)
20
- result: list[PhotoFile]
43
+ result: dict[str, list[tuple[INDEX_T, PhotoFile]]]
21
44
 
22
45
  def __init__(self) -> None:
23
46
  """Initialize identical files detection stage."""
@@ -33,12 +56,14 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
33
56
  self.sha_bins_i: InputPort[dict[str, list[PhotoFile]]] = InputPort("sha_bins")
34
57
 
35
58
  # Create output ports
36
- # - nonidentical_o: for next stage (templates)
37
- self.nonidentical_o: OutputPort[list[PhotoFile]] = OutputPort(self, getter=lambda: self.result)
59
+ # - nonidentical_o: template bins for next stage (versions)
60
+ self.nonidentical_o: OutputPort[dict[str, list[tuple[INDEX_T, PhotoFile]]]] = OutputPort(
61
+ self, getter=lambda: self.result
62
+ )
38
63
 
39
64
  def prepare(
40
65
  self,
41
- ) -> PrepareResult[list[PhotoFile], list[PhotoFile]]:
66
+ ) -> PrepareResult[list[PhotoFile], dict[str, list[tuple[INDEX_T, PhotoFile]]]]:
42
67
  """Prepare identical file detection by splitting bins into work items.
43
68
 
44
69
  Reads SHA bins from input port and prepares work items for parallel processing.
@@ -46,7 +71,7 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
46
71
  Returns:
47
72
  Tuple of (work_items, accumulator) where:
48
73
  - work_items: List of bins with multiple photos (potential duplicates)
49
- - accumulator: nonidentical_photos
74
+ - accumulator: Empty template bins dict (populated during accumulation)
50
75
  """
51
76
  # Read SHA bins from input port
52
77
  sha_bins: dict[str, list[PhotoFile]] = self.sha_bins_i.read()
@@ -58,14 +83,30 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
58
83
 
59
84
  multiple_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) > 1]
60
85
  singleton_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) == 1]
61
- exemplars: list[PhotoFile] = [x for b in singleton_bins for x in b]
62
86
 
63
- return multiple_bins, exemplars
87
+ # Pre-bin singleton exemplars by template (optimization)
88
+ template_bins: dict[str, list[tuple[INDEX_T, PhotoFile]]] = {}
89
+ for bin in singleton_bins:
90
+ photo = bin[0]
91
+ assert "TEMPLATE" in photo.cache, "Template must be cached by SHA stage"
92
+ template_bins.setdefault(photo.template, []).append((photo.template_index, photo))
93
+
94
+ return multiple_bins, template_bins
64
95
 
65
96
  def finalise(self) -> None:
66
- self.ref_photos_final = len(self.result)
67
- self.ref_seqs_final = None
68
- # Count total photos to ensure no photos lost (invariant check)
97
+ """Update reference counts after template binning.
98
+
99
+ Template binning already happened during accumulation, so this just
100
+ computes the final reference counts.
101
+ """
102
+ # self.result is already a dict[str, list[tuple[INDEX_T, PhotoFile]]] from accumulator
103
+ template_bins = self.result
104
+
105
+ # Update reference counting
106
+ self.ref_photos_final = sum(len(photos) for photos in template_bins.values())
107
+ self.ref_seqs_final = len(template_bins) # Number of unique templates
108
+
109
+ # Existing invariant check (photos count unchanged)
69
110
  photos_final: int = (
70
111
  sum(len(cl.photos) for cl in self.identical_review_result)
71
112
  - len(self.identical_review_result)
@@ -76,6 +117,9 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
76
117
  f"ComputeIdentical started with {self.total_photos} photos and ended up with {photos_final}"
77
118
  )
78
119
 
120
+ # Shuffle review groups for variety in review UI (byte-identical groups are 100% confident)
121
+ random.shuffle(self.identical_review_result)
122
+
79
123
  @classmethod
80
124
  def stage_worker(cls, photo_list: list[PhotoFile], _args: str) -> WorkerResult[list[PhotoFile]]:
81
125
  """Process one SHA bin to find byte-identical files.
@@ -95,18 +139,39 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
95
139
  # Singleton bins are filtered out by prepare()
96
140
  assert len(photo_list) >= 2
97
141
 
142
+ # If skipping byte-identical check, trust SHA256 uniqueness
143
+ # Treat all photos in the bin as identical (same SHA = identical files)
144
+ if CONFIG.processing.SKIP_BYTE_IDENTICAL:
145
+ # Pick best exemplar using same logic as non-skip path
146
+ # (prefer higher resolution, larger file size, stable path/ID tiebreaker)
147
+ photos_dict: dict[int, PhotoFile] = {pf.id: pf for pf in photo_list}
148
+ photo_ids: set[int] = set(photos_dict.keys())
149
+ exemplar_id: int = pick_exemplar_from_class(photos_dict, photo_ids)
150
+ exemplar: PhotoFile = photos_dict[exemplar_id]
151
+
152
+ # Mark all other photos as identical to the exemplar
153
+ for photo in photo_list:
154
+ if photo.id != exemplar_id:
155
+ photo.cache["IDENTICAL"] = exemplar
156
+
157
+ # Skip building review data - no need to review when trusting SHA256
158
+ # (building review data would open every image file just to get dimensions)
159
+ # Return empty review list and single exemplar
160
+ return [build_identical_group(photo_list, exemplar_id)], [], [exemplar]
161
+
162
+ # Otherwise, perform byte-by-byte comparison (original behavior)
98
163
  groups: list[IdenticalGroup] = []
99
164
  exemplars: list[PhotoFile] = []
100
165
 
101
166
  # Build dict and set for pick_exemplar_from_class
102
- photos_dict: dict[int, PhotoFile] = {pf.id: pf for pf in photo_list}
167
+ photos_dict = {pf.id: pf for pf in photo_list}
103
168
  remaining_ids: set[int] = set(photos_dict.keys())
104
169
 
105
170
  # Process bin until empty
106
171
  while remaining_ids:
107
172
  # Pick exemplar from remaining files
108
- exemplar_id: int = pick_exemplar_from_class(photos_dict, remaining_ids)
109
- exemplar: PhotoFile = photos_dict[exemplar_id]
173
+ exemplar_id = pick_exemplar_from_class(photos_dict, remaining_ids)
174
+ exemplar = photos_dict[exemplar_id]
110
175
 
111
176
  # Create new equivalence class starting with exemplar
112
177
  eq_class: list[PhotoFile] = [exemplar]
@@ -141,18 +206,23 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
141
206
  groups.append(build_identical_group(eq_class, exemplar_id))
142
207
  exemplars.append(exemplar)
143
208
 
144
- # Shuffle the groups to see something more interesting at review time.
145
- random.shuffle(groups)
146
-
147
209
  return groups, [], exemplars
148
210
 
149
211
  def accumulate_results(
150
212
  self,
151
- accum: list[PhotoFile],
213
+ accum: dict[str, list[tuple[INDEX_T, PhotoFile]]],
152
214
  job: list[PhotoFile],
153
215
  ) -> None:
154
- # FIXME: Add docstring
155
- accum.extend(job)
216
+ """Accumulate exemplars from workers into template bins.
217
+
218
+ Args:
219
+ accum: Template bins dictionary being built incrementally
220
+ job: List of exemplars from one worker (flat list)
221
+ """
222
+ # Bin each exemplar by its template
223
+ for exemplar in job:
224
+ assert "TEMPLATE" in exemplar.cache, "Template must be cached by SHA stage"
225
+ accum.setdefault(exemplar.template, []).append((exemplar.template_index, exemplar))
156
226
 
157
227
  def needs_review(self) -> ReviewType:
158
228
  """This stage produces photo groups (byte-identical duplicates).
utils/compute_indices.py CHANGED
@@ -1,235 +1,235 @@
1
- """Compute similar sequences from bins created by putting the sequence in bins defined by the max two indices of the sequence."""
2
-
3
- from __future__ import annotations
4
-
5
- from itertools import combinations
6
- from typing import cast
7
-
8
- import networkx as nx
9
-
10
- from .comparison_gates import GateName, GateSequence
11
- from .config import CONFIG
12
- from .logger import get_logger
13
- from .models import ReviewType
14
- from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
15
- from .ports import InputPort, OutputPort
16
- from .sequence import (
17
- INDEX_T,
18
- PhotoSequence,
19
- count_forest_ref_photos,
20
- count_forest_ref_sequences,
21
- count_forest_total_photos,
22
- )
23
- from .sequence_clustering import cluster_similar_sequences
24
-
25
-
26
- def build_cohabitation_graph(
27
- index_bins: dict[INDEX_T, list[PhotoSequence]],
28
- ) -> list[set[PhotoSequence]]:
29
- """Build graph from index bins and find connected components.
30
-
31
- Args:
32
- index_bins: Dict mapping index pattern → list of sequences
33
-
34
- Returns:
35
- List of connected components (each component is a set of PhotoSequence objects)
36
- """
37
- # Build graph
38
- graph: nx.Graph[PhotoSequence] = nx.Graph()
39
- graph.add_nodes_from(set().union(*index_bins.values()))
40
-
41
- # Add edges where sequences share index bins
42
- # Add edges between all pairs in this bin
43
- for index_bin in index_bins.values():
44
- for seq1, seq2 in combinations(index_bin, 2):
45
- graph.add_edge(seq1, seq2)
46
-
47
- # Find connected components
48
- result = [set(c) for c in nx.connected_components(graph)]
49
-
50
- n_seqs = len(set().union(*index_bins.values()))
51
- n_result_seqs = len(set().union(*result))
52
-
53
- assert n_seqs == n_result_seqs, f"build_cohabitation_graph had {n_seqs} but only returned {n_result_seqs}"
54
-
55
- return result
56
-
57
-
58
- class ComputeIndices(
59
- PipelineStage[
60
- set[PhotoSequence], # S: component
61
- list[PhotoSequence], # T: work data
62
- tuple[list[PhotoSequence], list[PhotoSequence]], # R: accumulator
63
- ]
64
- ):
65
- def __init__(self) -> None:
66
- """Initialize the index-based grouping stage."""
67
- super().__init__(
68
- path=CONFIG.paths.forest_sequence_matches_pkl,
69
- stage_name="Index Grouping",
70
- )
71
-
72
- # Store worker argument
73
- self.args = self.stage_name # Standard args attribute for run()
74
-
75
- # Create input port for index bins
76
- self.index_bins_i: InputPort[dict[INDEX_T, list[PhotoSequence]]] = InputPort("index_bins")
77
-
78
- # Create output ports - separate ports per downstream consumer
79
- # Full tuple output (for backward compatibility or review)
80
- self.forest_bins_o: OutputPort[tuple[list[PhotoSequence], list[PhotoSequence]]] = OutputPort(
81
- self, getter=lambda: self.result
82
- )
83
-
84
- # Forest output (for ComputePerceptualHash and ComputePerceptualMatch)
85
- self.forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result[0])
86
-
87
- def prepare(
88
- self,
89
- ) -> PrepareResult[set[PhotoSequence], tuple[list[PhotoSequence], list[PhotoSequence]]]:
90
- """Extract index bins, build graph, and return processable components.
91
-
92
- Reads index bins from input port and prepares work items for parallel processing.
93
-
94
- Returns:
95
- Tuple of (processable_components, accumulator)
96
- """
97
- # Read index bins from input port
98
- index_bins: dict[INDEX_T, list[PhotoSequence]] = self.index_bins_i.read()
99
- # Get reference counts from upstream for UI statistics tracking
100
- all_sequences = set().union(*index_bins.values())
101
- self.ref_photos_init = self.index_bins_i.get_ref_photo_count()
102
- self.ref_seqs_init = self.index_bins_i.get_ref_sequence_count()
103
- # Count total photos for internal invariant checking (should never change)
104
- self.total_photos = sum(seq.n_photos for seq in all_sequences)
105
-
106
- n_photos = self.total_photos
107
-
108
- # Build cohabitation graph
109
- components: list[set[PhotoSequence]] = build_cohabitation_graph(index_bins)
110
- n_component_photos = sum(seq.n_photos for seq in set().union(*components))
111
- assert n_photos == n_component_photos, (
112
- f"Had {n_photos} before cohabitation graph and {n_component_photos} afterward"
113
- )
114
-
115
- # Filter components by size
116
- max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
117
- processable_components: list[set[PhotoSequence]] = sorted(
118
- [c for c in components if 2 <= len(c) <= max_size],
119
- key=lambda c: -sum([s.n_ref_photos for s in c]),
120
- )
121
- skipped_components: list[set[PhotoSequence]] = [c for c in components if len(c) > max_size or len(c) < 2]
122
-
123
- # Calculate skip statistics
124
- num_singletons = sum(1 for c in skipped_components if len(c) < 2)
125
- num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
126
-
127
- get_logger().info(
128
- f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
129
- f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
130
- )
131
-
132
- # Initialize forest with skipped sequences (pass-through)
133
- skipped_sequences = [seq for comp in skipped_components for seq in comp]
134
- forest: list[PhotoSequence] = list(skipped_sequences)
135
- bins: list[PhotoSequence] = list(skipped_sequences)
136
-
137
- new_photos = sum(seq.n_photos for seq in set().union(*processable_components)) + +sum(
138
- v.n_photos for v in forest
139
- )
140
- assert n_photos == new_photos, f"ComputeIndices.prepare had {n_photos} photos and ended up with {new_photos}"
141
-
142
- # Return work items and tuple accumulator
143
- return processable_components, (forest, bins)
144
-
145
- @classmethod
146
- def stage_worker(cls, component: set[PhotoSequence], created_by: str) -> WorkerResult[list[PhotoSequence]]:
147
- """Process one connected component to form PhotoSequence objects.
148
-
149
- Uses predicted exemplar sequence and intersection-based comparison.
150
- Builds SequenceGroup models incrementally for review.
151
-
152
- Args:
153
- component: Set of PhotoSequence objects to compare
154
- created_by: Annotation of how the similarity was detected
155
-
156
- Returns:
157
- Tuple of (identical_groups, sequence_groups, work_sequences) where:
158
- - identical_groups: Always empty list for this stage
159
- - sequence_groups: SequenceGroup models for multi-sequence groups
160
- - work_sequences: PhotoSequence objects for pipeline flow
161
- """
162
- # ASSERTION: Count input photos (atomic invariant)
163
- input_photos: int = sum(seq.n_photos for seq in component)
164
-
165
- # Use configured gate sequence instead of hardcoded method
166
- gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
167
-
168
- # Use common clustering algorithm
169
- result_classes, sequence_groups = cluster_similar_sequences(
170
- list(component),
171
- gates,
172
- created_by,
173
- )
174
-
175
- # ASSERTION: Verify all photos preserved
176
- output_photos = sum(seq.n_photos for seq in result_classes)
177
- assert output_photos == input_photos, (
178
- f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
179
- )
180
-
181
- return [], sequence_groups, result_classes
182
-
183
- def accumulate_results(
184
- self,
185
- accum: tuple[list[PhotoSequence], list[PhotoSequence]],
186
- job: list[PhotoSequence],
187
- ) -> None:
188
- """Accumulate worker results into forest and bins.
189
-
190
- Args:
191
- accum: Tuple of (forest, bins) - both contain all sequences
192
- job: List of PhotoSequence objects from worker
193
- """
194
- forest, bins = accum
195
- forest.extend(job)
196
- bins.extend(job)
197
-
198
- def finalise(self) -> None:
199
- forest = self.result[0]
200
- self.ref_photos_final = count_forest_ref_photos(forest)
201
- self.ref_seqs_final = len(forest)
202
-
203
- # Count total photos to ensure no photos lost (invariant check)
204
- photos_final = count_forest_total_photos(forest)
205
- count_forest_ref_sequences(forest)
206
-
207
- # FIXME: Sequence count validation disabled due to test fixture limitations
208
- assert photos_final == self.total_photos, (
209
- f"Started with {self.total_photos} photos but ended up with {photos_final}"
210
- )
211
-
212
- def needs_review(self) -> ReviewType:
213
- """This stage produces sequence groups (index overlap sequences).
214
-
215
- Returns:
216
- "sequences" to indicate this stage produces reviewable sequence groups
217
- """
218
- return "sequences"
219
-
220
- def has_review_data(self) -> bool:
221
- """Check if there are any index overlap sequence groups to review.
222
-
223
- Returns:
224
- True if forest has classes (multi-sequence groups), False otherwise
225
- """
226
- # Check if stage has run
227
- if not hasattr(self, "result") or self.result is None:
228
- return False
229
-
230
- # Check if there are any classes (multi-sequence groups) in the forest
231
- forest = self.result[0]
232
- return any(seq.is_class() for seq in forest)
233
-
234
- # Typed result field - tuple of (forest, bins)
235
- result: tuple[list[PhotoSequence], list[PhotoSequence]]
1
+ """Compute similar sequences from bins created by putting the sequence in bins defined by the max two indices of the sequence."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from itertools import combinations
6
+ from typing import cast
7
+
8
+ import networkx as nx
9
+
10
+ from .comparison_gates import GateName, GateSequence
11
+ from .config import CONFIG
12
+ from .logger import get_logger
13
+ from .models import ReviewType
14
+ from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
15
+ from .ports import InputPort, OutputPort
16
+ from .sequence import (
17
+ INDEX_T,
18
+ PhotoSequence,
19
+ count_forest_ref_photos,
20
+ count_forest_ref_sequences,
21
+ count_forest_total_photos,
22
+ )
23
+ from .sequence_clustering import cluster_similar_sequences
24
+
25
+
26
+ def build_cohabitation_graph(
27
+ index_bins: dict[INDEX_T, list[PhotoSequence]],
28
+ ) -> list[set[PhotoSequence]]:
29
+ """Build graph from index bins and find connected components.
30
+
31
+ Args:
32
+ index_bins: Dict mapping index pattern → list of sequences
33
+
34
+ Returns:
35
+ List of connected components (each component is a set of PhotoSequence objects)
36
+ """
37
+ # Build graph
38
+ graph: nx.Graph[PhotoSequence] = nx.Graph()
39
+ graph.add_nodes_from(set().union(*index_bins.values()))
40
+
41
+ # Add edges where sequences share index bins
42
+ # Add edges between all pairs in this bin
43
+ for index_bin in index_bins.values():
44
+ for seq1, seq2 in combinations(index_bin, 2):
45
+ graph.add_edge(seq1, seq2)
46
+
47
+ # Find connected components
48
+ result = [set(c) for c in nx.connected_components(graph)]
49
+
50
+ n_seqs = len(set().union(*index_bins.values()))
51
+ n_result_seqs = len(set().union(*result))
52
+
53
+ assert n_seqs == n_result_seqs, f"build_cohabitation_graph had {n_seqs} but only returned {n_result_seqs}"
54
+
55
+ return result
56
+
57
+
58
+ class ComputeIndices(
59
+ PipelineStage[
60
+ set[PhotoSequence], # S: component
61
+ list[PhotoSequence], # T: work data
62
+ tuple[list[PhotoSequence], list[PhotoSequence]], # R: accumulator
63
+ ]
64
+ ):
65
+ def __init__(self) -> None:
66
+ """Initialize the index-based grouping stage."""
67
+ super().__init__(
68
+ path=CONFIG.paths.forest_sequence_matches_pkl,
69
+ stage_name="Index Grouping",
70
+ )
71
+
72
+ # Store worker argument
73
+ self.args = self.stage_name # Standard args attribute for run()
74
+
75
+ # Create input port for index bins
76
+ self.index_bins_i: InputPort[dict[INDEX_T, list[PhotoSequence]]] = InputPort("index_bins")
77
+
78
+ # Create output ports - separate ports per downstream consumer
79
+ # Full tuple output (for backward compatibility or review)
80
+ self.forest_bins_o: OutputPort[tuple[list[PhotoSequence], list[PhotoSequence]]] = OutputPort(
81
+ self, getter=lambda: self.result
82
+ )
83
+
84
+ # Forest output (for ComputePerceptualHash and ComputePerceptualMatch)
85
+ self.forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result[0])
86
+
87
+ def prepare(
88
+ self,
89
+ ) -> PrepareResult[set[PhotoSequence], tuple[list[PhotoSequence], list[PhotoSequence]]]:
90
+ """Extract index bins, build graph, and return processable components.
91
+
92
+ Reads index bins from input port and prepares work items for parallel processing.
93
+
94
+ Returns:
95
+ Tuple of (processable_components, accumulator)
96
+ """
97
+ # Read index bins from input port
98
+ index_bins: dict[INDEX_T, list[PhotoSequence]] = self.index_bins_i.read()
99
+ # Get reference counts from upstream for UI statistics tracking
100
+ all_sequences = set().union(*index_bins.values())
101
+ self.ref_photos_init = self.index_bins_i.get_ref_photo_count()
102
+ self.ref_seqs_init = self.index_bins_i.get_ref_sequence_count()
103
+ # Count total photos for internal invariant checking (should never change)
104
+ self.total_photos = sum(seq.n_photos for seq in all_sequences)
105
+
106
+ n_photos = self.total_photos
107
+
108
+ # Build cohabitation graph
109
+ components: list[set[PhotoSequence]] = build_cohabitation_graph(index_bins)
110
+ n_component_photos = sum(seq.n_photos for seq in set().union(*components))
111
+ assert n_photos == n_component_photos, (
112
+ f"Had {n_photos} before cohabitation graph and {n_component_photos} afterward"
113
+ )
114
+
115
+ # Filter components by size
116
+ max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
117
+ processable_components: list[set[PhotoSequence]] = sorted(
118
+ [c for c in components if 2 <= len(c) <= max_size],
119
+ key=lambda c: -sum([s.n_ref_photos for s in c]),
120
+ )
121
+ skipped_components: list[set[PhotoSequence]] = [c for c in components if len(c) > max_size or len(c) < 2]
122
+
123
+ # Calculate skip statistics
124
+ num_singletons = sum(1 for c in skipped_components if len(c) < 2)
125
+ num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
126
+
127
+ get_logger().info(
128
+ f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
129
+ f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
130
+ )
131
+
132
+ # Initialize forest with skipped sequences (pass-through)
133
+ skipped_sequences = [seq for comp in skipped_components for seq in comp]
134
+ forest: list[PhotoSequence] = list(skipped_sequences)
135
+ bins: list[PhotoSequence] = list(skipped_sequences)
136
+
137
+ new_photos = sum(seq.n_photos for seq in set().union(*processable_components)) + +sum(
138
+ v.n_photos for v in forest
139
+ )
140
+ assert n_photos == new_photos, f"ComputeIndices.prepare had {n_photos} photos and ended up with {new_photos}"
141
+
142
+ # Return work items and tuple accumulator
143
+ return processable_components, (forest, bins)
144
+
145
+ @classmethod
146
+ def stage_worker(cls, component: set[PhotoSequence], created_by: str) -> WorkerResult[list[PhotoSequence]]:
147
+ """Process one connected component to form PhotoSequence objects.
148
+
149
+ Uses predicted exemplar sequence and intersection-based comparison.
150
+ Builds SequenceGroup models incrementally for review.
151
+
152
+ Args:
153
+ component: Set of PhotoSequence objects to compare
154
+ created_by: Annotation of how the similarity was detected
155
+
156
+ Returns:
157
+ Tuple of (identical_groups, sequence_groups, work_sequences) where:
158
+ - identical_groups: Always empty list for this stage
159
+ - sequence_groups: SequenceGroup models for multi-sequence groups
160
+ - work_sequences: PhotoSequence objects for pipeline flow
161
+ """
162
+ # ASSERTION: Count input photos (atomic invariant)
163
+ input_photos: int = sum(seq.n_photos for seq in component)
164
+
165
+ # Use configured gate sequence instead of hardcoded method
166
+ gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
167
+
168
+ # Use common clustering algorithm
169
+ result_classes, sequence_groups = cluster_similar_sequences(
170
+ list(component),
171
+ gates,
172
+ created_by,
173
+ )
174
+
175
+ # ASSERTION: Verify all photos preserved
176
+ output_photos = sum(seq.n_photos for seq in result_classes)
177
+ assert output_photos == input_photos, (
178
+ f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
179
+ )
180
+
181
+ return [], sequence_groups, result_classes
182
+
183
+ def accumulate_results(
184
+ self,
185
+ accum: tuple[list[PhotoSequence], list[PhotoSequence]],
186
+ job: list[PhotoSequence],
187
+ ) -> None:
188
+ """Accumulate worker results into forest and bins.
189
+
190
+ Args:
191
+ accum: Tuple of (forest, bins) - both contain all sequences
192
+ job: List of PhotoSequence objects from worker
193
+ """
194
+ forest, bins = accum
195
+ forest.extend(job)
196
+ bins.extend(job)
197
+
198
+ def finalise(self) -> None:
199
+ forest = self.result[0]
200
+ self.ref_photos_final = count_forest_ref_photos(forest)
201
+ self.ref_seqs_final = len(forest)
202
+
203
+ # Count total photos to ensure no photos lost (invariant check)
204
+ photos_final = count_forest_total_photos(forest)
205
+ count_forest_ref_sequences(forest)
206
+
207
+ # FIXME: Sequence count validation disabled due to test fixture limitations
208
+ assert photos_final == self.total_photos, (
209
+ f"Started with {self.total_photos} photos but ended up with {photos_final}"
210
+ )
211
+
212
+ def needs_review(self) -> ReviewType:
213
+ """This stage produces sequence groups (index overlap sequences).
214
+
215
+ Returns:
216
+ "sequences" to indicate this stage produces reviewable sequence groups
217
+ """
218
+ return "sequences"
219
+
220
+ def has_review_data(self) -> bool:
221
+ """Check if there are any index overlap sequence groups to review.
222
+
223
+ Returns:
224
+ True if forest has classes (multi-sequence groups), False otherwise
225
+ """
226
+ # Check if stage has run
227
+ if not hasattr(self, "result") or self.result is None:
228
+ return False
229
+
230
+ # Check if there are any classes (multi-sequence groups) in the forest
231
+ forest = self.result[0]
232
+ return any(seq.is_class() for seq in forest)
233
+
234
+ # Typed result field - tuple of (forest, bins)
235
+ result: tuple[list[PhotoSequence], list[PhotoSequence]]