photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (68)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_perceptual_match.py CHANGED
@@ -1,240 +1,240 @@
- """Implementation of perceptual matching pipeline stage."""
-
- from __future__ import annotations
-
- from collections import defaultdict
- from itertools import combinations
- from typing import cast
-
- import networkx as nx
-
- from .comparison_gates import GateName, GateSequence
- from .config import CONFIG
- from .logger import get_logger
- from .models import ReviewType, SequenceGroup
- from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
- from .ports import InputPort, OutputPort
- from .sequence import (
-     INDEX_T,
-     PhotoSequence,
-     count_forest_ref_photos,
-     count_forest_ref_sequences,
-     count_forest_total_photos,
-     predict_exemplar_sequence,
- )
- from .sequence_clustering import cluster_similar_sequences
-
-
- class ComputePerceptualMatch(
-     PipelineStage[
-         list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],  # S: component
-         list[PhotoSequence],  # T: merged sequences
-         list[PhotoSequence],  # R: result forest
-     ]
- ):
-     def __init__(self) -> None:
-         """Initialize the perceptual matching stage."""
-         super().__init__(
-             path=CONFIG.paths.forest_final_pkl,
-             stage_name="Perceptual Matching",
-         )
-
-         # Store worker argument
-         self.args = self.stage_name  # Standard args attribute for run()
-
-         # Create input port for forest (from ComputeIndices)
-         self.forest_i: InputPort[list[PhotoSequence]] = InputPort("forest")
-
-         # Create input port for perceptual bins (from ComputePerceptualHash)
-         self.perceptual_bins_i: InputPort[dict[bytes, dict[int, list[INDEX_T]]]] = InputPort("perceptual_bins")
-
-         # Create output port for final forest
-         self.final_forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result)
-
-     def prepare(
-         self,
-     ) -> PrepareResult[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]], list[PhotoSequence]]:
-         """Extract index bins, build graph, and return processable components.
-
-         Reads forest and bins from input ports, builds connection graph,
-         and filters components by size.
-
-         Returns:
-             Tuple of (processable_components, skipped_sequences)
-         """
-         # Read from input ports
-         forest = self.forest_i.read()
-         # Get reference counts from upstream for UI statistics tracking
-         self.ref_photos_init = self.forest_i.get_ref_photo_count()
-         self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
-         # Count total photos for internal invariant checking (should never change)
-         self.total_photos = sum(seq.n_photos for seq in forest)
-         perceptual_bins = self.perceptual_bins_i.read()
-
-         # Within each bin, calculate the number of connections between sequences and the best index mapping with its value
-         connections: dict[tuple[int, int], list[tuple[list[INDEX_T], list[INDEX_T]]]] = defaultdict(list)
-         associations: dict[int, list[tuple[INDEX_T, bytes]]] = defaultdict(list)
-         k: bytes
-         hbin: dict[int, list[INDEX_T]]
-         for k, hbin in perceptual_bins.items():
-             # label each index of the sequence with its hash
-             s: int
-             idces: list[INDEX_T]
-             for s, idces in hbin.items():
-                 associations[s].extend([(idx, k) for idx in idces])
-             # add the pair of index lists that are matched to the pair of sequences
-             for (s1, hb1), (s2, hb2) in combinations(sorted(hbin.items()), 2):
-                 connections[(s1, s2)].append((hb1, hb2))
-
-         # Form connection graph along with index mappings and get components of connected sequences
-         graph: nx.Graph[int] = nx.Graph()
-         graph.add_nodes_from(range(len(forest)))
-         for (s1, s2), idx_pairs in connections.items():
-             # If the sequences match for at least half their points then test them for equality
-             if sum([min(len(idces1), len(idces2)) for idces1, idces2 in idx_pairs]) >= 0.5 * min(
-                 len(forest[s1].get_reference()), len(forest[s2].get_reference())
-             ):
-                 graph.add_edge(s1, s2)
-
-         components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
-             [(forest[i], associations[i]) for i in c] for c in nx.connected_components(graph)
-         ]
-
-         # Filter components by size
-         max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
-         processable_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = sorted(
-             [c for c in components if 2 <= len(c) <= max_size],
-             key=lambda c: -sum([seq.n_ref_photos for seq, _ in c]),
-         )
-         skipped_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
-             c for c in components if len(c) > max_size or len(c) < 2
-         ]
-
-         # Flatten skipped components into result sequences
-         results: list[PhotoSequence] = [seq for comp in skipped_components for seq, _ in comp]
-
-         # Calculate skip statistics
-         num_singletons = sum(1 for c in skipped_components if len(c) < 2)
-         num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
-
-         get_logger().info(
-             f"There are {len(processable_components)} perceptual components with an average of {float(sum([len(c) for c in processable_components])) / float(len(processable_components)) if processable_components else 0} sequences"
-         )
-         get_logger().info(
-             f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
-             f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
-         )
-
-         n_photos_processable = sum(seq.n_photos for component in processable_components for seq, _ in component)
-         n_photos_skipped = sum(seq.n_photos for seq in results)
-
-         assert self.total_photos == n_photos_processable + n_photos_skipped, (
-             f"ComputePerceptualMatch._prepare_with_bins lost photos, expected {self.total_photos}, got {n_photos_processable} + {n_photos_skipped}"
-         )
-
-         return processable_components, results
-
-     @classmethod
-     def stage_worker(
-         cls,
-         bin_data: list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],
-         created_by: str,
-     ) -> WorkerResult[list[PhotoSequence]]:
-         # ASSERTION: Count input photos (atomic invariant)
-         input_photos: int = sum(seq.n_photos for seq, _ in bin_data)
-
-         gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
-
-         # Keep sequences and their hash associations
-         seq_with_hashes: list[tuple[PhotoSequence, dict[INDEX_T, bytes]]] = [
-             (seq, dict(hashes)) for seq, hashes in bin_data
-         ]
-
-         result_sequences: list[PhotoSequence] = []
-         review_groups: list[SequenceGroup] = []
-
-         # Iteratively find clusters of similar sequences
-         while seq_with_hashes:
-             # Extract just sequences for exemplar prediction
-             sequences = [seq for seq, _hashes in seq_with_hashes]
-
-             # Pick best exemplar from remaining sequences
-             exemplar_seq_obj = predict_exemplar_sequence(sequences)
-             exemplar_idx = sequences.index(exemplar_seq_obj)
-             seq_with_hashes[exemplar_idx][1]
-
-             remaining_with_hashes = [
-                 (seq, hashes) for i, (seq, hashes) in enumerate(seq_with_hashes) if i != exemplar_idx
-             ]
-
-             # Use common clustering algorithm
-             cluster_results, cluster_reviews = cluster_similar_sequences(
-                 [exemplar_seq_obj] + [seq for seq, _ in remaining_with_hashes],
-                 gates,
-                 created_by,
-             )
-
-             result_sequences.extend(cluster_results)
-             review_groups.extend(cluster_reviews)
-
-             # Remove all processed sequences from pool (they're now in cluster_results)
-             # Note: cluster_results contains NEW PhotoSequence objects, so we track input sequences instead
-             input_sequences = {exemplar_seq_obj} | {seq for seq, _ in remaining_with_hashes}
-             seq_with_hashes = [(seq, hashes) for seq, hashes in seq_with_hashes if seq not in input_sequences]
-
-         # ASSERTION: Verify all photos preserved
-         output_photos = sum(seq.n_photos for seq in result_sequences)
-         assert output_photos == input_photos, (
-             f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
-         )
-
-         return [], review_groups, result_sequences
-
-     def accumulate_results(
-         self,
-         accum: list[PhotoSequence],
-         job: list[PhotoSequence],
-     ) -> None:
-         accum.extend(job)
-
-     def finalise(self) -> None:
-         self.ref_photos_final = count_forest_ref_photos(self.result)
-         self.ref_seqs_final = len(self.result)
-
-         # Count total photos to ensure no photos lost (invariant check)
-         photos_final = count_forest_total_photos(self.result)
-         seqs_final = count_forest_ref_sequences(self.result)
-
-         if seqs_final != self.ref_seqs_init:
-             get_logger().warning(
-                 f"Sequence count mismatch in {self.stage_name}: "
-                 f"started with {self.ref_seqs_init} but ended with {seqs_final}"
-             )
-
-         assert photos_final == self.total_photos, (
-             f"Started with {self.total_photos} photos and ended up with {photos_final}"
-         )
-
-     def needs_review(self) -> ReviewType:
-         """This stage produces sequence groups (similar photo sequences).
-
-         Returns:
-             "sequences" to indicate this stage produces reviewable sequence groups
-         """
-         return "sequences"
-
-     def has_review_data(self) -> bool:
-         """Check if there are any sequence groups to review.
-
-         Returns:
-             True if forest has classes (multi-sequence groups), False otherwise
-         """
-         # Check if stage has run
-         if not hasattr(self, "result") or self.result is None:
-             return False
-
-         # Check if there are any classes (multi-sequence groups)
-         return any(seq.is_class() for seq in self.result)
-
-     # Typed result field - just the forest
-     result: list[PhotoSequence]
+ """Implementation of perceptual matching pipeline stage."""
+
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from itertools import combinations
+ from typing import cast
+
+ import networkx as nx
+
+ from .comparison_gates import GateName, GateSequence
+ from .config import CONFIG
+ from .logger import get_logger
+ from .models import ReviewType, SequenceGroup
+ from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
+ from .ports import InputPort, OutputPort
+ from .sequence import (
+     INDEX_T,
+     PhotoSequence,
+     count_forest_ref_photos,
+     count_forest_ref_sequences,
+     count_forest_total_photos,
+     predict_exemplar_sequence,
+ )
+ from .sequence_clustering import cluster_similar_sequences
+
+
+ class ComputePerceptualMatch(
+     PipelineStage[
+         list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],  # S: component
+         list[PhotoSequence],  # T: merged sequences
+         list[PhotoSequence],  # R: result forest
+     ]
+ ):
+     def __init__(self) -> None:
+         """Initialize the perceptual matching stage."""
+         super().__init__(
+             path=CONFIG.paths.forest_final_pkl,
+             stage_name="Perceptual Matching",
+         )
+
+         # Store worker argument
+         self.args = self.stage_name  # Standard args attribute for run()
+
+         # Create input port for forest (from ComputeIndices)
+         self.forest_i: InputPort[list[PhotoSequence]] = InputPort("forest")
+
+         # Create input port for perceptual bins (from ComputePerceptualHash)
+         self.perceptual_bins_i: InputPort[dict[bytes, dict[int, list[INDEX_T]]]] = InputPort("perceptual_bins")
+
+         # Create output port for final forest
+         self.final_forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result)
+
+     def prepare(
+         self,
+     ) -> PrepareResult[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]], list[PhotoSequence]]:
+         """Extract index bins, build graph, and return processable components.
+
+         Reads forest and bins from input ports, builds connection graph,
+         and filters components by size.
+
+         Returns:
+             Tuple of (processable_components, skipped_sequences)
+         """
+         # Read from input ports
+         forest = self.forest_i.read()
+         # Get reference counts from upstream for UI statistics tracking
+         self.ref_photos_init = self.forest_i.get_ref_photo_count()
+         self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
+         # Count total photos for internal invariant checking (should never change)
+         self.total_photos = sum(seq.n_photos for seq in forest)
+         perceptual_bins = self.perceptual_bins_i.read()
+
+         # Within each bin, calculate the number of connections between sequences and the best index mapping with its value
+         connections: dict[tuple[int, int], list[tuple[list[INDEX_T], list[INDEX_T]]]] = defaultdict(list)
+         associations: dict[int, list[tuple[INDEX_T, bytes]]] = defaultdict(list)
+         k: bytes
+         hbin: dict[int, list[INDEX_T]]
+         for k, hbin in perceptual_bins.items():
+             # label each index of the sequence with its hash
+             s: int
+             idces: list[INDEX_T]
+             for s, idces in hbin.items():
+                 associations[s].extend([(idx, k) for idx in idces])
+             # add the pair of index lists that are matched to the pair of sequences
+             for (s1, hb1), (s2, hb2) in combinations(sorted(hbin.items()), 2):
+                 connections[(s1, s2)].append((hb1, hb2))
+
+         # Form connection graph along with index mappings and get components of connected sequences
+         graph: nx.Graph[int] = nx.Graph()
+         graph.add_nodes_from(range(len(forest)))
+         for (s1, s2), idx_pairs in connections.items():
+             # If the sequences match for at least half their points then test them for equality
+             if sum([min(len(idces1), len(idces2)) for idces1, idces2 in idx_pairs]) >= 0.5 * min(
+                 len(forest[s1].get_reference()), len(forest[s2].get_reference())
+             ):
+                 graph.add_edge(s1, s2)
+
+         components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
+             [(forest[i], associations[i]) for i in c] for c in nx.connected_components(graph)
+         ]
+
+         # Filter components by size
+         max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
+         processable_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = sorted(
+             [c for c in components if 2 <= len(c) <= max_size],
+             key=lambda c: -sum([seq.n_ref_photos for seq, _ in c]),
+         )
+         skipped_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
+             c for c in components if len(c) > max_size or len(c) < 2
+         ]
+
+         # Flatten skipped components into result sequences
+         results: list[PhotoSequence] = [seq for comp in skipped_components for seq, _ in comp]
+
+         # Calculate skip statistics
+         num_singletons = sum(1 for c in skipped_components if len(c) < 2)
+         num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
+
+         get_logger().info(
+             f"There are {len(processable_components)} perceptual components with an average of {float(sum([len(c) for c in processable_components])) / float(len(processable_components)) if processable_components else 0} sequences"
+         )
+         get_logger().info(
+             f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
+             f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
+         )
+
+         n_photos_processable = sum(seq.n_photos for component in processable_components for seq, _ in component)
+         n_photos_skipped = sum(seq.n_photos for seq in results)
+
+         assert self.total_photos == n_photos_processable + n_photos_skipped, (
+             f"ComputePerceptualMatch._prepare_with_bins lost photos, expected {self.total_photos}, got {n_photos_processable} + {n_photos_skipped}"
+         )
+
+         return processable_components, results
+
+     @classmethod
+     def stage_worker(
+         cls,
+         bin_data: list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],
+         created_by: str,
+     ) -> WorkerResult[list[PhotoSequence]]:
+         # ASSERTION: Count input photos (atomic invariant)
+         input_photos: int = sum(seq.n_photos for seq, _ in bin_data)
+
+         gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
+
+         # Keep sequences and their hash associations
+         seq_with_hashes: list[tuple[PhotoSequence, dict[INDEX_T, bytes]]] = [
+             (seq, dict(hashes)) for seq, hashes in bin_data
+         ]
+
+         result_sequences: list[PhotoSequence] = []
+         review_groups: list[SequenceGroup] = []
+
+         # Iteratively find clusters of similar sequences
+         while seq_with_hashes:
+             # Extract just sequences for exemplar prediction
+             sequences = [seq for seq, _hashes in seq_with_hashes]
+
+             # Pick best exemplar from remaining sequences
+             exemplar_seq_obj = predict_exemplar_sequence(sequences)
+             exemplar_idx = sequences.index(exemplar_seq_obj)
+             seq_with_hashes[exemplar_idx][1]
+
+             remaining_with_hashes = [
+                 (seq, hashes) for i, (seq, hashes) in enumerate(seq_with_hashes) if i != exemplar_idx
+             ]
+
+             # Use common clustering algorithm
+             cluster_results, cluster_reviews = cluster_similar_sequences(
+                 [exemplar_seq_obj] + [seq for seq, _ in remaining_with_hashes],
+                 gates,
+                 created_by,
+             )
+
+             result_sequences.extend(cluster_results)
+             review_groups.extend(cluster_reviews)
+
+             # Remove all processed sequences from pool (they're now in cluster_results)
+             # Note: cluster_results contains NEW PhotoSequence objects, so we track input sequences instead
+             input_sequences = {exemplar_seq_obj} | {seq for seq, _ in remaining_with_hashes}
+             seq_with_hashes = [(seq, hashes) for seq, hashes in seq_with_hashes if seq not in input_sequences]
+
+         # ASSERTION: Verify all photos preserved
+         output_photos = sum(seq.n_photos for seq in result_sequences)
+         assert output_photos == input_photos, (
+             f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
+         )
+
+         return [], review_groups, result_sequences
+
+     def accumulate_results(
+         self,
+         accum: list[PhotoSequence],
+         job: list[PhotoSequence],
+     ) -> None:
+         accum.extend(job)
+
+     def finalise(self) -> None:
+         self.ref_photos_final = count_forest_ref_photos(self.result)
+         self.ref_seqs_final = len(self.result)
+
+         # Count total photos to ensure no photos lost (invariant check)
+         photos_final = count_forest_total_photos(self.result)
+         seqs_final = count_forest_ref_sequences(self.result)
+
+         if seqs_final != self.ref_seqs_init:
+             get_logger().warning(
+                 f"Sequence count mismatch in {self.stage_name}: "
+                 f"started with {self.ref_seqs_init} but ended with {seqs_final}"
+             )
+
+         assert photos_final == self.total_photos, (
+             f"Started with {self.total_photos} photos and ended up with {photos_final}"
+         )
+
+     def needs_review(self) -> ReviewType:
+         """This stage produces sequence groups (similar photo sequences).
+
+         Returns:
+             "sequences" to indicate this stage produces reviewable sequence groups
+         """
+         return "sequences"
+
+     def has_review_data(self) -> bool:
+         """Check if there are any sequence groups to review.
+
+         Returns:
+             True if forest has classes (multi-sequence groups), False otherwise
+         """
+         # Check if stage has run
+         if not hasattr(self, "result") or self.result is None:
+             return False
+
+         # Check if there are any classes (multi-sequence groups)
+         return any(seq.is_class() for seq in self.result)
+
+     # Typed result field - just the forest
+     result: list[PhotoSequence]
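The `prepare` method above links two sequences whenever the perceptual-hash bins they share cover at least half of the shorter sequence's reference points, then hands each connected component of the resulting graph to the clustering worker as one job. A minimal, self-contained sketch of that bin-overlap grouping with toy data (none of the names below come from the package):

from collections import defaultdict
from itertools import combinations

import networkx as nx

# Toy stand-in for the perceptual_bins structure read in prepare():
# hash value -> {sequence id: indices of that sequence's photos in the bin}.
bins: dict[bytes, dict[int, list[int]]] = {
    b"\x01": {0: [0, 1], 1: [3, 4]},    # sequences 0 and 1 share two points
    b"\x02": {0: [2], 1: [5], 2: [0]},  # a weak three-way overlap
}
seq_len = {0: 4, 1: 4, 2: 6}  # reference length of each toy sequence

# For each pair of sequences, count how many points shared bins could match.
overlap: dict[tuple[int, int], int] = defaultdict(int)
for hbin in bins.values():
    for (s1, i1), (s2, i2) in combinations(sorted(hbin.items()), 2):
        overlap[(s1, s2)] += min(len(i1), len(i2))

# Add an edge when the overlap covers at least half of the shorter sequence,
# mirroring the 0.5 * min(...) gate in prepare() above.
graph = nx.Graph()
graph.add_nodes_from(seq_len)
for (s1, s2), n in overlap.items():
    if n >= 0.5 * min(seq_len[s1], seq_len[s2]):
        graph.add_edge(s1, s2)

# Each connected component becomes one unit of comparison work.
print([sorted(c) for c in nx.connected_components(graph)])  # [[0, 1], [2]]

Singleton components need no pairwise comparison and oversized ones are set aside, which is exactly what the MAX_COMPONENT_SIZE filter in prepare() does.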
utils/compute_sha_bins.py CHANGED
@@ -1,26 +1,57 @@
  from __future__ import annotations
 
+ import hashlib
+ import io
  import mimetypes
  import os
  from collections import defaultdict
  from collections.abc import Iterator
  from pathlib import Path
 
- from photo_compare import file_sha256
+ from PIL import Image
 
  from .config import CONFIG
  from .photo_file import PhotoFile
  from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
  from .ports import OutputPort
+ from .template_parsing import extract_template
+
+
+ def _get_oriented_dimensions(img: Image.Image) -> tuple[int, int]:
+     """Extract dimensions from PIL Image with EXIF orientation applied.
+
+     Args:
+         img: Opened PIL Image
+
+     Returns:
+         Tuple of (width, height) with EXIF orientation applied
+     """
+     # Get EXIF orientation if present
+     orientation: int = 0
+     if hasattr(img, "_getexif") and img._getexif() is not None:
+         exif = img._getexif()
+         orientation = exif.get(274, 0)  # 274 = Orientation EXIF tag
+
+     # Get raw dimensions
+     raw_width: int = img.width
+     raw_height: int = img.height
+
+     # Apply EXIF orientation (swap dimensions for rotations 5,6,7,8)
+     # https://www.impulseadventure.com/photo/exif-orientation.html
+     if orientation in {5, 6, 7, 8}:
+         return raw_height, raw_width
+     return raw_width, raw_height
 
 
  class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile, str], dict[str, list[PhotoFile]]]):
      """Pipeline stage that walks source directory and bins photos by SHA256 hash.
 
-     Creates PhotoFile objects with minimal core properties (path, mime, size_bytes)
-     computed from file metadata only - NO image opening! SHA256 is computed from
-     file contents and used for binning, then discarded. All image-derived properties
-     (pixels, dimensions, EXIF) are computed lazily when first accessed.
+     Creates PhotoFile objects with full metadata extracted from a single file read:
+     - SHA256 hash (for binning, then discarded)
+     - File size, MIME type, path
+     - Image dimensions with EXIF orientation applied
+
+     PhotoFile is a pure data container - this stage performs ALL file I/O.
      """
 
      def __init__(self, source_path: Path) -> None:
@@ -82,44 +113,57 @@ class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile
 
      @classmethod
      def stage_worker(cls, param: tuple[int, tuple[Path, str]], _args: str) -> WorkerResult[tuple[PhotoFile, str]]:
-         """Create PhotoFile with core file properties and compute SHA256.
-
-         Work function for parallel processing that takes enumerated file info
-         and returns a PhotoFile with core file properties (no image opening!).
-         SHA256 is computed and returned separately for binning.
+         """Create PhotoFile with dimensions and compute SHA256 in single file read.
 
-         Pixels, dimensions, and all image-derived properties are computed lazily
-         when first accessed.
+         Work function for parallel processing that reads file once, computes SHA256,
+         extracts dimensions with EXIF orientation, and creates PhotoFile with all
+         metadata. PhotoFile.__init__ never opens files - this is the only file I/O.
 
-         There is no exception handling in here. All exceptions should be surfaced to be dealt with by the user.
+         There is no exception handling in here. All exceptions should be surfaced
+         to be dealt with by the user.
 
          Args:
              param: (photo_id, (path, mime)) tuple
              _args: Placeholder to match pattern
 
          Returns:
-             (PhotoFile with core properties, SHA256 hash) tuple
+             (PhotoFile with all metadata, SHA256 hash) tuple
          """
          photo_id: int
          path: Path
          mime: str
          photo_id, (path, mime) = param
 
-         # Compute SHA256 (file I/O only, no image opening)
-         sha256_hash: str = file_sha256(path)
+         # Read file once into memory for both SHA256 and dimensions (optimization)
+         with path.open("rb") as f:
+             file_data: bytes = f.read()
 
-         # Get file size
-         size_bytes: int = path.stat().st_size
+         # Compute SHA256 from in-memory data
+         sha256_hash: str = hashlib.sha256(file_data).hexdigest()
 
-         # Create PhotoFile with core file properties only
-         # No image opening! Pixels/dimensions computed lazily when accessed
+         # Extract dimensions with EXIF orientation from in-memory data
+         with Image.open(io.BytesIO(file_data)) as img:
+             width, height = _get_oriented_dimensions(img)
+
+         # Get file size from data length (avoids separate stat call)
+         size_bytes: int = len(file_data)
+
+         # Create PhotoFile with all metadata (no file I/O in PhotoFile.__init__)
          photo = PhotoFile(
              path=path,
              mime=mime,
              size_bytes=size_bytes,
              file_id=photo_id,
+             width=width,
+             height=height,
          )
 
+         # Extract template from filename (with extension) and include full directory path
+         # This ensures files with same name in different directories have different templates
+         template_pattern, index = extract_template(path.name)
+         template_with_path = str(path.with_name(template_pattern))
+         photo.cache["TEMPLATE"] = (template_with_path, index)
+
          # Return PhotoFile and SHA256 separately (SHA256 used for binning only)
          return (
              [],
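The `_get_oriented_dimensions` helper introduced above reads the Orientation tag via PIL's private `_getexif()` method. The same width/height swap can be expressed with Pillow's public `Image.getexif()` accessor (available since Pillow 6.0); the sketch below is illustrative only, and `oriented_size`/`ORIENTATION` are hypothetical names, not package code:

import io

from PIL import Image

ORIENTATION = 274  # standard EXIF tag id for Orientation


def oriented_size(data: bytes) -> tuple[int, int]:
    """Return displayed (width, height), swapping for rotated orientations."""
    with Image.open(io.BytesIO(data)) as img:
        # Image.getexif() returns an empty mapping when no EXIF data exists.
        orientation = img.getexif().get(ORIENTATION, 0)
        # Values 5-8 encode 90/270-degree rotations, so the stored
        # width/height are transposed relative to the displayed image.
        if orientation in {5, 6, 7, 8}:
            return img.height, img.width
        return img.width, img.height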
utils/compute_template_similarity.py CHANGED
@@ -323,7 +323,7 @@ class ComputeTemplateSimilarity(
 
          # Bin is too large - subdivide it
          if template_remainder in template_bins and template_remainder not in original_large_bins_seen:
-             get_logger().info(f"Subdividing large bin '{template_remainder}' with {bin_size} sequences")
+             get_logger().debug(f"Subdividing large bin '{template_remainder}' with {bin_size} sequences")
              original_large_bins_seen.add(template_remainder)
              total_subdivisions += 1
              largest_input_bin = max(largest_input_bin, bin_size)