photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_perceptual_match.py
CHANGED

@@ -1,240 +1,240 @@
"""Implementation of perceptual matching pipeline stage."""

from __future__ import annotations

from collections import defaultdict
from itertools import combinations
from typing import cast

import networkx as nx

from .comparison_gates import GateName, GateSequence
from .config import CONFIG
from .logger import get_logger
from .models import ReviewType, SequenceGroup
from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
from .ports import InputPort, OutputPort
from .sequence import (
    INDEX_T,
    PhotoSequence,
    count_forest_ref_photos,
    count_forest_ref_sequences,
    count_forest_total_photos,
    predict_exemplar_sequence,
)
from .sequence_clustering import cluster_similar_sequences


class ComputePerceptualMatch(
    PipelineStage[
        list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],  # S: component
        list[PhotoSequence],  # T: merged sequences
        list[PhotoSequence],  # R: result forest
    ]
):
    def __init__(self) -> None:
        """Initialize the perceptual matching stage."""
        super().__init__(
            path=CONFIG.paths.forest_final_pkl,
            stage_name="Perceptual Matching",
        )

        # Store worker argument
        self.args = self.stage_name  # Standard args attribute for run()

        # Create input port for forest (from ComputeIndices)
        self.forest_i: InputPort[list[PhotoSequence]] = InputPort("forest")

        # Create input port for perceptual bins (from ComputePerceptualHash)
        self.perceptual_bins_i: InputPort[dict[bytes, dict[int, list[INDEX_T]]]] = InputPort("perceptual_bins")

        # Create output port for final forest
        self.final_forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result)

    def prepare(
        self,
    ) -> PrepareResult[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]], list[PhotoSequence]]:
        """Extract index bins, build graph, and return processable components.

        Reads forest and bins from input ports, builds connection graph,
        and filters components by size.

        Returns:
            Tuple of (processable_components, skipped_sequences)
        """
        # Read from input ports
        forest = self.forest_i.read()
        # Get reference counts from upstream for UI statistics tracking
        self.ref_photos_init = self.forest_i.get_ref_photo_count()
        self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
        # Count total photos for internal invariant checking (should never change)
        self.total_photos = sum(seq.n_photos for seq in forest)
        perceptual_bins = self.perceptual_bins_i.read()

        # Within each bin, calculate the number of connections between sequences and the best index mapping with its value
        connections: dict[tuple[int, int], list[tuple[list[INDEX_T], list[INDEX_T]]]] = defaultdict(list)
        associations: dict[int, list[tuple[INDEX_T, bytes]]] = defaultdict(list)
        k: bytes
        hbin: dict[int, list[INDEX_T]]
        for k, hbin in perceptual_bins.items():
            # label each index of the sequence with its hash
            s: int
            idces: list[INDEX_T]
            for s, idces in hbin.items():
                associations[s].extend([(idx, k) for idx in idces])
            # add the pair of index lists that are matched to the pair of sequences
            for (s1, hb1), (s2, hb2) in combinations(sorted(hbin.items()), 2):
                connections[(s1, s2)].append((hb1, hb2))

        # Form connection graph along with index mappings and get components of connected sequences
        graph: nx.Graph[int] = nx.Graph()
        graph.add_nodes_from(range(len(forest)))
        for (s1, s2), idx_pairs in connections.items():
            # If the sequences match for at least half their points then test them for equality
            if sum([min(len(idces1), len(idces2)) for idces1, idces2 in idx_pairs]) >= 0.5 * min(
                len(forest[s1].get_reference()), len(forest[s2].get_reference())
            ):
                graph.add_edge(s1, s2)

        components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
            [(forest[i], associations[i]) for i in c] for c in nx.connected_components(graph)
        ]

        # Filter components by size
        max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
        processable_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = sorted(
            [c for c in components if 2 <= len(c) <= max_size],
            key=lambda c: -sum([seq.n_ref_photos for seq, _ in c]),
        )
        skipped_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
            c for c in components if len(c) > max_size or len(c) < 2
        ]

        # Flatten skipped components into result sequences
        results: list[PhotoSequence] = [seq for comp in skipped_components for seq, _ in comp]

        # Calculate skip statistics
        num_singletons = sum(1 for c in skipped_components if len(c) < 2)
        num_oversized = sum(1 for c in skipped_components if len(c) > max_size)

        get_logger().info(
            f"There are {len(processable_components)} perceptual components with an average of {float(sum([len(c) for c in processable_components])) / float(len(processable_components)) if processable_components else 0} sequences"
        )
        get_logger().info(
            f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
            f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
        )

        n_photos_processable = sum(seq.n_photos for component in processable_components for seq, _ in component)
        n_photos_skipped = sum(seq.n_photos for seq in results)

        assert self.total_photos == n_photos_processable + n_photos_skipped, (
            f"ComputePerceptualMatch._prepare_with_bins lost photos, expected {self.total_photos}, got {n_photos_processable} + {n_photos_skipped}"
        )

        return processable_components, results

    @classmethod
    def stage_worker(
        cls,
        bin_data: list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],
        created_by: str,
    ) -> WorkerResult[list[PhotoSequence]]:
        # ASSERTION: Count input photos (atomic invariant)
        input_photos: int = sum(seq.n_photos for seq, _ in bin_data)

        gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))

        # Keep sequences and their hash associations
        seq_with_hashes: list[tuple[PhotoSequence, dict[INDEX_T, bytes]]] = [
            (seq, dict(hashes)) for seq, hashes in bin_data
        ]

        result_sequences: list[PhotoSequence] = []
        review_groups: list[SequenceGroup] = []

        # Iteratively find clusters of similar sequences
        while seq_with_hashes:
            # Extract just sequences for exemplar prediction
            sequences = [seq for seq, _hashes in seq_with_hashes]

            # Pick best exemplar from remaining sequences
            exemplar_seq_obj = predict_exemplar_sequence(sequences)
            exemplar_idx = sequences.index(exemplar_seq_obj)
            seq_with_hashes[exemplar_idx][1]

            remaining_with_hashes = [
                (seq, hashes) for i, (seq, hashes) in enumerate(seq_with_hashes) if i != exemplar_idx
            ]

            # Use common clustering algorithm
            cluster_results, cluster_reviews = cluster_similar_sequences(
                [exemplar_seq_obj] + [seq for seq, _ in remaining_with_hashes],
                gates,
                created_by,
            )

            result_sequences.extend(cluster_results)
            review_groups.extend(cluster_reviews)

            # Remove all processed sequences from pool (they're now in cluster_results)
            # Note: cluster_results contains NEW PhotoSequence objects, so we track input sequences instead
            input_sequences = {exemplar_seq_obj} | {seq for seq, _ in remaining_with_hashes}
            seq_with_hashes = [(seq, hashes) for seq, hashes in seq_with_hashes if seq not in input_sequences]

        # ASSERTION: Verify all photos preserved
        output_photos = sum(seq.n_photos for seq in result_sequences)
        assert output_photos == input_photos, (
            f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
        )

        return [], review_groups, result_sequences

    def accumulate_results(
        self,
        accum: list[PhotoSequence],
        job: list[PhotoSequence],
    ) -> None:
        accum.extend(job)

    def finalise(self) -> None:
        self.ref_photos_final = count_forest_ref_photos(self.result)
        self.ref_seqs_final = len(self.result)

        # Count total photos to ensure no photos lost (invariant check)
        photos_final = count_forest_total_photos(self.result)
        seqs_final = count_forest_ref_sequences(self.result)

        if seqs_final != self.ref_seqs_init:
            get_logger().warning(
                f"Sequence count mismatch in {self.stage_name}: "
                f"started with {self.ref_seqs_init} but ended with {seqs_final}"
            )

        assert photos_final == self.total_photos, (
            f"Started with {self.total_photos} photos and ended up with {photos_final}"
        )

    def needs_review(self) -> ReviewType:
        """This stage produces sequence groups (similar photo sequences).

        Returns:
            "sequences" to indicate this stage produces reviewable sequence groups
        """
        return "sequences"

    def has_review_data(self) -> bool:
        """Check if there are any sequence groups to review.

        Returns:
            True if forest has classes (multi-sequence groups), False otherwise
        """
        # Check if stage has run
        if not hasattr(self, "result") or self.result is None:
            return False

        # Check if there are any classes (multi-sequence groups)
        return any(seq.is_class() for seq in self.result)

    # Typed result field - just the forest
    result: list[PhotoSequence]
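The prepare() step above links sequences that share perceptual-hash bins and only forwards connected components for merging, requiring the matched indices to cover at least half of the shorter reference. A minimal standalone sketch of that bin-to-graph step follows; the bins and ref_len values are illustrative stand-ins for the perceptual-bin mapping and the PhotoSequence references, not part of the package API.

from collections import defaultdict
from itertools import combinations

import networkx as nx

# hash value -> {sequence id: indices in that sequence carrying the hash}
bins = {
    b"h1": {0: [0, 1], 1: [3, 4]},
    b"h2": {0: [2], 2: [0]},
}
ref_len = {0: 4, 1: 4, 2: 6}  # length of each sequence's reference

connections = defaultdict(list)
for hbin in bins.values():
    for (s1, i1), (s2, i2) in combinations(sorted(hbin.items()), 2):
        connections[(s1, s2)].append((i1, i2))

graph = nx.Graph()
graph.add_nodes_from(ref_len)
for (s1, s2), pairs in connections.items():
    # Link two sequences only when the shared hashes cover at least half of the
    # shorter reference; connected components become candidate merge groups.
    if sum(min(len(a), len(b)) for a, b in pairs) >= 0.5 * min(ref_len[s1], ref_len[s2]):
        graph.add_edge(s1, s2)

print(list(nx.connected_components(graph)))  # [{0, 1}, {2}]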
utils/compute_sha_bins.py
CHANGED

@@ -1,26 +1,57 @@
 from __future__ import annotations
 
+import hashlib
+import io
 import mimetypes
 import os
 from collections import defaultdict
 from collections.abc import Iterator
 from pathlib import Path
 
-from
+from PIL import Image
 
 from .config import CONFIG
 from .photo_file import PhotoFile
 from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
 from .ports import OutputPort
+from .template_parsing import extract_template
+
+
+def _get_oriented_dimensions(img: Image.Image) -> tuple[int, int]:
+    """Extract dimensions from PIL Image with EXIF orientation applied.
+
+    Args:
+        img: Opened PIL Image
+
+    Returns:
+        Tuple of (width, height) with EXIF orientation applied
+    """
+    # Get EXIF orientation if present
+    orientation: int = 0
+    if hasattr(img, "_getexif") and img._getexif() is not None:
+        exif = img._getexif()
+        orientation = exif.get(274, 0)  # 274 = Orientation EXIF tag
+
+    # Get raw dimensions
+    raw_width: int = img.width
+    raw_height: int = img.height
+
+    # Apply EXIF orientation (swap dimensions for rotations 5,6,7,8)
+    # https://www.impulseadventure.com/photo/exif-orientation.html
+    if orientation in {5, 6, 7, 8}:
+        return raw_height, raw_width
+    return raw_width, raw_height
 
 
 class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile, str], dict[str, list[PhotoFile]]]):
     """Pipeline stage that walks source directory and bins photos by SHA256 hash.
 
-    Creates PhotoFile objects with
-
-
-
+    Creates PhotoFile objects with full metadata extracted from a single file read:
+    - SHA256 hash (for binning, then discarded)
+    - File size, MIME type, path
+    - Image dimensions with EXIF orientation applied
+
+    PhotoFile is a pure data container - this stage performs ALL file I/O.
     """
 
     def __init__(self, source_path: Path) -> None:

@@ -82,44 +113,57 @@ class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile
 
     @classmethod
     def stage_worker(cls, param: tuple[int, tuple[Path, str]], _args: str) -> WorkerResult[tuple[PhotoFile, str]]:
-        """Create PhotoFile with
-
-        Work function for parallel processing that takes enumerated file info
-        and returns a PhotoFile with core file properties (no image opening!).
-        SHA256 is computed and returned separately for binning.
+        """Create PhotoFile with dimensions and compute SHA256 in single file read.
 
-
-
+        Work function for parallel processing that reads file once, computes SHA256,
+        extracts dimensions with EXIF orientation, and creates PhotoFile with all
+        metadata. PhotoFile.__init__ never opens files - this is the only file I/O.
 
-        There is no exception handling in here.
+        There is no exception handling in here. All exceptions should be surfaced
+        to be dealt with by the user.
 
         Args:
            param: (photo_id, (path, mime)) tuple
           _args: Placeholder to match pattern
 
        Returns:
-            (PhotoFile with
+            (PhotoFile with all metadata, SHA256 hash) tuple
        """
        photo_id: int
        path: Path
        mime: str
        photo_id, (path, mime) = param
 
-        #
-
+        # Read file once into memory for both SHA256 and dimensions (optimization)
+        with path.open("rb") as f:
+            file_data: bytes = f.read()
 
-        #
-
+        # Compute SHA256 from in-memory data
+        sha256_hash: str = hashlib.sha256(file_data).hexdigest()
 
-        #
-
+        # Extract dimensions with EXIF orientation from in-memory data
+        with Image.open(io.BytesIO(file_data)) as img:
+            width, height = _get_oriented_dimensions(img)
+
+        # Get file size from data length (avoids separate stat call)
+        size_bytes: int = len(file_data)
+
+        # Create PhotoFile with all metadata (no file I/O in PhotoFile.__init__)
        photo = PhotoFile(
            path=path,
            mime=mime,
            size_bytes=size_bytes,
            file_id=photo_id,
+            width=width,
+            height=height,
        )
 
+        # Extract template from filename (with extension) and include full directory path
+        # This ensures files with same name in different directories have different templates
+        template_pattern, index = extract_template(path.name)
+        template_with_path = str(path.with_name(template_pattern))
+        photo.cache["TEMPLATE"] = (template_with_path, index)
+
        # Return PhotoFile and SHA256 separately (SHA256 used for binning only)
        return (
            [],
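The reworked stage_worker above derives both the SHA256 hash and the EXIF-oriented dimensions from a single read of each file. A minimal standalone sketch of that single-read pattern follows, using only hashlib and Pillow and omitting PhotoFile and the pipeline wiring; sha_and_oriented_size is a hypothetical helper name, and it uses Pillow's public getexif() rather than the _getexif() call shown in the diff.

import hashlib
import io
from pathlib import Path

from PIL import Image


def sha_and_oriented_size(path: Path) -> tuple[str, int, int]:
    data = path.read_bytes()  # single read of the file
    digest = hashlib.sha256(data).hexdigest()  # hash the in-memory bytes
    with Image.open(io.BytesIO(data)) as img:
        # EXIF tag 274 (Orientation): values 5-8 mean the stored image is rotated,
        # so width and height are swapped for display purposes.
        orientation = img.getexif().get(274, 1)
        if orientation in {5, 6, 7, 8}:
            return digest, img.height, img.width
        return digest, img.width, img.height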
utils/compute_template_similarity.py
CHANGED

@@ -323,7 +323,7 @@ class ComputeTemplateSimilarity(
 
             # Bin is too large - subdivide it
             if template_remainder in template_bins and template_remainder not in original_large_bins_seen:
-                get_logger().
+                get_logger().debug(f"Subdividing large bin '{template_remainder}' with {bin_size} sequences")
                 original_large_bins_seen.add(template_remainder)
                 total_subdivisions += 1
                 largest_input_bin = max(largest_input_bin, bin_size)