photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_identical.py
CHANGED

@@ -6,18 +6,41 @@ import random
 
 from .config import CONFIG
 from .models import IdenticalGroup, ReviewType
-from .photo_file import
+from .photo_file import (
+    PhotoFile,
+    pick_exemplar_from_class,
+)
 from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
 from .ports import InputPort, OutputPort
 from .review_utils import build_identical_group
+from .template_parsing import INDEX_T
 
 
-class ComputeIdentical(
-
-
+class ComputeIdentical(
+    PipelineStage[
+        list[PhotoFile],  # WorkItem: SHA bins
+        list[PhotoFile],  # Accumulator: flat list of exemplars during processing
+        dict[str, list[tuple[INDEX_T, PhotoFile]]],  # Result: template bins after finalise()
+    ]
+):
+    """Pipeline stage that detects byte-identical files and outputs template bins.
+
+    This stage consumes SHA256 bins, identifies byte-identical files within each bin,
+    picks exemplars from non-identical files, and bins the exemplars by filename template.
+
+    Input:
+        SHA256 bins (from ComputeSHABins)
+
+    Output:
+        Template bins: dict mapping template patterns to (index, photo) tuples
+        Example: {"IMG_{P0}.jpg": [("1234", photo1), ("5678", photo2)]}
+
+    Review data:
+        Identical photo groups for user review
+    """
+
     # Typed result field - populated after run() completes
-
-    result: list[PhotoFile]
+    result: dict[str, list[tuple[INDEX_T, PhotoFile]]]
 
     def __init__(self) -> None:
         """Initialize identical files detection stage."""
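The new Result type is easiest to read off a small example. The sketch below is illustrative only (plain strings stand in for PhotoFile objects and INDEX_T values); it shows how exemplars that share a filename template land in the same bin, matching the docstring's {"IMG_{P0}.jpg": [("1234", photo1), ("5678", photo2)]} shape:

    from collections import defaultdict

    # (template, index, filename) triples standing in for parsed PhotoFile objects
    photos = [
        ("IMG_{P0}.jpg", "1234", "IMG_1234.jpg"),
        ("IMG_{P0}.jpg", "5678", "IMG_5678.jpg"),
        ("DSC_{P0}.jpg", "0001", "DSC_0001.jpg"),
    ]

    template_bins: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for template, index, name in photos:
        template_bins[template].append((index, name))

    # {'IMG_{P0}.jpg': [('1234', 'IMG_1234.jpg'), ('5678', 'IMG_5678.jpg')],
    #  'DSC_{P0}.jpg': [('0001', 'DSC_0001.jpg')]}
    print(dict(template_bins))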
@@ -33,12 +56,14 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
         self.sha_bins_i: InputPort[dict[str, list[PhotoFile]]] = InputPort("sha_bins")
 
         # Create output ports
-        # - nonidentical_o: for next stage (
-        self.nonidentical_o: OutputPort[list[PhotoFile]] = OutputPort(
+        # - nonidentical_o: template bins for next stage (versions)
+        self.nonidentical_o: OutputPort[dict[str, list[tuple[INDEX_T, PhotoFile]]]] = OutputPort(
+            self, getter=lambda: self.result
+        )
 
     def prepare(
         self,
-    ) -> PrepareResult[list[PhotoFile], list[PhotoFile]]:
+    ) -> PrepareResult[list[PhotoFile], dict[str, list[tuple[INDEX_T, PhotoFile]]]]:
         """Prepare identical file detection by splitting bins into work items.
 
         Reads SHA bins from input port and prepares work items for parallel processing.
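The OutputPort(self, getter=lambda: self.result) wiring resolves the port's value lazily, so downstream readers see the final result even though self.result is only assigned after the stage runs. A minimal stand-alone sketch of that pattern (LazyPort and Stage are toy stand-ins; the package's real InputPort/OutputPort in utils/ports.py do more):

    from typing import Callable, Generic, TypeVar

    T = TypeVar("T")

    class LazyPort(Generic[T]):
        """Toy output port: resolves its value through a getter at read time."""

        def __init__(self, getter: Callable[[], T]) -> None:
            self._getter = getter

        def read(self) -> T:
            return self._getter()

    class Stage:
        def __init__(self) -> None:
            self.result: dict[str, int] = {}
            # The port captures the stage, not a snapshot of result
            self.out = LazyPort(lambda: self.result)

    stage = Stage()
    stage.result = {"IMG_{P0}.jpg": 2}  # assigned after wiring, as in run()
    assert stage.out.read() == {"IMG_{P0}.jpg": 2}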
@@ -46,7 +71,7 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
         Returns:
             Tuple of (work_items, accumulator) where:
             - work_items: List of bins with multiple photos (potential duplicates)
-            - accumulator:
+            - accumulator: Empty template bins dict (populated during accumulation)
         """
         # Read SHA bins from input port
         sha_bins: dict[str, list[PhotoFile]] = self.sha_bins_i.read()
@@ -58,14 +83,30 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
 
         multiple_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) > 1]
         singleton_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) == 1]
-        exemplars: list[PhotoFile] = [x for b in singleton_bins for x in b]
 
-
+        # Pre-bin singleton exemplars by template (optimization)
+        template_bins: dict[str, list[tuple[INDEX_T, PhotoFile]]] = {}
+        for bin in singleton_bins:
+            photo = bin[0]
+            assert "TEMPLATE" in photo.cache, "Template must be cached by SHA stage"
+            template_bins.setdefault(photo.template, []).append((photo.template_index, photo))
+
+        return multiple_bins, template_bins
 
     def finalise(self) -> None:
-
-
-
+        """Update reference counts after template binning.
+
+        Template binning already happened during accumulation, so this just
+        computes the final reference counts.
+        """
+        # self.result is already a dict[str, list[tuple[INDEX_T, PhotoFile]]] from accumulator
+        template_bins = self.result
+
+        # Update reference counting
+        self.ref_photos_final = sum(len(photos) for photos in template_bins.values())
+        self.ref_seqs_final = len(template_bins)  # Number of unique templates
+
+        # Existing invariant check (photos count unchanged)
         photos_final: int = (
             sum(len(cl.photos) for cl in self.identical_review_result)
             - len(self.identical_review_result)
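With the accumulator already in template-bin form, the reference counts computed in finalise() are simple aggregates, as this toy calculation over hypothetical bins shows:

    # Hypothetical template bins after accumulation
    template_bins = {
        "IMG_{P0}.jpg": [("1234", "photo1"), ("5678", "photo2")],
        "DSC_{P0}.jpg": [("0001", "photo3")],
    }

    ref_photos_final = sum(len(photos) for photos in template_bins.values())  # 3 photos
    ref_seqs_final = len(template_bins)  # 2 unique templates
    assert (ref_photos_final, ref_seqs_final) == (3, 2)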
@@ -76,6 +117,9 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
             f"ComputeIdentical started with {self.total_photos} photos and ended up with {photos_final}"
         )
 
+        # Shuffle review groups for variety in review UI (byte-identical groups are 100% confident)
+        random.shuffle(self.identical_review_result)
+
     @classmethod
     def stage_worker(cls, photo_list: list[PhotoFile], _args: str) -> WorkerResult[list[PhotoFile]]:
         """Process one SHA bin to find byte-identical files.
@@ -95,18 +139,39 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
         # Singleton bins are filtered out by prepare()
         assert len(photo_list) >= 2
 
+        # If skipping byte-identical check, trust SHA256 uniqueness
+        # Treat all photos in the bin as identical (same SHA = identical files)
+        if CONFIG.processing.SKIP_BYTE_IDENTICAL:
+            # Pick best exemplar using same logic as non-skip path
+            # (prefer higher resolution, larger file size, stable path/ID tiebreaker)
+            photos_dict: dict[int, PhotoFile] = {pf.id: pf for pf in photo_list}
+            photo_ids: set[int] = set(photos_dict.keys())
+            exemplar_id: int = pick_exemplar_from_class(photos_dict, photo_ids)
+            exemplar: PhotoFile = photos_dict[exemplar_id]
+
+            # Mark all other photos as identical to the exemplar
+            for photo in photo_list:
+                if photo.id != exemplar_id:
+                    photo.cache["IDENTICAL"] = exemplar
+
+            # Skip building review data - no need to review when trusting SHA256
+            # (building review data would open every image file just to get dimensions)
+            # Return empty review list and single exemplar
+            return [build_identical_group(photo_list, exemplar_id)], [], [exemplar]
+
+        # Otherwise, perform byte-by-byte comparison (original behavior)
         groups: list[IdenticalGroup] = []
         exemplars: list[PhotoFile] = []
 
         # Build dict and set for pick_exemplar_from_class
-        photos_dict
+        photos_dict = {pf.id: pf for pf in photo_list}
         remaining_ids: set[int] = set(photos_dict.keys())
 
         # Process bin until empty
         while remaining_ids:
             # Pick exemplar from remaining files
-            exemplar_id
-            exemplar
+            exemplar_id = pick_exemplar_from_class(photos_dict, remaining_ids)
+            exemplar = photos_dict[exemplar_id]
 
             # Create new equivalence class starting with exemplar
             eq_class: list[PhotoFile] = [exemplar]
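The non-skip path repeatedly picks an exemplar and pulls every byte-identical file into its equivalence class until the bin is empty. Here is a self-contained approximation of that loop using only the standard library; filecmp stands in for the package's byte comparison and a largest-file rule stands in for pick_exemplar_from_class:

    import filecmp
    import os

    def group_identical(paths: list[str]) -> list[list[str]]:
        """Partition paths into classes of byte-identical files."""
        remaining = list(paths)
        classes: list[list[str]] = []
        while remaining:
            # Stand-in exemplar rule: largest file wins, path breaks ties
            exemplar = max(remaining, key=lambda p: (os.path.getsize(p), p))
            remaining.remove(exemplar)
            eq_class = [exemplar]
            for other in remaining[:]:  # iterate a copy while mutating
                if filecmp.cmp(exemplar, other, shallow=False):
                    eq_class.append(other)
                    remaining.remove(other)
            classes.append(eq_class)
        return classes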
@@ -141,18 +206,23 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
             groups.append(build_identical_group(eq_class, exemplar_id))
             exemplars.append(exemplar)
 
-        # Shuffle the groups to see something more inteeresting at review time.
-        random.shuffle(groups)
-
         return groups, [], exemplars
 
     def accumulate_results(
         self,
-        accum: list[PhotoFile],
+        accum: dict[str, list[tuple[INDEX_T, PhotoFile]]],
         job: list[PhotoFile],
     ) -> None:
-
-
+        """Accumulate exemplars from workers into template bins.
+
+        Args:
+            accum: Template bins dictionary being built incrementally
+            job: List of exemplars from one worker (flat list)
+        """
+        # Bin each exemplar by its template
+        for exemplar in job:
+            assert "TEMPLATE" in exemplar.cache, "Template must be cached by SHA stage"
+            accum.setdefault(exemplar.template, []).append((exemplar.template_index, exemplar))
 
     def needs_review(self) -> ReviewType:
         """This stage produces photo groups (byte-identical duplicates.
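accumulate_results() merges each worker's flat exemplar list into the shared dict with setdefault, so bins from different workers combine instead of overwriting one another. An illustration with made-up values:

    accum: dict[str, list[tuple[str, str]]] = {}
    worker_outputs = [
        [("IMG_{P0}.jpg", "1234", "a.jpg")],
        [("IMG_{P0}.jpg", "5678", "b.jpg"), ("DSC_{P0}.jpg", "0001", "c.jpg")],
    ]
    for job in worker_outputs:
        for template, index, name in job:
            accum.setdefault(template, []).append((index, name))

    assert accum == {
        "IMG_{P0}.jpg": [("1234", "a.jpg"), ("5678", "b.jpg")],
        "DSC_{P0}.jpg": [("0001", "c.jpg")],
    }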
utils/compute_indices.py
CHANGED

@@ -1,235 +1,235 @@ (every line rewritten; old and new text are identical, shown once below)

"""Compute similar sequences from bins created by puting the sequence in bins defined by the max two indices of the sequence."""

from __future__ import annotations

from itertools import combinations
from typing import cast

import networkx as nx

from .comparison_gates import GateName, GateSequence
from .config import CONFIG
from .logger import get_logger
from .models import ReviewType
from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
from .ports import InputPort, OutputPort
from .sequence import (
    INDEX_T,
    PhotoSequence,
    count_forest_ref_photos,
    count_forest_ref_sequences,
    count_forest_total_photos,
)
from .sequence_clustering import cluster_similar_sequences


def build_cohabitation_graph(
    index_bins: dict[INDEX_T, list[PhotoSequence]],
) -> list[set[PhotoSequence]]:
    """Build graph from index bins and find connected components.

    Args:
        index_bins: Dict mapping index pattern → list of sequences

    Returns:
        List of connected components (each component is a set of PhotoSequence objects)
    """
    # Build graph
    graph: nx.Graph[PhotoSequence] = nx.Graph()
    graph.add_nodes_from(set().union(*index_bins.values()))

    # Add edges where sequences share index bins
    # Add edges between all pairs in this bin
    for index_bin in index_bins.values():
        for seq1, seq2 in combinations(index_bin, 2):
            graph.add_edge(seq1, seq2)

    # Find connected components
    result = [set(c) for c in nx.connected_components(graph)]

    n_seqs = len(set().union(*index_bins.values()))
    n_result_seqs = len(set().union(*result))

    assert n_seqs == n_result_seqs, f"build_cohabitation_graph had {n_seqs} but only returned {n_result_seqs}"

    return result


class ComputeIndices(
    PipelineStage[
        set[PhotoSequence],  # S: component
        list[PhotoSequence],  # T: work data
        tuple[list[PhotoSequence], list[PhotoSequence]],  # R: accumulator
    ]
):
    def __init__(self) -> None:
        """Initialize the index-based grouping stage."""
        super().__init__(
            path=CONFIG.paths.forest_sequence_matches_pkl,
            stage_name="Index Grouping",
        )

        # Store worker argument
        self.args = self.stage_name  # Standard args attribute for run()

        # Create input port for index bins
        self.index_bins_i: InputPort[dict[INDEX_T, list[PhotoSequence]]] = InputPort("index_bins")

        # Create output ports - separate ports per downstream consumer
        # Full tuple output (for backward compatibility or review)
        self.forest_bins_o: OutputPort[tuple[list[PhotoSequence], list[PhotoSequence]]] = OutputPort(
            self, getter=lambda: self.result
        )

        # Forest output (for ComputePerceptualHash and ComputePerceptualMatch)
        self.forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result[0])

    def prepare(
        self,
    ) -> PrepareResult[set[PhotoSequence], tuple[list[PhotoSequence], list[PhotoSequence]]]:
        """Extract index bins, build graph, and return processable components.

        Reads index bins from input port and prepares work items for parallel processing.

        Returns:
            Tuple of (processable_components, accumulator)
        """
        # Read index bins from input port
        index_bins: dict[INDEX_T, list[PhotoSequence]] = self.index_bins_i.read()
        # Get reference counts from upstream for UI statistics tracking
        all_sequences = set().union(*index_bins.values())
        self.ref_photos_init = self.index_bins_i.get_ref_photo_count()
        self.ref_seqs_init = self.index_bins_i.get_ref_sequence_count()
        # Count total photos for internal invariant checking (should never change)
        self.total_photos = sum(seq.n_photos for seq in all_sequences)

        n_photos = self.total_photos

        # Build cohabitation graph
        components: list[set[PhotoSequence]] = build_cohabitation_graph(index_bins)
        n_component_photos = sum(seq.n_photos for seq in set().union(*components))
        assert n_photos == n_component_photos, (
            f"Had {n_photos} before cohabitation graph and {n_component_photos} afterward"
        )

        # Filter components by size
        max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
        processable_components: list[set[PhotoSequence]] = sorted(
            [c for c in components if 2 <= len(c) <= max_size],
            key=lambda c: -sum([s.n_ref_photos for s in c]),
        )
        skipped_components: list[set[PhotoSequence]] = [c for c in components if len(c) > max_size or len(c) < 2]

        # Calculate skip statistics
        num_singletons = sum(1 for c in skipped_components if len(c) < 2)
        num_oversized = sum(1 for c in skipped_components if len(c) > max_size)

        get_logger().info(
            f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
            f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
        )

        # Initialize forest with skipped sequences (pass-through)
        skipped_sequences = [seq for comp in skipped_components for seq in comp]
        forest: list[PhotoSequence] = list(skipped_sequences)
        bins: list[PhotoSequence] = list(skipped_sequences)

        new_photos = sum(seq.n_photos for seq in set().union(*processable_components)) + sum(
            v.n_photos for v in forest
        )
        assert n_photos == new_photos, f"ComputeIndices.prepare had {n_photos} photos and ended up with {new_photos}"

        # Return work items and tuple accumulator
        return processable_components, (forest, bins)

    @classmethod
    def stage_worker(cls, component: set[PhotoSequence], created_by: str) -> WorkerResult[list[PhotoSequence]]:
        """Process one connected component to form PhotoSequence objects.

        Uses predicted exemplar sequence and intersection-based comparison.
        Builds SequenceGroup models incrementally for review.

        Args:
            component: Set of PhotoSequence objects to compare
            created_by: Annotation of how the similarity was detected

        Returns:
            Tuple of (identical_groups, sequence_groups, work_sequences) where:
            - identical_groups: Always empty list for this stage
            - sequence_groups: SequenceGroup models for multi-sequence groups
            - work_sequences: PhotoSequence objects for pipeline flow
        """
        # ASSERTION: Count input photos (atomic invariant)
        input_photos: int = sum(seq.n_photos for seq in component)

        # Use configured gate sequence instead of hardcoded method
        gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))

        # Use common clustering algorithm
        result_classes, sequence_groups = cluster_similar_sequences(
            list(component),
            gates,
            created_by,
        )

        # ASSERTION: Verify all photos preserved
        output_photos = sum(seq.n_photos for seq in result_classes)
        assert output_photos == input_photos, (
            f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
        )

        return [], sequence_groups, result_classes

    def accumulate_results(
        self,
        accum: tuple[list[PhotoSequence], list[PhotoSequence]],
        job: list[PhotoSequence],
    ) -> None:
        """Accumulate worker results into forest and bins.

        Args:
            accum: Tuple of (forest, bins) - both contain all sequences
            job: List of PhotoSequence objects from worker
        """
        forest, bins = accum
        forest.extend(job)
        bins.extend(job)

    def finalise(self) -> None:
        forest = self.result[0]
        self.ref_photos_final = count_forest_ref_photos(forest)
        self.ref_seqs_final = len(forest)

        # Count total photos to ensure no photos lost (invariant check)
        photos_final = count_forest_total_photos(forest)
        count_forest_ref_sequences(forest)

        # FIXME: Sequence count validation disabled due to test fixture limitations
        assert photos_final == self.total_photos, (
            f"Started with {self.total_photos} photos but ended up with {photos_final}"
        )

    def needs_review(self) -> ReviewType:
        """This stage produces sequence groups (index overlap sequences).

        Returns:
            "sequences" to indicate this stage produces reviewable sequence groups
        """
        return "sequences"

    def has_review_data(self) -> bool:
        """Check if there are any index overlap sequence groups to review.

        Returns:
            True if forest has classes (multi-sequence groups), False otherwise
        """
        # Check if stage has run
        if not hasattr(self, "result") or self.result is None:
            return False

        # Check if there are any classes (multi-sequence groups) in the forest
        forest = self.result[0]
        return any(seq.is_class() for seq in forest)

    # Typed result field - tuple of (forest, bins)
    result: tuple[list[PhotoSequence], list[PhotoSequence]]
|