photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
photo_compare/base.py
CHANGED
|
@@ -1,285 +1,285 @@
|
|
|
1
|
-
"""Base classes for image similarity methods with timing statistics."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import time
|
|
6
|
-
from abc import ABC, abstractmethod
|
|
7
|
-
from collections import defaultdict
|
|
8
|
-
from dataclasses import dataclass
|
|
9
|
-
from typing import Literal, TypeVar
|
|
10
|
-
|
|
11
|
-
import numpy as np
|
|
12
|
-
import numpy.typing as npt
|
|
13
|
-
|
|
14
|
-
# Type variable for the prepared data type (hidden from external users)
# NOTE(review): the generic classes in this module declare their own PEP 695
# type parameters named PreparedT / BinKeyT, so these module-level objects look
# unused within this file - presumably kept for importers elsewhere; confirm
# before removing.
PreparedT = TypeVar("PreparedT")
# Type variable for the key under which prepared items are grouped into bins.
BinKeyT = TypeVar("BinKeyT")
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class TimingStats:
    """Running totals of wall-clock time, CPU time and call count.

    The three fields are plain accumulators; the ``avg_*`` properties derive
    per-call means from them on demand.
    """

    total_wall_time: float = 0.0
    total_cpu_time: float = 0.0
    call_count: int = 0

    def add_timing(self, wall_time: float, cpu_time: float) -> None:
        """Record one call that took ``wall_time`` / ``cpu_time``."""
        self.total_wall_time += wall_time
        self.total_cpu_time += cpu_time
        self.call_count += 1

    @property
    def avg_wall_time(self) -> float:
        """Mean wall-clock time per call; the raw total while fewer than two calls exist."""
        # Clamp the divisor to 1 so an empty accumulator yields 0.0, not ZeroDivisionError.
        calls = self.call_count if self.call_count > 1 else 1
        return self.total_wall_time / calls

    @property
    def avg_cpu_time(self) -> float:
        """Mean CPU time per call; the raw total while fewer than two calls exist."""
        calls = self.call_count if self.call_count > 1 else 1
        return self.total_cpu_time / calls

    def accumulate(self, other: TimingStats) -> None:
        """Fold the totals and call count of ``other`` into this instance."""
        self.total_wall_time += other.total_wall_time
        self.total_cpu_time += other.total_cpu_time
        self.call_count += other.call_count
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class SimilarityMethod[PreparedT](ABC):
|
|
51
|
-
"""Base class for all similarity methods with timing statistics.
|
|
52
|
-
|
|
53
|
-
Caching is handled externally by the parent application (e.g., PhotoFile objects).
|
|
54
|
-
This class provides the core prepare and compare operations with optional timing.
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
def __init__(self, method_name: ComparisonMethodName):
|
|
58
|
-
self.method_name: ComparisonMethodName = method_name
|
|
59
|
-
|
|
60
|
-
# Timing statistics
|
|
61
|
-
self._prepare_timing = TimingStats()
|
|
62
|
-
self._compare_timing = TimingStats()
|
|
63
|
-
|
|
64
|
-
@abstractmethod
|
|
65
|
-
def _prepare_single(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
66
|
-
"""Implement the actual preparation logic.
|
|
67
|
-
|
|
68
|
-
Args:
|
|
69
|
-
pixels: RGB pixel array with shape (height, width, 3), dtype uint8.
|
|
70
|
-
EXIF orientation already applied. Full resolution.
|
|
71
|
-
|
|
72
|
-
Returns:
|
|
73
|
-
Prepared data for comparison
|
|
74
|
-
"""
|
|
75
|
-
pass
|
|
76
|
-
|
|
77
|
-
@abstractmethod
|
|
78
|
-
def _compare_prepared(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
79
|
-
"""Compare two prepared items.
|
|
80
|
-
|
|
81
|
-
Args:
|
|
82
|
-
prep1: First prepared item
|
|
83
|
-
prep2: Second prepared item
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
Similarity score 0-1 (higher is more similar)
|
|
87
|
-
"""
|
|
88
|
-
pass
|
|
89
|
-
|
|
90
|
-
def _prepare_single_timed(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
91
|
-
"""Wrapper for _prepare_single that tracks timing."""
|
|
92
|
-
wall_start = time.perf_counter()
|
|
93
|
-
cpu_start = time.process_time()
|
|
94
|
-
|
|
95
|
-
result = self._prepare_single(pixels)
|
|
96
|
-
|
|
97
|
-
wall_elapsed = time.perf_counter() - wall_start
|
|
98
|
-
cpu_elapsed = time.process_time() - cpu_start
|
|
99
|
-
self._prepare_timing.add_timing(wall_elapsed, cpu_elapsed)
|
|
100
|
-
|
|
101
|
-
return result
|
|
102
|
-
|
|
103
|
-
def _compare_prepared_timed(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
104
|
-
"""Wrapper for _compare_prepared that tracks timing."""
|
|
105
|
-
wall_start = time.perf_counter()
|
|
106
|
-
cpu_start = time.process_time()
|
|
107
|
-
|
|
108
|
-
result = self._compare_prepared(prep1, prep2)
|
|
109
|
-
|
|
110
|
-
wall_elapsed = time.perf_counter() - wall_start
|
|
111
|
-
cpu_elapsed = time.process_time() - cpu_start
|
|
112
|
-
self._compare_timing.add_timing(wall_elapsed, cpu_elapsed)
|
|
113
|
-
|
|
114
|
-
return result
|
|
115
|
-
|
|
116
|
-
def prepare(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
117
|
-
"""Prepare pixel array for comparison.
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
pixels: RGB pixel array with shape (height, width, 3), dtype uint8.
|
|
121
|
-
EXIF orientation already applied. Full resolution.
|
|
122
|
-
|
|
123
|
-
Returns:
|
|
124
|
-
Prepared data for comparison
|
|
125
|
-
|
|
126
|
-
Example:
|
|
127
|
-
>>> from photo_compare import create_comparison_method
|
|
128
|
-
>>> method = create_comparison_method('dhash')
|
|
129
|
-
>>> with photo1.image_data() as img1, photo2.image_data() as img2:
|
|
130
|
-
... pixels1 = img1.get_pixels()
|
|
131
|
-
... pixels2 = img2.get_pixels()
|
|
132
|
-
... prep1 = method.prepare(pixels1)
|
|
133
|
-
... prep2 = method.prepare(pixels2)
|
|
134
|
-
... similarity = method.compare(prep1, prep2)
|
|
135
|
-
"""
|
|
136
|
-
return self._prepare_single_timed(pixels)
|
|
137
|
-
|
|
138
|
-
def compare(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
139
|
-
"""Compare two prepared items.
|
|
140
|
-
|
|
141
|
-
Args:
|
|
142
|
-
prep1: First prepared item
|
|
143
|
-
prep2: Second prepared item
|
|
144
|
-
|
|
145
|
-
Returns:
|
|
146
|
-
Similarity score 0-1 (higher is more similar)
|
|
147
|
-
"""
|
|
148
|
-
return self._compare_prepared_timed(prep1, prep2)
|
|
149
|
-
|
|
150
|
-
def timing_stats(self) -> dict[str, dict[str, float]]:
|
|
151
|
-
"""Get timing statistics for both preparation and comparison operations.
|
|
152
|
-
|
|
153
|
-
Returns:
|
|
154
|
-
Dictionary with 'preparation' and 'comparison' timing stats
|
|
155
|
-
"""
|
|
156
|
-
return {
|
|
157
|
-
"preparation": {
|
|
158
|
-
"total_wall_time": self._prepare_timing.total_wall_time,
|
|
159
|
-
"total_cpu_time": self._prepare_timing.total_cpu_time,
|
|
160
|
-
"call_count": self._prepare_timing.call_count,
|
|
161
|
-
"avg_wall_time": self._prepare_timing.avg_wall_time,
|
|
162
|
-
"avg_cpu_time": self._prepare_timing.avg_cpu_time,
|
|
163
|
-
},
|
|
164
|
-
"comparison": {
|
|
165
|
-
"total_wall_time": self._compare_timing.total_wall_time,
|
|
166
|
-
"total_cpu_time": self._compare_timing.total_cpu_time,
|
|
167
|
-
"call_count": self._compare_timing.call_count,
|
|
168
|
-
"avg_wall_time": self._compare_timing.avg_wall_time,
|
|
169
|
-
"avg_cpu_time": self._compare_timing.avg_cpu_time,
|
|
170
|
-
},
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
class BinningSimilarityMethod[PreparedT, BinKeyT](SimilarityMethod[PreparedT]):
|
|
175
|
-
"""Base class for methods that support binning (like hash-based methods).
|
|
176
|
-
|
|
177
|
-
Binning allows grouping of prepared items by their key for efficient
|
|
178
|
-
duplicate detection. Bins are maintained separately from the main cache.
|
|
179
|
-
"""
|
|
180
|
-
|
|
181
|
-
def __init__(self, method_name: ComparisonMethodName):
|
|
182
|
-
super().__init__(method_name)
|
|
183
|
-
self._bins: dict[BinKeyT, set[int]] = defaultdict(set)
|
|
184
|
-
self._file_to_bin: dict[int, BinKeyT] = {}
|
|
185
|
-
|
|
186
|
-
@abstractmethod
|
|
187
|
-
def _get_bin_key(self, prepared: PreparedT) -> BinKeyT:
|
|
188
|
-
"""Extract the binning key from prepared data.
|
|
189
|
-
|
|
190
|
-
Args:
|
|
191
|
-
prepared: Prepared data
|
|
192
|
-
|
|
193
|
-
Returns:
|
|
194
|
-
Bin key for grouping
|
|
195
|
-
"""
|
|
196
|
-
pass
|
|
197
|
-
|
|
198
|
-
def prepare_and_bin(self, file_id: int, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
199
|
-
"""Prepare pixel array and add to bins.
|
|
200
|
-
|
|
201
|
-
Args:
|
|
202
|
-
file_id: Unique identifier for the file
|
|
203
|
-
pixels: RGB pixel array (height, width, 3), dtype uint8
|
|
204
|
-
|
|
205
|
-
Returns:
|
|
206
|
-
Prepared data for comparison
|
|
207
|
-
"""
|
|
208
|
-
prepared = self.prepare(pixels)
|
|
209
|
-
bin_key = self._get_bin_key(prepared)
|
|
210
|
-
self._bins[bin_key].add(file_id)
|
|
211
|
-
self._file_to_bin[file_id] = bin_key
|
|
212
|
-
return prepared
|
|
213
|
-
|
|
214
|
-
def add_to_bin(self, file_id: int, prepared: PreparedT) -> None:
|
|
215
|
-
"""Add a prepared item to bins without re-preparing.
|
|
216
|
-
|
|
217
|
-
Args:
|
|
218
|
-
file_id: Unique identifier for the file
|
|
219
|
-
prepared: Pre-prepared data
|
|
220
|
-
"""
|
|
221
|
-
bin_key = self._get_bin_key(prepared)
|
|
222
|
-
self._bins[bin_key].add(file_id)
|
|
223
|
-
self._file_to_bin[file_id] = bin_key
|
|
224
|
-
|
|
225
|
-
def get_bin_candidates(self, file_id: int) -> set[int]:
|
|
226
|
-
"""Get all files in the same bin as the given file.
|
|
227
|
-
|
|
228
|
-
Args:
|
|
229
|
-
file_id: File ID to get candidates for
|
|
230
|
-
|
|
231
|
-
Returns:
|
|
232
|
-
Set of file IDs in same bin (excluding the query file)
|
|
233
|
-
"""
|
|
234
|
-
if file_id not in self._file_to_bin:
|
|
235
|
-
return set()
|
|
236
|
-
bin_key = self._file_to_bin[file_id]
|
|
237
|
-
return self._bins[bin_key] - {file_id} # Exclude self
|
|
238
|
-
|
|
239
|
-
def get_all_bins(self) -> dict[BinKeyT, set[int]]:
|
|
240
|
-
"""Get all bins for analysis.
|
|
241
|
-
|
|
242
|
-
Returns:
|
|
243
|
-
Dictionary mapping bin keys to sets of file IDs
|
|
244
|
-
"""
|
|
245
|
-
return dict(self._bins)
|
|
246
|
-
|
|
247
|
-
def get_duplicate_groups(self, min_group_size: int) -> list[set[int]]:
|
|
248
|
-
"""Get all bins with at least min_group_size files.
|
|
249
|
-
|
|
250
|
-
Args:
|
|
251
|
-
min_group_size: Minimum number of files in a group
|
|
252
|
-
|
|
253
|
-
Returns:
|
|
254
|
-
List of file ID sets representing potential duplicate groups
|
|
255
|
-
"""
|
|
256
|
-
return [file_set for file_set in self._bins.values() if len(file_set) >= min_group_size]
|
|
257
|
-
|
|
258
|
-
def clear_bins(self) -> None:
|
|
259
|
-
"""Clear all bin data."""
|
|
260
|
-
self._bins.clear()
|
|
261
|
-
self._file_to_bin.clear()
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
ComparisonMethodName = Literal[
    # Closed set of method identifiers, one category per line.
    "ahash", "dhash", "phash", "whash",  # Hash methods
    "sift", "akaze", "orb", "brisk",  # Feature methods
    "ssim", "ms_ssim", "hog",  # Structural methods
    "mse", "psnr",  # Pixel methods
    "colour_histogram", "hsv_histogram",  # Histogram methods
]
|
|
1
|
+
"""Base classes for image similarity methods with timing statistics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Literal, TypeVar
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import numpy.typing as npt
|
|
13
|
+
|
|
14
|
+
# Type variable for the prepared data type (hidden from external users)
# NOTE(review): the generic classes in this module declare their own PEP 695
# type parameters named PreparedT / BinKeyT, so these module-level objects look
# unused within this file - presumably kept for importers elsewhere; confirm
# before removing.
PreparedT = TypeVar("PreparedT")
# Type variable for the key under which prepared items are grouped into bins.
BinKeyT = TypeVar("BinKeyT")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class TimingStats:
    """Running totals of wall-clock time, CPU time and call count.

    The three fields are plain accumulators; the ``avg_*`` properties derive
    per-call means from them on demand.
    """

    total_wall_time: float = 0.0
    total_cpu_time: float = 0.0
    call_count: int = 0

    def add_timing(self, wall_time: float, cpu_time: float) -> None:
        """Record one call that took ``wall_time`` / ``cpu_time``."""
        self.total_wall_time += wall_time
        self.total_cpu_time += cpu_time
        self.call_count += 1

    @property
    def avg_wall_time(self) -> float:
        """Mean wall-clock time per call; the raw total while fewer than two calls exist."""
        # Clamp the divisor to 1 so an empty accumulator yields 0.0, not ZeroDivisionError.
        calls = self.call_count if self.call_count > 1 else 1
        return self.total_wall_time / calls

    @property
    def avg_cpu_time(self) -> float:
        """Mean CPU time per call; the raw total while fewer than two calls exist."""
        calls = self.call_count if self.call_count > 1 else 1
        return self.total_cpu_time / calls

    def accumulate(self, other: TimingStats) -> None:
        """Fold the totals and call count of ``other`` into this instance."""
        self.total_wall_time += other.total_wall_time
        self.total_cpu_time += other.total_cpu_time
        self.call_count += other.call_count
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class SimilarityMethod[PreparedT](ABC):
|
|
51
|
+
"""Base class for all similarity methods with timing statistics.
|
|
52
|
+
|
|
53
|
+
Caching is handled externally by the parent application (e.g., PhotoFile objects).
|
|
54
|
+
This class provides the core prepare and compare operations with optional timing.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def __init__(self, method_name: ComparisonMethodName):
|
|
58
|
+
self.method_name: ComparisonMethodName = method_name
|
|
59
|
+
|
|
60
|
+
# Timing statistics
|
|
61
|
+
self._prepare_timing = TimingStats()
|
|
62
|
+
self._compare_timing = TimingStats()
|
|
63
|
+
|
|
64
|
+
@abstractmethod
|
|
65
|
+
def _prepare_single(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
66
|
+
"""Implement the actual preparation logic.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
pixels: RGB pixel array with shape (height, width, 3), dtype uint8.
|
|
70
|
+
EXIF orientation already applied. Full resolution.
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Prepared data for comparison
|
|
74
|
+
"""
|
|
75
|
+
pass
|
|
76
|
+
|
|
77
|
+
@abstractmethod
|
|
78
|
+
def _compare_prepared(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
79
|
+
"""Compare two prepared items.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
prep1: First prepared item
|
|
83
|
+
prep2: Second prepared item
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
Similarity score 0-1 (higher is more similar)
|
|
87
|
+
"""
|
|
88
|
+
pass
|
|
89
|
+
|
|
90
|
+
def _prepare_single_timed(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
91
|
+
"""Wrapper for _prepare_single that tracks timing."""
|
|
92
|
+
wall_start = time.perf_counter()
|
|
93
|
+
cpu_start = time.process_time()
|
|
94
|
+
|
|
95
|
+
result = self._prepare_single(pixels)
|
|
96
|
+
|
|
97
|
+
wall_elapsed = time.perf_counter() - wall_start
|
|
98
|
+
cpu_elapsed = time.process_time() - cpu_start
|
|
99
|
+
self._prepare_timing.add_timing(wall_elapsed, cpu_elapsed)
|
|
100
|
+
|
|
101
|
+
return result
|
|
102
|
+
|
|
103
|
+
def _compare_prepared_timed(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
104
|
+
"""Wrapper for _compare_prepared that tracks timing."""
|
|
105
|
+
wall_start = time.perf_counter()
|
|
106
|
+
cpu_start = time.process_time()
|
|
107
|
+
|
|
108
|
+
result = self._compare_prepared(prep1, prep2)
|
|
109
|
+
|
|
110
|
+
wall_elapsed = time.perf_counter() - wall_start
|
|
111
|
+
cpu_elapsed = time.process_time() - cpu_start
|
|
112
|
+
self._compare_timing.add_timing(wall_elapsed, cpu_elapsed)
|
|
113
|
+
|
|
114
|
+
return result
|
|
115
|
+
|
|
116
|
+
def prepare(self, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
117
|
+
"""Prepare pixel array for comparison.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
pixels: RGB pixel array with shape (height, width, 3), dtype uint8.
|
|
121
|
+
EXIF orientation already applied. Full resolution.
|
|
122
|
+
|
|
123
|
+
Returns:
|
|
124
|
+
Prepared data for comparison
|
|
125
|
+
|
|
126
|
+
Example:
|
|
127
|
+
>>> from photo_compare import create_comparison_method
|
|
128
|
+
>>> method = create_comparison_method('dhash')
|
|
129
|
+
>>> with photo1.image_data() as img1, photo2.image_data() as img2:
|
|
130
|
+
... pixels1 = img1.get_pixels()
|
|
131
|
+
... pixels2 = img2.get_pixels()
|
|
132
|
+
... prep1 = method.prepare(pixels1)
|
|
133
|
+
... prep2 = method.prepare(pixels2)
|
|
134
|
+
... similarity = method.compare(prep1, prep2)
|
|
135
|
+
"""
|
|
136
|
+
return self._prepare_single_timed(pixels)
|
|
137
|
+
|
|
138
|
+
def compare(self, prep1: PreparedT, prep2: PreparedT) -> float:
|
|
139
|
+
"""Compare two prepared items.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
prep1: First prepared item
|
|
143
|
+
prep2: Second prepared item
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
Similarity score 0-1 (higher is more similar)
|
|
147
|
+
"""
|
|
148
|
+
return self._compare_prepared_timed(prep1, prep2)
|
|
149
|
+
|
|
150
|
+
def timing_stats(self) -> dict[str, dict[str, float]]:
|
|
151
|
+
"""Get timing statistics for both preparation and comparison operations.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
Dictionary with 'preparation' and 'comparison' timing stats
|
|
155
|
+
"""
|
|
156
|
+
return {
|
|
157
|
+
"preparation": {
|
|
158
|
+
"total_wall_time": self._prepare_timing.total_wall_time,
|
|
159
|
+
"total_cpu_time": self._prepare_timing.total_cpu_time,
|
|
160
|
+
"call_count": self._prepare_timing.call_count,
|
|
161
|
+
"avg_wall_time": self._prepare_timing.avg_wall_time,
|
|
162
|
+
"avg_cpu_time": self._prepare_timing.avg_cpu_time,
|
|
163
|
+
},
|
|
164
|
+
"comparison": {
|
|
165
|
+
"total_wall_time": self._compare_timing.total_wall_time,
|
|
166
|
+
"total_cpu_time": self._compare_timing.total_cpu_time,
|
|
167
|
+
"call_count": self._compare_timing.call_count,
|
|
168
|
+
"avg_wall_time": self._compare_timing.avg_wall_time,
|
|
169
|
+
"avg_cpu_time": self._compare_timing.avg_cpu_time,
|
|
170
|
+
},
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class BinningSimilarityMethod[PreparedT, BinKeyT](SimilarityMethod[PreparedT]):
|
|
175
|
+
"""Base class for methods that support binning (like hash-based methods).
|
|
176
|
+
|
|
177
|
+
Binning allows grouping of prepared items by their key for efficient
|
|
178
|
+
duplicate detection. Bins are maintained separately from the main cache.
|
|
179
|
+
"""
|
|
180
|
+
|
|
181
|
+
def __init__(self, method_name: ComparisonMethodName):
|
|
182
|
+
super().__init__(method_name)
|
|
183
|
+
self._bins: dict[BinKeyT, set[int]] = defaultdict(set)
|
|
184
|
+
self._file_to_bin: dict[int, BinKeyT] = {}
|
|
185
|
+
|
|
186
|
+
@abstractmethod
|
|
187
|
+
def _get_bin_key(self, prepared: PreparedT) -> BinKeyT:
|
|
188
|
+
"""Extract the binning key from prepared data.
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
prepared: Prepared data
|
|
192
|
+
|
|
193
|
+
Returns:
|
|
194
|
+
Bin key for grouping
|
|
195
|
+
"""
|
|
196
|
+
pass
|
|
197
|
+
|
|
198
|
+
def prepare_and_bin(self, file_id: int, pixels: npt.NDArray[np.uint8]) -> PreparedT:
|
|
199
|
+
"""Prepare pixel array and add to bins.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
file_id: Unique identifier for the file
|
|
203
|
+
pixels: RGB pixel array (height, width, 3), dtype uint8
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Prepared data for comparison
|
|
207
|
+
"""
|
|
208
|
+
prepared = self.prepare(pixels)
|
|
209
|
+
bin_key = self._get_bin_key(prepared)
|
|
210
|
+
self._bins[bin_key].add(file_id)
|
|
211
|
+
self._file_to_bin[file_id] = bin_key
|
|
212
|
+
return prepared
|
|
213
|
+
|
|
214
|
+
def add_to_bin(self, file_id: int, prepared: PreparedT) -> None:
|
|
215
|
+
"""Add a prepared item to bins without re-preparing.
|
|
216
|
+
|
|
217
|
+
Args:
|
|
218
|
+
file_id: Unique identifier for the file
|
|
219
|
+
prepared: Pre-prepared data
|
|
220
|
+
"""
|
|
221
|
+
bin_key = self._get_bin_key(prepared)
|
|
222
|
+
self._bins[bin_key].add(file_id)
|
|
223
|
+
self._file_to_bin[file_id] = bin_key
|
|
224
|
+
|
|
225
|
+
def get_bin_candidates(self, file_id: int) -> set[int]:
|
|
226
|
+
"""Get all files in the same bin as the given file.
|
|
227
|
+
|
|
228
|
+
Args:
|
|
229
|
+
file_id: File ID to get candidates for
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
Set of file IDs in same bin (excluding the query file)
|
|
233
|
+
"""
|
|
234
|
+
if file_id not in self._file_to_bin:
|
|
235
|
+
return set()
|
|
236
|
+
bin_key = self._file_to_bin[file_id]
|
|
237
|
+
return self._bins[bin_key] - {file_id} # Exclude self
|
|
238
|
+
|
|
239
|
+
def get_all_bins(self) -> dict[BinKeyT, set[int]]:
|
|
240
|
+
"""Get all bins for analysis.
|
|
241
|
+
|
|
242
|
+
Returns:
|
|
243
|
+
Dictionary mapping bin keys to sets of file IDs
|
|
244
|
+
"""
|
|
245
|
+
return dict(self._bins)
|
|
246
|
+
|
|
247
|
+
def get_duplicate_groups(self, min_group_size: int) -> list[set[int]]:
|
|
248
|
+
"""Get all bins with at least min_group_size files.
|
|
249
|
+
|
|
250
|
+
Args:
|
|
251
|
+
min_group_size: Minimum number of files in a group
|
|
252
|
+
|
|
253
|
+
Returns:
|
|
254
|
+
List of file ID sets representing potential duplicate groups
|
|
255
|
+
"""
|
|
256
|
+
return [file_set for file_set in self._bins.values() if len(file_set) >= min_group_size]
|
|
257
|
+
|
|
258
|
+
def clear_bins(self) -> None:
|
|
259
|
+
"""Clear all bin data."""
|
|
260
|
+
self._bins.clear()
|
|
261
|
+
self._file_to_bin.clear()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
ComparisonMethodName = Literal[
    # Closed set of method identifiers, one category per line.
    "ahash", "dhash", "phash", "whash",  # Hash methods
    "sift", "akaze", "orb", "brisk",  # Feature methods
    "ssim", "ms_ssim", "hog",  # Structural methods
    "mse", "psnr",  # Pixel methods
    "colour_histogram", "hsv_histogram",  # Histogram methods
]
|