pixmatch 0.0.4-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixmatch might be problematic.
- pixmatch/__init__.py +253 -114
- pixmatch/__main__.py +2 -3
- pixmatch/utils.py +5 -3
- {pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/METADATA +1 -1
- pixmatch-0.0.5.dist-info/RECORD +8 -0
- pixmatch-0.0.4.dist-info/RECORD +0 -8
- {pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/WHEEL +0 -0
- {pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/licenses/LICENSE +0 -0
- {pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/top_level.txt +0 -0
pixmatch/__init__.py
CHANGED
@@ -5,36 +5,58 @@ import time

 from collections import defaultdict
 from dataclasses import dataclass, field
-from
+from functools import wraps
+from multiprocessing import Manager, Pool
 from pathlib import Path
 from threading import Event
-from typing import Union
-from zipfile import ZipFile
+from typing import ClassVar, Union
+from zipfile import BadZipFile, ZipFile

 import imagehash
 import numpy as np

-from PIL import Image
+from PIL import Image, ImageFile, UnidentifiedImageError
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow damaged images

 logger = logging.getLogger(__name__)


 @dataclass(frozen=True)
 class ZipPath:
+    """
+    A general object describing a Path.
+
+    All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.
+
+    Attributes:
+        path (str): The path to the file.
+        subpath (str): The subpath in the zip if `path` is for a zip.
+    """
+    # TODO: At some point convert this to Path.
+    # When I tried that last it introduced problems with inter-process communication
     path: str
     subpath: str

     @property
-    def path_obj(self):
+    def path_obj(self) -> Path:
+        """Get the path as as Path object"""
         return Path(self.path)

     @property
     def is_gif(self) -> bool:
+        """Is this a path to an animated image?"""
         movie_extensions = {'.gif', '.webp'}
         return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
             or (self.subpath and self.subpath[-4:].lower() in movie_extensions)

+    @property
+    def is_zip(self) -> bool:
+        """Does this point to a file located in a zip?"""
+        return bool(self.subpath)
+
     def absolute(self):
+        """Get the absolute version of this ZipPath"""
         return ZipPath(str(self.path_obj.absolute()), self.subpath)

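The new ZipPath docstring above is the key abstraction for the rest of the diff. A minimal usage sketch of the semantics it documents (the file paths here are hypothetical, not from the package):

    from pixmatch import ZipPath

    plain = ZipPath("/photos/cat.jpg", "")                    # empty subpath: a regular file
    zipped = ZipPath("/photos/archive.zip", "pics/dog.gif")   # a member inside a zip

    assert not plain.is_zip
    assert zipped.is_zip            # a truthy subpath marks a zip member
    assert zipped.is_gif            # the subpath's ".gif" suffix counts as animated
    print(plain.absolute())         # a new ZipPath with an absolute filesystem path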
@@ -42,59 +64,66 @@ def _is_under(folder_abs: str, target: str | Path) -> bool:
     """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
     try:
         Path(target).absolute().relative_to(Path(folder_abs).absolute())
-        return True
     except ValueError:
         return False

+    return True
+

 def phash_params_for_strength(strength: int) -> tuple[int, int]:
+    """
+    Convert a 0-10 strength to settings for imagehash
+
+    Returns:
+        tuple<int, int>: The hash size (in bytes) and the high frequency factor
+    """
     # TODO: This sucks.
     strength = max(0, min(10, strength))
     if strength >= 10:
-        return 16, 4
-
+        return 16, 4
+    if strength >= 8:
         return 15, 4
-
+    if strength >= 7:
         return 13, 4
-
+    if strength >= 6:
         return 11, 4
-
+    if strength >= 5:
         return 9, 4
-
+    if strength >= 4:
         return 8, 4
-
+    if strength >= 3:
         return 8, 3
-
+    if strength >= 2:
         return 7, 3
-
-    return 6, 3
+    return 6, 3


-def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
+def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
     """
     Calculate hashes for a given file.

     Args:
         f (IO or str or Path): Either a file path to process, or a in-memory BytesIO object ready for reading.
-        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
         strength (int): A number between 0 and 10 on the strength of the matches.
+        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
         exact_match (bool): Use exact SHA256 hahes?
             If true, strength must be 10.
             If false, perceptual hashes will be used, even with high strength.

     Returns:
-
+        tuple[str, set]: The first element is the primary hash,
+        the second element are any secondary hashes representing rotations, flips, etc...
     """
     if exact_match:
         hasher = hashlib.sha256()
         block_size = 65536
-        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
+        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:  # noqa: PTH123
             for block in iter(lambda: file.read(block_size), b""):
                 hasher.update(block)
-        return
+        return hasher.hexdigest(), set()

     hash_size, highfreq_factor = phash_params_for_strength(strength)
-    with
+    with Image.open(f) as im:
         if is_gif:
             initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
             # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
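The strength ladder above only ever changes two imagehash parameters. A small sketch that tabulates the mapping (assuming pixmatch exports phash_params_for_strength at package level, as the diff suggests):

    from pixmatch import phash_params_for_strength

    for strength in range(11):
        hash_size, highfreq_factor = phash_params_for_strength(strength)
        # imagehash.phash yields a hash_size x hash_size bit matrix, so higher
        # strength means more bits compared and fewer false positives.
        print(f"strength={strength}: hash_size={hash_size}, "
              f"highfreq_factor={highfreq_factor}, bits={hash_size * hash_size}")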
@@ -112,7 +141,7 @@ def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
                        for r_i, r in enumerate(initial_hash.hash)):
             try:
                 im.seek(im.tell() + 1)
-            except EOFError:
+            except EOFError:  # noqa: PERF203
                 break
             else:
                 initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
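The frame loop above leans on Pillow's standard idiom for animated images: seek() advances one frame and EOFError signals the end. A standalone sketch of that idiom (the input file name is hypothetical):

    from PIL import Image

    with Image.open("animation.gif") as im:  # hypothetical input
        n_frames = 1
        while True:
            try:
                im.seek(im.tell() + 1)       # advance to the next frame
            except EOFError:                 # past the last frame
                break
            n_frames += 1
        print(n_frames, "frames")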
@@ -120,63 +149,122 @@ def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):

         # For GIFs we'll look for mirrored versions but thats it
         flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-
-
-
-        flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-        images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
-                  flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
-                  flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
-        return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
-
+            extras = (flipped_h_image, )
+        else:
+            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)

-
+            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+            flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+            extras = (im.rotate(90), im.rotate(180), im.rotate(270),
+                      flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
+                      flipped_h_image.rotate(270),
+                      flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
+                      flipped_v_image.rotate(270))
+
+    return str(initial_hash), {
+        str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
+    }
+
+
+def thread_error_handler(func):
+    """An error handler for the thread to return information about where the error occurred"""
+
+    @wraps(func)
+    def wrapper(path, *args, **kwargs):  # noqa: ANN202
+        try:
+            return func(path, *args, **kwargs)
+        except Exception as e:
+            e.input_path = path
+            raise
+
+    return wrapper
+
+
+@thread_error_handler
+def _process_image(
+        path: str | Path,
+        supported_extensions: set | None = None,
+        strength: int = 5,
+        *,
+        exact_match: bool = False,
+) -> tuple[Path, tuple | dict[str, tuple]]:
+    """Get the hashes for a given path. Is multiprocessing compatible"""
     path = Path(path)
     if path.suffix.lower() != '.zip':
         return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
                                       strength=strength, exact_match=exact_match)

-
+    if not supported_extensions:
+        supported_extensions = ImageMatcher.SUPPORTED_EXTS
+
+    results = {}
     with ZipFile(path) as zf:
         for f in zf.filelist:
-
-
-
+            f_ext = f.filename[-4:].lower()
+            if f_ext not in supported_extensions:
+                continue
+
+            if f_ext == '.zip':
+                logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
+                continue
+
+            try:
+                with zf.open(f) as zipped_file:
+                    results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in {".gif", ".webp"},
+                                                           strength=strength, exact_match=exact_match)
+            except BadZipFile as e:
+                logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
+            except UnidentifiedImageError:
+                logger.warning("Could not identify image %s in %s", f.filename, path)

     return path, results


 @dataclass
 class ImageMatch:
+    """A match data structure containing the matches and where this match lies in the match list"""
     match_i: int | None = field(default=None)
     matches: list[ZipPath] = field(default_factory=list)


+# region Events
 @dataclass(frozen=True)
 class NewGroup:
-
+    """A new group event"""
+    group: "ImageMatch"


 @dataclass(frozen=True)
 class NewMatch:
+    """A new match event"""
     group: "ImageMatch"
     path: ZipPath


 @dataclass(frozen=True)
 class Finished:
-
+    """A finished event"""


 MatcherEvent = Union[NewGroup, NewMatch, Finished]
+# endregion


-# TODO: FINISHED signal?
 class ImageMatcher:
-
+    """
+    An image matching SDK
+
+    Args:
+        strength (int): The 0-10 strength to use for matching. Defaults to 5.
+        exact_match (bool): Should use SHA-256 hashes? If False, the default, will use perceptual hashes.
+            If True, strength must be 10.
+        processes (int): The number of processes to use. Defaults to None.
+        extensions (set): The extensions to process. Optional.
+    """
+    SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}

-    def __init__(self, strength: int = 5,
-
+    def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
+                 *, exact_match: bool = False):
         if not (0 <= strength <= 10):
             raise ValueError("Strength must be between 0 and 10!")

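calculate_hashes now returns a (primary, variants) pair of hash strings instead of a list, and _process_image tags failures with the offending path via thread_error_handler. A sketch of consuming the new return shape (the input file is hypothetical):

    from pixmatch import calculate_hashes

    primary, variants = calculate_hashes("image.jpg", strength=5)
    print(primary)        # hash of the original orientation, used as the lookup key
    print(len(variants))  # hashes of rotations/flips, matched as secondary keys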
@@ -185,57 +273,63 @@ class ImageMatcher:
         self.strength = strength
         self.exact_match = exact_match
         self.processes = processes
+
         self.found_images = 0
         self.processed_images = 0
         self.duplicate_images = 0
+        self.matches = []

         m = Manager()
-        self.events = m.Queue()
-        self._new_paths = m.Queue()
-        self._removed_paths = set()
-        self.
-        self.
-        self.
-
+        self.events = m.Queue()  # Events to go to higher level users
+        self._new_paths = m.Queue()  # Inbound queue for new paths that are added while processing is running
+        self._removed_paths = set()  # Paths that have been removed from processing after processing has been started
+        self._ignored_files = set()  # Files which have been ignored and should be skipped from processing if re-ran
+        self._processed_zips = {}  # Zips that have been successfully processed
+        self._hashes = defaultdict(ImageMatch)  # Hash -> Paths
+        self._reverse_hashes = {}  # Path -> Hash
+
+        # Pausing and finished signaling...
         self._not_paused = Event()
         self._not_paused.set()
         self._finished = Event()
         self._finished.set()

-
+    @property
+    def left_to_process(self):
+        """Files that are left to process"""
+        return self.found_images - self.processed_images

     def add_path(self, path: str | Path):
+        """Add a path for processing"""
         path = str(Path(path).absolute())
         self._removed_paths.discard(path)
         self._new_paths.put(path)

-    def remove_path(self, folder: str | Path)
+    def remove_path(self, folder: str | Path):
         """
         Mark a folder to be skipped going forward, and remove already-indexed files
         that live under it. Pauses briefly if not already paused to keep state sane.
         """
+        # TODO: This works but the biggest problem with it is that it will not remove any images which are still
+        # queue'd up for processing in the ThreadPool... I'm not sure how to fix that yet.
         folder = str(Path(folder).absolute())
         paused = self.conditional_pause()
         self._removed_paths.add(folder)
-        self._processed_paths.discard(folder)

         # Remove anything we've already seen under that folder
         # (iterate over a copy because remove() mutates structures)
-        to_remove = [p for p in self._reverse_hashes
+        to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
         for p in to_remove:
             self.remove(p)

-        self.
-
-
-    def left_to_process(self):
-        return self.found_images - self.processed_images
+        to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
+        for p in to_remove_zips:
+            self._processed_zips.pop(p)

-
-        logger.debug('Performing pause')
-        self._not_paused.clear()
+        self.conditional_resume(paused)

-    def conditional_pause(self):
+    def conditional_pause(self) -> bool:
+        """Pause if not paused and return if was paused"""
         _conditional_pause = self.is_paused()
         if not _conditional_pause:
             logger.debug('Performing conditional pause')
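With the queues and events wired up as above, a caller can drive the matcher from its own thread and drain matcher.events. A hedged sketch of that loop, using only names introduced in this diff (the folder path is hypothetical, and whether run() always enqueues a Finished event is an assumption based on the MatcherEvent union):

    from threading import Thread

    from pixmatch import Finished, ImageMatcher, NewGroup, NewMatch

    matcher = ImageMatcher(strength=5, processes=4)
    Thread(target=matcher.run, args=(["/photos"],), daemon=True).start()

    while True:
        event = matcher.events.get()      # Manager().Queue(), safe across processes
        if isinstance(event, NewGroup):
            print("new duplicate group:", event.group.matches)
        elif isinstance(event, NewMatch):
            print("another copy:", event.path)
        elif isinstance(event, Finished):
            break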
@@ -243,47 +337,59 @@ class ImageMatcher:

         return _conditional_pause

-    def conditional_resume(self, was_paused):
+    def conditional_resume(self, was_paused: bool):  # noqa: FBT001
+        """Resume if not paused previous (from call to `conditional_pause`)"""
         if not was_paused and not self.is_finished():
             logger.debug('Performing conditional resume')
             self.resume()

+    def pause(self):
+        """Pause processing"""
+        logger.debug('Performing pause')
+        self._not_paused.clear()
+
     def is_paused(self):
+        """Is processing paused"""
         return not self._not_paused.is_set()

     def finish(self):
+        """Finish processing"""
         logger.debug('Performing finished')
         self._finished.set()

     def is_finished(self):
+        """Is processing finished"""
         return self._finished.is_set()

     def resume(self):
+        """Resume processing"""
         logger.debug('Performing resume')
         self._not_paused.set()

     def running(self):
+        """Currently running and loading hashes?"""
         return not self.is_paused() and (not self.is_finished() or self.left_to_process)

     def remove(self, path):
+        """Remove a loaded path completely from the image matching system. Will not delete a file."""
         # Pause things while we remove things...
         logger.info('Removing %s from %s', path, self.__class__.__name__)
         paused = self.conditional_pause()

-
-        self._hashes[
-        if len(self._hashes[
-            match_i = self._hashes[
+        hash_ = self._reverse_hashes.pop(path)
+        self._hashes[hash_].matches.remove(path)
+        if len(self._hashes[hash_].matches) == 1:
+            match_i = self._hashes[hash_].match_i
             logger.debug('Unmatching match group %s', match_i)
-            self._hashes[
+            self._hashes[hash_].match_i = None

             del self.matches[match_i]
             self.refresh_match_indexes(match_i)
             self.duplicate_images -= 2

-        elif not self._hashes[
+        elif not self._hashes[hash_].matches:
             logger.debug('Removing empty match group')
-            del self._hashes[
+            del self._hashes[hash_]

         else:
             logger.debug('Simple removal performed')
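conditional_pause()/conditional_resume() form a pause bracket: stop the pipeline only if it was running, and restore the previous state afterwards, which is exactly how remove() and remove_path() use them above. A sketch of the pattern as a reusable helper (the helper itself is hypothetical, not part of the package):

    def with_matcher_paused(matcher, mutate):
        # Pause only if the matcher was running, then restore the prior state.
        was_paused = matcher.conditional_pause()
        try:
            mutate()                     # e.g. edit matcher internals safely
        finally:
            matcher.conditional_resume(was_paused)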
@@ -293,81 +399,111 @@ class ImageMatcher:
         self.found_images -= 1
         self.conditional_resume(paused)

+    def ignore(self, path):
+        """Remove a path from the image matching service"""
+        self.remove(path)
+
+        if path.path_obj.suffix.lower() != '.zip':
+            self._ignored_files.add(path.path)
+
     def refresh_match_indexes(self, start=0):
+        """Update the match_i value for all the matches passed a certain point"""
         for match_i, match in enumerate(self.matches[start:], start=start):
             match.match_i = match_i

     def _process_image_callback(self, result):
+        """
+        Handle the result of hashing an image.
+
+        This needs to do quite a few things including sanitizing the results,
+        actually checking if the hash matches an existing image,
+        adding the image and any matches to the backend data structures, notify any listeners,
+        update the found and processed image counts,
+        and verify that this result wasn't added as a removed path since it was queued.
+
+        Args:
+            result: A tuple consisting of the path to the file, and the resultant hashes.
+                If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
+                the individual zip files will sanitized and re-ran through this callback.
+        """
+        # TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
+        # Perhaps create a new queue/thread and queue up processing for zip results?
+        # I think the major slow point is adding to the data structures and I'm not sure if more threads will help
+        # Check for paused or finished signals
         self._not_paused.wait()
         if self.is_finished():
             return

+        # region Sanitize results
         path: Path | str | ZipPath
         path, hashes = result

         if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
+            # This image was removed AFTER it was queue'd! So decrement the found images count and just leave...
             self.found_images -= 1
             return

         if isinstance(hashes, dict):
             self.found_images -= 1
+            subpaths = []
             for sub_path, sub_hashes in hashes.items():
                 self.found_images += 1
-
+                subpaths.append(ZipPath(str(path), sub_path))
+                self._process_image_callback((subpaths[-1], sub_hashes))
+            self._processed_zips[str(path)] = subpaths
             return

+        initial_hash, extra_hashes = hashes
+        extra_hashes.add(initial_hash)
         if not isinstance(path, ZipPath):
+            # From this point on, EVERYTHING should be a ZipPath
             path = ZipPath(str(path), "")
+        # endregion

         if path in self._reverse_hashes:
             self.found_images -= 1
             return

         self.processed_images += 1
-        for hash_ in hashes:
-            if hash_ not in self._hashes:
-                continue
-
-            self._reverse_hashes[path] = hash_
-
-            # This appears to be a new match!
-            for match in self._hashes[hash_].matches:
-                if path.absolute() == match.absolute():
-                    # This appears to be a duplicate PATH...
-                    logger.warning('Duplicate files entered! %s, %s', path, match)
-                    return

-
-
-
-
-
-            self.duplicate_images += 2
-            self.events.put(NewGroup(self._hashes[hash_]))
-            logger.debug('New match group found: %s', self._hashes[hash_].matches)
-        else:
-            # Just another match for an existing group...
-            self.duplicate_images += 1
-            self.events.put(NewMatch(self._hashes[hash_], path))
-            logger.debug('New match found for group #%s: %s',
-                         self._hashes[hash_].match_i,
-                         self._hashes[hash_].matches)
-
-            break
-        else:
-            # This is a new hash, so just add it to the hashmap and move on...
+        # From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
+        # iterate over a list and do an `is in` check for each hash
+        found_hashes = self._hashes.keys() & extra_hashes
+        if not found_hashes:
+            # This is a new image not matching any previous, so just add it to the hashmap and move on...
             # Just use the initial orientation
-            hash_ =
+            hash_ = initial_hash
             self._reverse_hashes[path] = hash_
             self._hashes[hash_].matches.append(path)
             return

+        # We have found a match!
+        hash_ = next(iter(found_hashes))
+        self._reverse_hashes[path] = hash_
+        self._hashes[hash_].matches.append(path)
+
+        if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
+            # This is a brand new match group!
+            self._hashes[hash_].match_i = len(self.matches)
+            self.matches.append(self._hashes[hash_])
+            self.duplicate_images += 2
+            self.events.put(NewGroup(self._hashes[hash_]))
+            logger.debug('New match group found: %s', self._hashes[hash_].matches)
+        else:
+            # Just another match for an existing group...
+            self.duplicate_images += 1
+            self.events.put(NewMatch(self._hashes[hash_], path))
+            logger.debug('New match found for group #%s: %s',
+                         self._hashes[hash_].match_i,
+                         self._hashes[hash_].matches)
+
     def _process_image_error_callback(self, e):
+        """Temporary for testing"""
         self.processed_images += 1
-
+        logger.error("%s: %s (input path %s)", type(e), e, e.input_path)

     def _root_stream(self):
-
+        """This is to yield any paths for processing, then wait until processing is finished for any new paths"""
         while not self._new_paths.empty() or self.left_to_process:
             if self._new_paths.empty():
                 time.sleep(0.05)
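The lookup rewrite above replaces a per-hash loop with a single set intersection; dict.keys() returns a set-like view, so the intersection runs at C speed. A standalone sketch of the before/after shapes (the sample values are hypothetical):

    hashes = {"a1": 1, "b2": 2, "c3": 3}      # stand-in for self._hashes
    candidates = {"b2", "zz", "c3"}           # primary hash plus rotated/flipped variants

    # Old shape: Python-level iteration with a membership test per hash.
    found_old = next((h for h in candidates if h in hashes), None)

    # New shape: one C-level intersection, then take any matching element.
    found_hashes = hashes.keys() & candidates
    found_new = next(iter(found_hashes)) if found_hashes else None
    print(found_old, found_new)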
@@ -376,9 +512,7 @@
             yield self._new_paths.get_nowait()

     def run(self, paths: list[str | Path]):
-
-        # TODO: Verify none of the dirs have been deleted after we started
-
+        """Do the work of matching!"""
         self._not_paused.set()
         self._finished.clear()

@@ -393,13 +527,14 @@ class ImageMatcher:
                 continue

             path = str(path.absolute())
-            if path in self._removed_paths
+            if path in self._removed_paths:
                 continue

             for root, dirs, files in os.walk(path):
                 if self.is_finished():
                     break

+                dirs.sort()  # This actually works to ensure that os.walk goes in alphabetical order!
                 root = Path(root)

                 if any(_is_under(d, root) for d in self._removed_paths):
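The one-line dirs.sort() works because os.walk (with the default topdown=True) re-reads the dirs list it handed you before descending, so mutating it in place controls traversal. A standalone sketch of the same trick:

    import os

    for root, dirs, files in os.walk("."):
        dirs.sort()                                  # visit subfolders alphabetically
        dirs[:] = [d for d in dirs if d != ".git"]   # in-place edits can also prune
        print(root)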
@@ -418,8 +553,13 @@
                     if any(_is_under(d, f) for d in self._removed_paths):
                         continue

-
-
+                    if str(f) in self._ignored_files:
+                        continue
+
+                    if f.suffix.lower() == '.zip':
+                        if str(f.absolute()) in self._processed_zips:
+                            continue
+                    elif ZipPath(str(f), "") in self._reverse_hashes:
                         continue

                     self.found_images += 1
@@ -428,14 +568,13 @@
                         args=(f, ),
                         kwds={
                             'strength': self.strength,
+                            'supported_extensions': self.extensions,
                             'exact_match': self.exact_match,
                         },
                         callback=self._process_image_callback,
                         error_callback=self._process_image_error_callback,
                     )

-            self._processed_paths.add(path)
-
         tp.close()

         if not self.is_finished():
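The run() loop above fans work out with Pool.apply_async and collects results through callback/error_callback, which is why thread_error_handler attaches input_path to exceptions: the error callback only receives the exception object. A reduced standalone sketch of that wiring (the worker function is hypothetical):

    from multiprocessing import Pool

    def work(x):
        return x * x

    if __name__ == "__main__":
        with Pool(2) as tp:
            for i in range(5):
                tp.apply_async(work, args=(i,),
                               callback=print,          # runs on a pool helper thread
                               error_callback=print)    # receives the raised exception
            tp.close()
            tp.join()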
pixmatch/__main__.py
CHANGED
@@ -8,10 +8,9 @@ from PySide6 import QtWidgets

 from pixmatch.gui import MainWindow

-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Process zero or more file paths."
+        description="Process zero or more file paths.",
     )
     parser.add_argument(
         "folders",
@@ -41,7 +40,7 @@ if __name__ == "__main__":
         QToolBar { spacing: 8px; }
         QLabel#GroupTitle { padding: 4px 0; }
         QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
-        """
+        """,
     )
     w = MainWindow(args.folders)
     w.show()
pixmatch/utils.py
CHANGED
@@ -1,12 +1,11 @@
 from typing import Iterable

-
 def human_bytes(
     n: int,
     *,
     base: int = 1000,
     decimals: int = 0,
-    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb")
+    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
 ) -> str:
     """
     Convert a byte count to a human-readable string.
@@ -19,6 +18,9 @@ def human_bytes(

     Returns:
         A compact string like '66kb', '1mb', '1.5gb', or '999b'.
+
+    Raises:
+        ValueError: If n < 0
     """
     if n < 0:
         raise ValueError("Byte size cannot be negative")
@@ -31,6 +33,6 @@ def human_bytes(

     if i == 0 or decimals == 0:
         # Bytes or integer formatting requested
-        return f"{int(n
+        return f"{int(n)}{tuple(units)[i]}"

     return f"{n:.{decimals}f}{tuple(units)[i]}".rstrip("0").rstrip(".")
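Usage sketch for human_bytes, matching the examples given in its own docstring (base 1000, lowercase units):

    from pixmatch.utils import human_bytes

    print(human_bytes(999))                        # '999b'
    print(human_bytes(66_000))                     # '66kb'
    print(human_bytes(1_500_000_000, decimals=1))  # '1.5gb'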
pixmatch-0.0.5.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+pixmatch/__init__.py,sha256=U-puf9cK1V5Ooz9xRt1e4lMgQvYnCNmS_vmyUoSXIgw,22072
+pixmatch/__main__.py,sha256=DVd1-B2O-0PC2lPgl40xDN277SPSHwOiE6pFGxK-xO0,1548
+pixmatch/utils.py,sha256=4dHALWtt9y3EIdRLiM3GfRUho3xfn3QErQ69R20A1Lw,1120
+pixmatch-0.0.5.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+pixmatch-0.0.5.dist-info/METADATA,sha256=Q_9R3LJVmD94mE0F8k6TlsbJymnVMpmu2xLRzomT348,3540
+pixmatch-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pixmatch-0.0.5.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
+pixmatch-0.0.5.dist-info/RECORD,,
pixmatch-0.0.4.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-pixmatch/__init__.py,sha256=3Q8jnIN4vXq0zJxlrgNXnpw0rdFZmwgsgxtTmcyzfyo,15682
-pixmatch/__main__.py,sha256=cLcDXW228kPcAH5b66MP5eIEFHz6WNuOgqDpPchUke0,1547
-pixmatch/utils.py,sha256=TLYFeMg35B62EUafErq3yaA9YC0O6Kcd3Ao4fSpTwoE,1090
-pixmatch-0.0.4.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
-pixmatch-0.0.4.dist-info/METADATA,sha256=Q_YzWyT6Iduiady1dAM_k0vd6VP8I_GLrC4bB5DuKiM,3540
-pixmatch-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-pixmatch-0.0.4.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
-pixmatch-0.0.4.dist-info/RECORD,,
{pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/WHEEL
File without changes
{pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/licenses/LICENSE
File without changes
{pixmatch-0.0.4.dist-info → pixmatch-0.0.5.dist-info}/top_level.txt
File without changes