pixmatch 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixmatch might be problematic.
- {pixmatch-0.0.3 → pixmatch-0.0.5}/PKG-INFO +1 -1
- pixmatch-0.0.5/pixmatch/__init__.py +585 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch/__main__.py +2 -3
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch/utils.py +5 -3
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch.egg-info/PKG-INFO +1 -1
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pyproject.toml +8 -1
- pixmatch-0.0.3/pixmatch/__init__.py +0 -444
- {pixmatch-0.0.3 → pixmatch-0.0.5}/LICENSE +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/README.md +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch.egg-info/SOURCES.txt +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch.egg-info/dependency_links.txt +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch.egg-info/requires.txt +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch.egg-info/top_level.txt +0 -0
- {pixmatch-0.0.3 → pixmatch-0.0.5}/setup.cfg +0 -0

pixmatch-0.0.5/pixmatch/__init__.py
@@ -0,0 +1,585 @@
+import hashlib
+import logging
+import os
+import time
+
+from collections import defaultdict
+from dataclasses import dataclass, field
+from functools import wraps
+from multiprocessing import Manager, Pool
+from pathlib import Path
+from threading import Event
+from typing import ClassVar, Union
+from zipfile import BadZipFile, ZipFile
+
+import imagehash
+import numpy as np
+
+from PIL import Image, ImageFile, UnidentifiedImageError
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow damaged images
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class ZipPath:
+    """
+    A general object describing a Path.
+
+    All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.
+
+    Attributes:
+        path (str): The path to the file.
+        subpath (str): The subpath in the zip if `path` is for a zip.
+    """
+    # TODO: At some point convert this to Path.
+    # When I tried that last it introduced problems with inter-process communication
+    path: str
+    subpath: str
+
+    @property
+    def path_obj(self) -> Path:
+        """Get the path as a Path object"""
+        return Path(self.path)
+
+    @property
+    def is_gif(self) -> bool:
+        """Is this a path to an animated image?"""
+        movie_extensions = {'.gif', '.webp'}
+        return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
+            or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
+
+    @property
+    def is_zip(self) -> bool:
+        """Does this point to a file located in a zip?"""
+        return bool(self.subpath)
+
+    def absolute(self):
+        """Get the absolute version of this ZipPath"""
+        return ZipPath(str(self.path_obj.absolute()), self.subpath)
+
+
+def _is_under(folder_abs: str, target: str | Path) -> bool:
+    """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
+    try:
+        Path(target).absolute().relative_to(Path(folder_abs).absolute())
+    except ValueError:
+        return False
+
+    return True
+
+
+def phash_params_for_strength(strength: int) -> tuple[int, int]:
+    """
+    Convert a 0-10 strength to settings for imagehash
+
+    Returns:
+        tuple[int, int]: The hash size (in bytes) and the high frequency factor
+    """
+    # TODO: This sucks.
+    strength = max(0, min(10, strength))
+    if strength >= 10:
+        return 16, 4
+    if strength >= 8:
+        return 15, 4
+    if strength >= 7:
+        return 13, 4
+    if strength >= 6:
+        return 11, 4
+    if strength >= 5:
+        return 9, 4
+    if strength >= 4:
+        return 8, 4
+    if strength >= 3:
+        return 8, 3
+    if strength >= 2:
+        return 7, 3
+    return 6, 3
+
+
+def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
+    """
+    Calculate hashes for a given file.
+
+    Args:
+        f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
+        strength (int): A number between 0 and 10 on the strength of the matches.
+        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
+        exact_match (bool): Use exact SHA256 hashes?
+            If true, strength must be 10.
+            If false, perceptual hashes will be used, even with high strength.
+
+    Returns:
+        tuple[str, set]: The first element is the primary hash,
+            the second element is a set of secondary hashes representing rotations, flips, etc...
+    """
+    if exact_match:
+        hasher = hashlib.sha256()
+        block_size = 65536
+        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:  # noqa: PTH123
+            for block in iter(lambda: file.read(block_size), b""):
+                hasher.update(block)
+        return hasher.hexdigest(), set()
+
+    hash_size, highfreq_factor = phash_params_for_strength(strength)
+    with Image.open(f) as im:
+        if is_gif:
+            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+            # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
+            # because some gifs have bad first frames consisting of nothing or only a single color...
+            # To deal with that I'm looking for these bad hashes here and if it's one, we advance to the next frame
+            # and use THAT for imagehash instead.
+            # The ones we need to be on the lookout for are:
+            # 1. The hash is all 1111...
+            # 2. The hash is all 0000...
+            # 3. The hash is of the form 100000...
+            # TODO: This is simply not good enough. I'm still getting bad matches for gifs, though they are extremely rare
+            val = initial_hash.hash[0][0]
+            while all(all(x == val for x in r) for r in initial_hash.hash) \
+                    or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
+                           for r_i, r in enumerate(initial_hash.hash)):
+                try:
+                    im.seek(im.tell() + 1)
+                except EOFError:  # noqa: PERF203
+                    break
+                else:
+                    initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+                    val = initial_hash.hash[0][0]
+
+            # For GIFs we'll look for mirrored versions but that's it
+            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+            extras = (flipped_h_image, )
+        else:
+            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+
+            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+            flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+            extras = (im.rotate(90), im.rotate(180), im.rotate(270),
+                      flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
+                      flipped_h_image.rotate(270),
+                      flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
+                      flipped_v_image.rotate(270))
+
+    return str(initial_hash), {
+        str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
+    }
+
+
+def thread_error_handler(func):
+    """An error handler for the thread to return information about where the error occurred"""
+
+    @wraps(func)
+    def wrapper(path, *args, **kwargs):  # noqa: ANN202
+        try:
+            return func(path, *args, **kwargs)
+        except Exception as e:
+            e.input_path = path
+            raise
+
+    return wrapper
+
+
+@thread_error_handler
+def _process_image(
+    path: str | Path,
+    supported_extensions: set | None = None,
+    strength: int = 5,
+    *,
+    exact_match: bool = False,
+) -> tuple[Path, tuple | dict[str, tuple]]:
+    """Get the hashes for a given path. Multiprocessing compatible."""
+    path = Path(path)
+    if path.suffix.lower() != '.zip':
+        return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
+                                      strength=strength, exact_match=exact_match)
+
+    if not supported_extensions:
+        supported_extensions = ImageMatcher.SUPPORTED_EXTS
+
+    results = {}
+    with ZipFile(path) as zf:
+        for f in zf.filelist:
+            f_ext = f.filename[-4:].lower()
+            if f_ext not in supported_extensions:
+                continue
+
+            if f_ext == '.zip':
+                logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
+                continue
+
+            try:
+                with zf.open(f) as zipped_file:
+                    results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in {".gif", ".webp"},
+                                                           strength=strength, exact_match=exact_match)
+            except BadZipFile as e:
+                logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
+            except UnidentifiedImageError:
+                logger.warning("Could not identify image %s in %s", f.filename, path)
+
+    return path, results
+
+
+@dataclass
+class ImageMatch:
+    """A match data structure containing the matches and where this match lies in the match list"""
+    match_i: int | None = field(default=None)
+    matches: list[ZipPath] = field(default_factory=list)
+
+
+# region Events
+@dataclass(frozen=True)
+class NewGroup:
+    """A new group event"""
+    group: "ImageMatch"
+
+
+@dataclass(frozen=True)
+class NewMatch:
+    """A new match event"""
+    group: "ImageMatch"
+    path: ZipPath
+
+
+@dataclass(frozen=True)
+class Finished:
+    """A finished event"""
+
+
+MatcherEvent = Union[NewGroup, NewMatch, Finished]
+# endregion
+
+
+class ImageMatcher:
+    """
+    An image matching SDK
+
+    Args:
+        strength (int): The 0-10 strength to use for matching. Defaults to 5.
+        exact_match (bool): Should SHA-256 hashes be used? If False, the default, perceptual hashes will be used.
+            If True, strength must be 10.
+        processes (int): The number of processes to use. Defaults to None.
+        extensions (set): The extensions to process. Optional.
+    """
+    SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
+
+    def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
+                 *, exact_match: bool = False):
+        if not (0 <= strength <= 10):
+            raise ValueError("Strength must be between 0 and 10!")
+
+        self.extensions = extensions or self.SUPPORTED_EXTS
+
+        self.strength = strength
+        self.exact_match = exact_match
+        self.processes = processes
+
+        self.found_images = 0
+        self.processed_images = 0
+        self.duplicate_images = 0
+        self.matches = []
+
+        m = Manager()
+        self.events = m.Queue()  # Events to go to higher level users
+        self._new_paths = m.Queue()  # Inbound queue for new paths that are added while processing is running
+        self._removed_paths = set()  # Paths that have been removed from processing after processing has been started
+        self._ignored_files = set()  # Files which have been ignored and should be skipped from processing if re-ran
+        self._processed_zips = {}  # Zips that have been successfully processed
+        self._hashes = defaultdict(ImageMatch)  # Hash -> Paths
+        self._reverse_hashes = {}  # Path -> Hash
+
+        # Pausing and finished signaling...
+        self._not_paused = Event()
+        self._not_paused.set()
+        self._finished = Event()
+        self._finished.set()
+
+    @property
+    def left_to_process(self):
+        """Files that are left to process"""
+        return self.found_images - self.processed_images
+
+    def add_path(self, path: str | Path):
+        """Add a path for processing"""
+        path = str(Path(path).absolute())
+        self._removed_paths.discard(path)
+        self._new_paths.put(path)
+
+    def remove_path(self, folder: str | Path):
+        """
+        Mark a folder to be skipped going forward, and remove already-indexed files
+        that live under it. Pauses briefly if not already paused to keep state sane.
+        """
+        # TODO: This works but the biggest problem with it is that it will not remove any images which are still
+        # queued up for processing in the ThreadPool... I'm not sure how to fix that yet.
+        folder = str(Path(folder).absolute())
+        paused = self.conditional_pause()
+        self._removed_paths.add(folder)
+
+        # Remove anything we've already seen under that folder
+        # (iterate over a copy because remove() mutates structures)
+        to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
+        for p in to_remove:
+            self.remove(p)
+
+        to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
+        for p in to_remove_zips:
+            self._processed_zips.pop(p)
+
+        self.conditional_resume(paused)
+
+    def conditional_pause(self) -> bool:
+        """Pause if not paused, and return whether we were already paused"""
+        _conditional_pause = self.is_paused()
+        if not _conditional_pause:
+            logger.debug('Performing conditional pause')
+            self.pause()
+
+        return _conditional_pause
+
+    def conditional_resume(self, was_paused: bool):  # noqa: FBT001
+        """Resume if not previously paused (from a call to `conditional_pause`)"""
+        if not was_paused and not self.is_finished():
+            logger.debug('Performing conditional resume')
+            self.resume()
+
+    def pause(self):
+        """Pause processing"""
+        logger.debug('Performing pause')
+        self._not_paused.clear()
+
+    def is_paused(self):
+        """Is processing paused"""
+        return not self._not_paused.is_set()
+
+    def finish(self):
+        """Finish processing"""
+        logger.debug('Performing finished')
+        self._finished.set()
+
+    def is_finished(self):
+        """Is processing finished"""
+        return self._finished.is_set()
+
+    def resume(self):
+        """Resume processing"""
+        logger.debug('Performing resume')
+        self._not_paused.set()
+
+    def running(self):
+        """Currently running and loading hashes?"""
+        return not self.is_paused() and (not self.is_finished() or self.left_to_process)
+
+    def remove(self, path):
+        """Remove a loaded path completely from the image matching system. Will not delete a file."""
+        # Pause things while we remove things...
+        logger.info('Removing %s from %s', path, self.__class__.__name__)
+        paused = self.conditional_pause()
+
+        hash_ = self._reverse_hashes.pop(path)
+        self._hashes[hash_].matches.remove(path)
+        if len(self._hashes[hash_].matches) == 1:
+            match_i = self._hashes[hash_].match_i
+            logger.debug('Unmatching match group %s', match_i)
+            self._hashes[hash_].match_i = None
+
+            del self.matches[match_i]
+            self.refresh_match_indexes(match_i)
+            self.duplicate_images -= 2
+
+        elif not self._hashes[hash_].matches:
+            logger.debug('Removing empty match group')
+            del self._hashes[hash_]
+
+        else:
+            logger.debug('Simple removal performed')
+            self.duplicate_images -= 1
+
+        self.processed_images -= 1
+        self.found_images -= 1
+        self.conditional_resume(paused)
+
+    def ignore(self, path):
+        """Remove a path from the image matching service"""
+        self.remove(path)
+
+        if path.path_obj.suffix.lower() != '.zip':
+            self._ignored_files.add(path.path)
+
+    def refresh_match_indexes(self, start=0):
+        """Update the match_i value for all the matches past a certain point"""
+        for match_i, match in enumerate(self.matches[start:], start=start):
+            match.match_i = match_i
+
+    def _process_image_callback(self, result):
+        """
+        Handle the result of hashing an image.
+
+        This needs to do quite a few things, including sanitizing the results,
+        actually checking if the hash matches an existing image,
+        adding the image and any matches to the backend data structures, notifying any listeners,
+        updating the found and processed image counts,
+        and verifying that this result's path wasn't removed after it was queued.
+
+        Args:
+            result: A tuple consisting of the path to the file, and the resultant hashes.
+                If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
+                the individual zip files will be sanitized and re-run through this callback.
+        """
+        # TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
+        # Perhaps create a new queue/thread and queue up processing for zip results?
+        # I think the major slow point is adding to the data structures and I'm not sure if more threads will help
+        # Check for paused or finished signals
+        self._not_paused.wait()
+        if self.is_finished():
+            return
+
+        # region Sanitize results
+        path: Path | str | ZipPath
+        path, hashes = result
+
+        if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
+            # This image was removed AFTER it was queued! So decrement the found images count and just leave...
+            self.found_images -= 1
+            return
+
+        if isinstance(hashes, dict):
+            self.found_images -= 1
+            subpaths = []
+            for sub_path, sub_hashes in hashes.items():
+                self.found_images += 1
+                subpaths.append(ZipPath(str(path), sub_path))
+                self._process_image_callback((subpaths[-1], sub_hashes))
+            self._processed_zips[str(path)] = subpaths
+            return
+
+        initial_hash, extra_hashes = hashes
+        extra_hashes.add(initial_hash)
+        if not isinstance(path, ZipPath):
+            # From this point on, EVERYTHING should be a ZipPath
+            path = ZipPath(str(path), "")
+        # endregion
+
+        if path in self._reverse_hashes:
+            self.found_images -= 1
+            return
+
+        self.processed_images += 1
+
+        # From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
+        # iterate over a list and do an `is in` check for each hash
+        found_hashes = self._hashes.keys() & extra_hashes
+        if not found_hashes:
+            # This is a new image not matching any previous, so just add it to the hashmap and move on...
+            # Just use the initial orientation
+            hash_ = initial_hash
+            self._reverse_hashes[path] = hash_
+            self._hashes[hash_].matches.append(path)
+            return
+
+        # We have found a match!
+        hash_ = next(iter(found_hashes))
+        self._reverse_hashes[path] = hash_
+        self._hashes[hash_].matches.append(path)
+
+        if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
+            # This is a brand new match group!
+            self._hashes[hash_].match_i = len(self.matches)
+            self.matches.append(self._hashes[hash_])
+            self.duplicate_images += 2
+            self.events.put(NewGroup(self._hashes[hash_]))
+            logger.debug('New match group found: %s', self._hashes[hash_].matches)
+        else:
+            # Just another match for an existing group...
+            self.duplicate_images += 1
+            self.events.put(NewMatch(self._hashes[hash_], path))
+            logger.debug('New match found for group #%s: %s',
+                         self._hashes[hash_].match_i,
+                         self._hashes[hash_].matches)
+
+    def _process_image_error_callback(self, e):
+        """Temporary for testing"""
+        self.processed_images += 1
+        logger.error("%s: %s (input path %s)", type(e), e, e.input_path)
+
+    def _root_stream(self):
+        """Yield any paths queued for processing, then wait until processing is finished in case new paths arrive"""
+        while not self._new_paths.empty() or self.left_to_process:
+            if self._new_paths.empty():
+                time.sleep(0.05)
+                continue
+
+            yield self._new_paths.get_nowait()
+
+    def run(self, paths: list[str | Path]):
+        """Do the work of matching!"""
+        self._not_paused.set()
+        self._finished.clear()
+
+        for path in paths:
+            self.add_path(path)
+
+        with Pool(self.processes) as tp:
+            for path in self._root_stream():
+                path = Path(path)
+                if not path.is_dir():
+                    logger.warning('A path was entered that was not a directory: %s', path)
+                    continue
+
+                path = str(path.absolute())
+                if path in self._removed_paths:
+                    continue
+
+                for root, dirs, files in os.walk(path):
+                    if self.is_finished():
+                        break
+
+                    dirs.sort()  # This actually works to ensure that os.walk goes in alphabetical order!
+                    root = Path(root)
+
+                    if any(_is_under(d, root) for d in self._removed_paths):
+                        continue
+
+                    for f in files:
+                        self._not_paused.wait()
+                        if self.is_finished():
+                            break
+
+                        f = root / f
+
+                        if f.suffix.lower() not in self.extensions:
+                            continue
+
+                        if any(_is_under(d, f) for d in self._removed_paths):
+                            continue
+
+                        if str(f) in self._ignored_files:
+                            continue
+
+                        if f.suffix.lower() == '.zip':
+                            if str(f.absolute()) in self._processed_zips:
+                                continue
+                        elif ZipPath(str(f), "") in self._reverse_hashes:
+                            continue
+
+                        self.found_images += 1
+                        tp.apply_async(
+                            _process_image,
+                            args=(f, ),
+                            kwds={
+                                'strength': self.strength,
+                                'supported_extensions': self.extensions,
+                                'exact_match': self.exact_match,
+                            },
+                            callback=self._process_image_callback,
+                            error_callback=self._process_image_error_callback,
+                        )
+
+            tp.close()
+
+            if not self.is_finished():
+                tp.join()
+
+            if not self.is_finished():
+                self._finished.set()
+                self.events.put(Finished())
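
For orientation, the rewritten module is an event-driven SDK: `ImageMatcher.run()` walks the given folders, hashes files in a multiprocessing pool, and pushes `NewGroup`, `NewMatch`, and `Finished` events onto `matcher.events`. A minimal consumer sketch against the API shown above (the worker thread and the "./photos" folder are illustrative, not part of the package):

    from threading import Thread

    from pixmatch import Finished, ImageMatcher, NewGroup, NewMatch

    matcher = ImageMatcher(strength=5)
    # run() loops until walking and hashing finish, so drive it from a thread
    Thread(target=matcher.run, args=(["./photos"],), daemon=True).start()

    while True:
        event = matcher.events.get()  # a Manager().Queue(), usable across processes
        if isinstance(event, NewGroup):
            print("new duplicate group:", [p.path for p in event.group.matches])
        elif isinstance(event, NewMatch):
            print("another duplicate:", event.path.path)
        elif isinstance(event, Finished):
            break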

{pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch/__main__.py
@@ -8,10 +8,9 @@ from PySide6 import QtWidgets

 from pixmatch.gui import MainWindow

-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Process zero or more file paths."
+        description="Process zero or more file paths.",
     )
     parser.add_argument(
         "folders",
@@ -41,7 +40,7 @@ if __name__ == "__main__":
         QToolBar { spacing: 8px; }
         QLabel#GroupTitle { padding: 4px 0; }
         QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
-        """
+        """,
     )
     w = MainWindow(args.folders)
     w.show()
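
The only functional change above is the added trailing commas; invocation is unchanged. For reference, the GUI entry point is still launched through the module, e.g. (folder names illustrative):

    python -m pixmatch ./photos ./downloads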

{pixmatch-0.0.3 → pixmatch-0.0.5}/pixmatch/utils.py
@@ -1,12 +1,11 @@
 from typing import Iterable

-
 def human_bytes(
     n: int,
     *,
     base: int = 1000,
     decimals: int = 0,
-    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb")
+    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
 ) -> str:
     """
     Convert a byte count to a human-readable string.
@@ -19,6 +18,9 @@ def human_bytes(

     Returns:
         A compact string like '66kb', '1mb', '1.5gb', or '999b'.
+
+    Raises:
+        ValueError: If n < 0
     """
     if n < 0:
         raise ValueError("Byte size cannot be negative")
@@ -31,6 +33,6 @@ def human_bytes(

     if i == 0 or decimals == 0:
         # Bytes or integer formatting requested
-        return f"{int(n
+        return f"{int(n)}{tuple(units)[i]}"

     return f"{n:.{decimals}f}{tuple(units)[i]}".rstrip("0").rstrip(".")
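
The docstring examples map directly onto calls, including the integer path repaired above; a quick sketch (values illustrative):

    from pixmatch.utils import human_bytes

    print(human_bytes(999))                        # '999b'  (i == 0, integer path)
    print(human_bytes(66_000))                     # '66kb'  (default base=1000, decimals=0)
    print(human_bytes(1_500_000_000, decimals=1))  # '1.5gb' (fractional path)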

{pixmatch-0.0.3 → pixmatch-0.0.5}/pyproject.toml
@@ -5,7 +5,7 @@ requires = [

 [project]
 name = "pixmatch"
-version = "0.0.
+version = "0.0.5"
 #dynamic = ["license"]
 authors = [
    { name="Ryan Heard", email="ryanwheard@gmail.com" },
@@ -87,16 +87,23 @@ extend-select = [
     "YTT",  # flake8-2020
 ]
 ignore=[
+    "ANN001",  # Missing type annotation for function argument X
+    "ANN002",  # Missing type annotation for *arg
+    "ANN003",  # Missing type annotation for **arg
     "ANN201",  # Missing return type
     "D100",  # Missing docstring in public module
     "D104",  # Missing docstring in public package
     "D105",  # Missing docstring in magic method
+    "D107",  # Missing docstring in __init__ method
     "D202",  # Blank line after function docstring
+    "D205",  # 1 blank line required between summary line and description
     "D212",  # Multi-line docstring summary should start at the first line
     "D415",  # Closing punctuation on docstrings
     "PT013",  # Incorrect import of pytest
     "RUF023",  # __slots__ is not sorted
+    "RUF052",  # Local dummy variable X is accessed
     "TRY003",  # Avoid specifying long messages outside the exception class
+    "UP035",  # Import from `collections.abc` instead: `Iterable`
 ]

 [tool.ruff.lint.pydocstyle]

pixmatch-0.0.3/pixmatch/__init__.py
@@ -1,444 +0,0 @@
-import hashlib
-import logging
-import os
-import time
-
-from collections import defaultdict
-from dataclasses import dataclass, field
-from multiprocessing import Pool, Manager
-from pathlib import Path
-from threading import Event
-from typing import Union
-from zipfile import ZipFile
-
-import imagehash
-import numpy as np
-
-from PIL import Image
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass(frozen=True)
-class ZipPath:
-    path: str
-    subpath: str
-
-    @property
-    def path_obj(self):
-        return Path(self.path)
-
-    @property
-    def is_gif(self) -> bool:
-        movie_extensions = {'.gif', '.webp'}
-        return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
-            or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
-
-    def absolute(self):
-        return ZipPath(str(self.path_obj.absolute()), self.subpath)
-
-
-def _is_under(folder_abs: str, target: str | Path) -> bool:
-    """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
-    try:
-        Path(target).absolute().relative_to(Path(folder_abs).absolute())
-        return True
-    except ValueError:
-        return False
-
-
-def phash_params_for_strength(strength: int) -> tuple[int, int]:
-    # TODO: This sucks.
-    strength = max(0, min(10, strength))
-    if strength >= 10:
-        return 16, 4  # 256-bit hash, strict
-    elif strength >= 8:
-        return 15, 4
-    elif strength >= 7:
-        return 13, 4
-    elif strength >= 6:
-        return 11, 4
-    elif strength >= 5:
-        return 9, 4
-    elif strength >= 4:
-        return 8, 4
-    elif strength >= 3:
-        return 8, 3
-    elif strength >= 2:
-        return 7, 3
-    else:
-        return 6, 3
-
-
-def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
-    """
-    Calculate hashes for a given file.
-
-    Args:
-        f (IO or str or Path): Either a file path to process, or a in-memory BytesIO object ready for reading.
-        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
-        strength (int): A number between 0 and 10 on the strength of the matches.
-        exact_match (bool): Use exact SHA256 hahes?
-            If true, strength must be 10.
-            If false, perceptual hashes will be used, even with high strength.
-
-    Returns:
-        list: The found hashes.
-    """
-    if exact_match:
-        hasher = hashlib.sha256()
-        block_size = 65536
-        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
-            for block in iter(lambda: file.read(block_size), b""):
-                hasher.update(block)
-        return [hasher.hexdigest()]
-
-    hash_size, highfreq_factor = phash_params_for_strength(strength)
-    with (Image.open(f) as im):
-        if is_gif:
-            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
-            # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
-            # because some gifs have bad first frames consisting of nothing or only a single color...
-            # To deal with that I'm looking for these bad hashes here and if its one, we advance to the next frame
-            # and use THAT for imagehash instead.
-            # The ones we need to be on the lookout for are:
-            # 1. The hash is all 1111...
-            # 2. The hash is all 0000...
-            # 3. The hash is of the form 100000...
-            # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
-            val = initial_hash.hash[0][0]
-            while all(all(x == val for x in r) for r in initial_hash.hash) \
-                    or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
-                           for r_i, r in enumerate(initial_hash.hash)):
-                try:
-                    im.seek(im.tell() + 1)
-                except EOFError:
-                    break
-                else:
-                    initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
-                    val = initial_hash.hash[0][0]
-
-            # For GIFs we'll look for mirrored versions but thats it
-            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-            return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]
-
-        flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-        flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-        images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
-                  flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
-                  flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
-        return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
-
-
-def _process_image(path: str | Path, strength=5, exact_match=False):
-    path = Path(path)
-    if path.suffix.lower() != '.zip':
-        return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
-                                      strength=strength, exact_match=exact_match)
-
-    results = dict()
-    with ZipFile(path) as zf:
-        for f in zf.filelist:
-            with zf.open(f) as zipped_file:
-                results[f.filename] = calculate_hashes(zipped_file, is_gif=f.filename[-4:].lower() in {".gif", ".webp"},
-                                                       strength=strength, exact_match=exact_match)
-
-    return path, results
-
-
-@dataclass
-class ImageMatch:
-    match_i: int | None = field(default=None)
-    matches: list[ZipPath] = field(default_factory=list)
-
-
-@dataclass(frozen=True)
-class NewGroup:
-    group: "ImageMatch"  # forward-ref to your class
-
-
-@dataclass(frozen=True)
-class NewMatch:
-    group: "ImageMatch"
-    path: ZipPath
-
-
-@dataclass(frozen=True)
-class Finished:
-    pass
-
-
-MatcherEvent = Union[NewGroup, NewMatch, Finished]
-
-
-# TODO: FINISHED signal?
-class ImageMatcher:
-    SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
-
-    def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
-                 extensions: set | None = None):
-        if not (0 <= strength <= 10):
-            raise ValueError("Strength must be between 0 and 10!")
-
-        self.extensions = extensions or self.SUPPORTED_EXTS
-
-        self.strength = strength
-        self.exact_match = exact_match
-        self.processes = processes
-        self.found_images = 0
-        self.processed_images = 0
-        self.duplicate_images = 0
-
-        m = Manager()
-        self.events = m.Queue()
-        self._new_paths = m.Queue()
-        self._removed_paths = set()
-        self._processed_paths = set()
-        self._hashes = defaultdict(ImageMatch)
-        self._reverse_hashes = dict()
-
-        self._not_paused = Event()
-        self._not_paused.set()
-        self._finished = Event()
-        self._finished.set()
-
-        self.matches = []
-
-    def add_path(self, path: str | Path):
-        path = str(Path(path).absolute())
-        self._removed_paths.discard(path)
-        self._new_paths.put(path)
-
-    def remove_path(self, folder: str | Path) -> None:
-        """
-        Mark a folder to be skipped going forward, and remove already-indexed files
-        that live under it. Pauses briefly if not already paused to keep state sane.
-        """
-        folder = str(Path(folder).absolute())
-        paused = self.conditional_pause()
-        self._removed_paths.add(folder)
-        self._processed_paths.discard(folder)
-
-        # Remove anything we've already seen under that folder
-        # (iterate over a copy because remove() mutates structures)
-        to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
-        for p in to_remove:
-            self.remove(p)
-
-        self.conditional_resume(paused)
-
-    @property
-    def left_to_process(self):
-        return self.found_images - self.processed_images
-
-    def pause(self):
-        logger.debug('Performing pause')
-        self._not_paused.clear()
-
-    def conditional_pause(self):
-        _conditional_pause = self.is_paused()
-        if not _conditional_pause:
-            logger.debug('Performing conditional pause')
-            self.pause()
-
-        return _conditional_pause
-
-    def conditional_resume(self, was_paused):
-        if not was_paused and not self.is_finished():
-            logger.debug('Performing conditional resume')
-            self.resume()
-
-    def is_paused(self):
-        return not self._not_paused.is_set()
-
-    def finish(self):
-        logger.debug('Performing finished')
-        self._finished.set()
-
-    def is_finished(self):
-        return self._finished.is_set()
-
-    def resume(self):
-        logger.debug('Performing resume')
-        self._not_paused.set()
-
-    def running(self):
-        return not self.is_paused() and (not self.is_finished() or self.left_to_process)
-
-    def remove(self, path):
-        # Pause things while we remove things...
-        logger.info('Removing %s from %s', path, self.__class__.__name__)
-        paused = self.conditional_pause()
-
-        hash = self._reverse_hashes.pop(path)
-        self._hashes[hash].matches.remove(path)
-        if len(self._hashes[hash].matches) == 1:
-            match_i = self._hashes[hash].match_i
-            logger.debug('Unmatching match group %s', match_i)
-            self._hashes[hash].match_i = None
-
-            del self.matches[match_i]
-            self.refresh_match_indexes(match_i)
-            self.duplicate_images -= 2
-
-        elif not self._hashes[hash].matches:
-            logger.debug('Removing empty match group')
-            del self._hashes[hash]
-
-        else:
-            logger.debug('Simple removal performed')
-            self.duplicate_images -= 1
-
-        self.processed_images -= 1
-        self.found_images -= 1
-        self.conditional_resume(paused)
-
-    def refresh_match_indexes(self, start=0):
-        for match_i, match in enumerate(self.matches[start:], start=start):
-            match.match_i = match_i
-
-    def _process_image_callback(self, result):
-        self._not_paused.wait()
-        if self.is_finished():
-            return
-
-        path: Path | str | ZipPath
-        path, hashes = result
-
-        if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
-            self.found_images -= 1
-            return
-
-        if isinstance(hashes, dict):
-            for sub_path, sub_hashes in hashes.items():
-                self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
-            return
-
-        if not isinstance(path, ZipPath):
-            path = ZipPath(str(path), "")
-
-        if path in self._reverse_hashes:
-            self.found_images -= 1
-            return
-
-        self.processed_images += 1
-        for hash_ in hashes:
-            if hash_ not in self._hashes:
-                continue
-
-            self._reverse_hashes[path] = hash_
-
-            # This appears to be a new match!
-            for match in self._hashes[hash_].matches:
-                if path.absolute() == match.absolute():
-                    # This appears to be a duplicate PATH...
-                    logger.warning('Duplicate files entered! %s, %s', path, match)
-                    return
-
-            self._hashes[hash_].matches.append(path)
-            if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
-                # This is a brand new match group!
-                self._hashes[hash_].match_i = len(self.matches)
-                self.matches.append(self._hashes[hash_])
-                self.duplicate_images += 2
-                self.events.put(NewGroup(self._hashes[hash_]))
-                logger.debug('New match group found: %s', self._hashes[hash_].matches)
-            else:
-                # Just another match for an existing group...
-                self.duplicate_images += 1
-                self.events.put(NewMatch(self._hashes[hash_], path))
-                logger.debug('New match found for group #%s: %s',
-                             self._hashes[hash_].match_i,
-                             self._hashes[hash_].matches)
-
-            break
-        else:
-            # This is a new hash, so just add it to the hashmap and move on...
-            # Just use the initial orientation
-            hash_ = hashes[0]
-            self._reverse_hashes[path] = hash_
-            self._hashes[hash_].matches.append(path)
-            return
-
-    def _process_image_error_callback(self, e):
-        self.processed_images += 1
-        print(str(e))
-
-    def _root_stream(self):
-        # Yield any paths that come up for processing, then wait until processing is finished for any new paths
-        while not self._new_paths.empty() or self.left_to_process:
-            if self._new_paths.empty():
-                time.sleep(0.05)
-                continue
-
-            yield self._new_paths.get_nowait()
-
-    def run(self, paths: list[str | Path]):
-        # TODO: Verify none of the paths overlap
-        # TODO: Verify none of the dirs have been deleted after we started
-
-        self._not_paused.set()
-        self._finished.clear()
-
-        for path in paths:
-            self.add_path(path)
-
-        with Pool(self.processes) as tp:
-            for path in self._root_stream():
-                path = Path(path)
-                if not path.is_dir():
-                    logger.warning('A path was entered that was not a directory : %s', path)
-                    continue
-
-                path = str(path.absolute())
-                if path in self._removed_paths or path in self._processed_paths:
-                    continue
-
-                for root, dirs, files in os.walk(path):
-                    if self.is_finished():
-                        break
-
-                    root = Path(root)
-
-                    if any(_is_under(d, root) for d in self._removed_paths):
-                        continue
-
-                    for f in files:
-                        self._not_paused.wait()
-                        if self.is_finished():
-                            break
-
-                        f = root / f
-
-                        if f.suffix.lower() not in self.extensions:
-                            continue
-
-                        if any(_is_under(d, f) for d in self._removed_paths):
-                            continue
-
-                        # TODO: This sucks (for zips at least), but I can't iterate over the dict while its changing...
-                        if ZipPath(str(f), "") in self._reverse_hashes:
-                            continue
-
-                        self.found_images += 1
-                        tp.apply_async(
-                            _process_image,
-                            args=(f, ),
-                            kwds={
-                                'strength': self.strength,
-                                'exact_match': self.exact_match,
-                            },
-                            callback=self._process_image_callback,
-                            error_callback=self._process_image_error_callback,
-                        )
-
-                self._processed_paths.add(path)
-
-            tp.close()
-
-            if not self.is_finished():
-                tp.join()
-
-            if not self.is_finished():
-                self._finished.set()
-                self.events.put(Finished())