pixmatch 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixmatch/__init__.py +585 -0
- pixmatch/__main__.py +47 -0
- pixmatch/utils.py +38 -0
- pixmatch-0.0.6.dist-info/METADATA +93 -0
- pixmatch-0.0.6.dist-info/RECORD +8 -0
- pixmatch-0.0.6.dist-info/WHEEL +5 -0
- pixmatch-0.0.6.dist-info/licenses/LICENSE +19 -0
- pixmatch-0.0.6.dist-info/top_level.txt +1 -0
pixmatch/__init__.py
ADDED
|
@@ -0,0 +1,585 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from functools import wraps
|
|
9
|
+
from multiprocessing import Manager, Pool
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from threading import Event
|
|
12
|
+
from typing import ClassVar, Union
|
|
13
|
+
from zipfile import BadZipFile, ZipFile
|
|
14
|
+
|
|
15
|
+
import imagehash
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from PIL import Image, ImageFile, UnidentifiedImageError
|
|
19
|
+
|
|
20
|
+
ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow damaged images
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class ZipPath:
    """
    A general object describing a Path.

    All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.

    Attributes:
        path (str): The path to the file.
        subpath (str): The subpath in the zip if `path` is for a zip.
    """
    # TODO: At some point convert this to Path.
    #   When I tried that last it introduced problems with inter-process communication
    path: str
    subpath: str

    @property
    def path_obj(self) -> Path:
        """Get the path as a Path object"""
        return Path(self.path)

    @property
    def is_gif(self) -> bool:
        """Is this a path to an animated image (judged by its extension only)?"""
        movie_extensions = {'.gif', '.webp'}
        # The image is identified by the member name when inside a zip, otherwise by the file itself.
        # Use the real suffix rather than the last four characters: a fixed-width slice
        # can never match five-character extensions such as '.webp'.
        name = self.subpath or self.path
        return Path(name).suffix.lower() in movie_extensions

    @property
    def is_zip(self) -> bool:
        """Does this point to a file located in a zip?"""
        return bool(self.subpath)

    def absolute(self) -> "ZipPath":
        """Get the absolute version of this ZipPath (the subpath is kept unchanged)"""
        return ZipPath(str(self.path_obj.absolute()), self.subpath)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _is_under(folder_abs: str, target: str | Path) -> bool:
|
|
64
|
+
"""Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
|
|
65
|
+
try:
|
|
66
|
+
Path(target).absolute().relative_to(Path(folder_abs).absolute())
|
|
67
|
+
except ValueError:
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
return True
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def phash_params_for_strength(strength: int) -> tuple[int, int]:
    """
    Map a 0-10 strength onto a pair of perceptual-hash settings.

    Values outside 0-10 are clamped into that range first.

    Returns:
        tuple<int, int>: The hash size and the high frequency factor
    """
    # TODO: This sucks.
    level = max(0, min(10, strength))

    # (minimum strength, (hash size, high frequency factor)), strongest tier first.
    tiers = (
        (10, (16, 4)),
        (8, (15, 4)),
        (7, (13, 4)),
        (6, (11, 4)),
        (5, (9, 4)),
        (4, (8, 4)),
        (3, (8, 3)),
        (2, (7, 3)),
    )
    for minimum, params in tiers:
        if level >= minimum:
            return params

    # Strengths 0 and 1 share the weakest settings.
    return 6, 3
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
    """
    Calculate hashes for a given file.

    Args:
        f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
        strength (int): A number between 0 and 10 on the strength of the matches.
        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
        exact_match (bool): Use exact SHA256 hashes?
            If true, strength must be 10.
            NOTE(review): this function does not check `strength` when `exact_match` is set - confirm callers do.
            If false, perceptual hashes will be used, even with high strength.

    Returns:
        tuple[str, set]: The first element is the primary hash,
            the second element are any secondary hashes representing rotations, flips, etc...
            For exact matches the set of secondary hashes is always empty.
    """
    if exact_match:
        hasher = hashlib.sha256()
        block_size = 65536
        # A str/Path is opened here; anything else is used as an already-open file object.
        # Either way the `with` closes it once hashing is done.
        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:  # noqa: PTH123
            # Read fixed-size blocks until read() returns an empty bytes object
            for block in iter(lambda: file.read(block_size), b""):
                hasher.update(block)
        return hasher.hexdigest(), set()

    hash_size, highfreq_factor = phash_params_for_strength(strength)
    with Image.open(f) as im:
        if is_gif:
            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
            # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
            #   because some gifs have bad first frames consisting of nothing or only a single color...
            # To deal with that I'm looking for these bad hashes here and if its one, we advance to the next frame
            #   and use THAT for imagehash instead.
            # The ones we need to be on the lookout for are:
            #   1. The hash is all 1111...
            #   2. The hash is all 0000...
            #   3. The hash is of the form 100000...
            # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
            val = initial_hash.hash[0][0]
            # First clause: every cell equals the top-left cell (cases 1 and 2).
            # Second clause: every cell is False except possibly the top-left cell (case 3).
            while all(all(x == val for x in r) for r in initial_hash.hash) \
                    or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
                           for r_i, r in enumerate(initial_hash.hash)):
                try:
                    im.seek(im.tell() + 1)
                except EOFError:  # noqa: PERF203
                    # Ran out of frames: keep the hash of the last frame we managed to read
                    break
                else:
                    initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
                    val = initial_hash.hash[0][0]

            # For GIFs we'll look for mirrored versions but thats it
            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            extras = (flipped_h_image, )
        else:
            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)

            # Still images: the three rotations of the original, plus both mirror images
            #   and the three rotations of each mirror image.
            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
            extras = (im.rotate(90), im.rotate(180), im.rotate(270),
                      flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
                      flipped_h_image.rotate(270),
                      flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
                      flipped_v_image.rotate(270))

        # The set comprehension collapses any orientations that happen to hash identically
        return str(initial_hash), {
            str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
        }
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def thread_error_handler(func):
    """
    Decorate `func` so that any exception it raises records which input caused it.

    The first positional argument of the call is stored on the exception as `input_path`
    before the exception is re-raised unchanged.
    """

    @wraps(func)
    def tagged_call(path, *args, **kwargs):  # noqa: ANN202
        try:
            outcome = func(path, *args, **kwargs)
        except Exception as exc:
            # Attach the offending input, then let the original exception propagate
            exc.input_path = path
            raise
        else:
            return outcome

    return tagged_call
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@thread_error_handler
def _process_image(
    path: str | Path,
    supported_extensions: set | None = None,
    strength: int = 5,
    *,
    exact_match: bool = False,
) -> tuple[Path, tuple | dict[str, tuple]]:
    """
    Get the hashes for a given path. Is multiprocessing compatible

    Args:
        path: The file to hash. A `.zip` file has each supported member hashed instead.
        supported_extensions: Lower-case extensions (with leading dot) to accept for zip members.
            Falls back to `ImageMatcher.SUPPORTED_EXTS` when empty or None.
        strength: The 0-10 matching strength forwarded to `calculate_hashes`.
        exact_match: Forwarded to `calculate_hashes`.

    Returns:
        A tuple of the path (as a Path) and either the `calculate_hashes` result for a plain file,
        or, for a zip, a dict mapping each hashed member name to its `calculate_hashes` result.
        Zip members which could not be read or identified are logged and left out of the dict.
    """
    path = Path(path)
    if path.suffix.lower() != '.zip':
        return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
                                      strength=strength, exact_match=exact_match)

    if not supported_extensions:
        supported_extensions = ImageMatcher.SUPPORTED_EXTS

    results = {}
    with ZipFile(path) as zf:
        for f in zf.filelist:
            # Take the real suffix of the member name. Slicing the last four characters would
            #   never match five-character extensions ('.jpeg', '.webp', '.tiff') and skip those files.
            f_ext = Path(f.filename).suffix.lower()
            if f_ext not in supported_extensions:
                continue

            if f_ext == '.zip':
                logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
                continue

            try:
                with zf.open(f) as zipped_file:
                    results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in {".gif", ".webp"},
                                                           strength=strength, exact_match=exact_match)
            except BadZipFile as e:
                logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
            except UnidentifiedImageError:
                logger.warning("Could not identify image %s in %s", f.filename, path)

    return path, results
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@dataclass
class ImageMatch:
    """
    A group of paths sharing one hash.

    Attributes:
        match_i: Position of this group in the matcher's match list, or None while it is not listed there.
        matches: The paths collected for this hash.
    """
    match_i: int | None = None
    matches: list[ZipPath] = field(default_factory=list)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# region Events
|
|
231
|
+
@dataclass(frozen=True)
class NewGroup:
    """A new group event: a hash has just gained its second path and become a match group"""
    # The match group that was just created
    group: "ImageMatch"


@dataclass(frozen=True)
class NewMatch:
    """A new match event: another path was added to an already existing match group"""
    # The match group that gained a member
    group: "ImageMatch"
    # The path that was just added to `group`
    path: ZipPath


@dataclass(frozen=True)
class Finished:
    """A finished event; carries no data"""


# Any one of the event types that the matcher places on its `events` queue
MatcherEvent = Union[NewGroup, NewMatch, Finished]
|
|
250
|
+
# endregion
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class ImageMatcher:
|
|
254
|
+
"""
|
|
255
|
+
An image matching SDK
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
strength (int): The 0-10 strength to use for matching. Defaults to 5.
|
|
259
|
+
exact_match (bool): Should use SHA-256 hashes? If False, the default, will use perceptual hashes.
|
|
260
|
+
If True, strength must be 10.
|
|
261
|
+
processes (int): The number of processes to use. Defaults to None.
|
|
262
|
+
extensions (set): The extensions to process. Optional.
|
|
263
|
+
"""
|
|
264
|
+
SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
|
|
265
|
+
|
|
266
|
+
def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
|
|
267
|
+
*, exact_match: bool = False):
|
|
268
|
+
if not (0 <= strength <= 10):
|
|
269
|
+
raise ValueError("Strength must be between 0 and 10!")
|
|
270
|
+
|
|
271
|
+
self.extensions = extensions or self.SUPPORTED_EXTS
|
|
272
|
+
|
|
273
|
+
self.strength = strength
|
|
274
|
+
self.exact_match = exact_match
|
|
275
|
+
self.processes = processes
|
|
276
|
+
|
|
277
|
+
self.found_images = 0
|
|
278
|
+
self.processed_images = 0
|
|
279
|
+
self.duplicate_images = 0
|
|
280
|
+
self.matches = []
|
|
281
|
+
|
|
282
|
+
m = Manager()
|
|
283
|
+
self.events = m.Queue() # Events to go to higher level users
|
|
284
|
+
self._new_paths = m.Queue() # Inbound queue for new paths that are added while processing is running
|
|
285
|
+
self._removed_paths = set() # Paths that have been removed from processing after processing has been started
|
|
286
|
+
self._ignored_files = set() # Files which have been ignored and should be skipped from processing if re-ran
|
|
287
|
+
self._processed_zips = {} # Zips that have been successfully processed
|
|
288
|
+
self._hashes = defaultdict(ImageMatch) # Hash -> Paths
|
|
289
|
+
self._reverse_hashes = {} # Path -> Hash
|
|
290
|
+
|
|
291
|
+
# Pausing and finished signaling...
|
|
292
|
+
self._not_paused = Event()
|
|
293
|
+
self._not_paused.set()
|
|
294
|
+
self._finished = Event()
|
|
295
|
+
self._finished.set()
|
|
296
|
+
|
|
297
|
+
@property
|
|
298
|
+
def left_to_process(self):
|
|
299
|
+
"""Files that are left to process"""
|
|
300
|
+
return self.found_images - self.processed_images
|
|
301
|
+
|
|
302
|
+
def add_path(self, path: str | Path):
|
|
303
|
+
"""Add a path for processing"""
|
|
304
|
+
path = str(Path(path).absolute())
|
|
305
|
+
self._removed_paths.discard(path)
|
|
306
|
+
self._new_paths.put(path)
|
|
307
|
+
|
|
308
|
+
def remove_path(self, folder: str | Path):
|
|
309
|
+
"""
|
|
310
|
+
Mark a folder to be skipped going forward, and remove already-indexed files
|
|
311
|
+
that live under it. Pauses briefly if not already paused to keep state sane.
|
|
312
|
+
"""
|
|
313
|
+
# TODO: This works but the biggest problem with it is that it will not remove any images which are still
|
|
314
|
+
# queue'd up for processing in the ThreadPool... I'm not sure how to fix that yet.
|
|
315
|
+
folder = str(Path(folder).absolute())
|
|
316
|
+
paused = self.conditional_pause()
|
|
317
|
+
self._removed_paths.add(folder)
|
|
318
|
+
|
|
319
|
+
# Remove anything we've already seen under that folder
|
|
320
|
+
# (iterate over a copy because remove() mutates structures)
|
|
321
|
+
to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
|
|
322
|
+
for p in to_remove:
|
|
323
|
+
self.remove(p)
|
|
324
|
+
|
|
325
|
+
to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
|
|
326
|
+
for p in to_remove_zips:
|
|
327
|
+
self._processed_zips.pop(p)
|
|
328
|
+
|
|
329
|
+
self.conditional_resume(paused)
|
|
330
|
+
|
|
331
|
+
def conditional_pause(self) -> bool:
|
|
332
|
+
"""Pause if not paused and return if was paused"""
|
|
333
|
+
_conditional_pause = self.is_paused()
|
|
334
|
+
if not _conditional_pause:
|
|
335
|
+
logger.debug('Performing conditional pause')
|
|
336
|
+
self.pause()
|
|
337
|
+
|
|
338
|
+
return _conditional_pause
|
|
339
|
+
|
|
340
|
+
def conditional_resume(self, was_paused: bool): # noqa: FBT001
|
|
341
|
+
"""Resume if not paused previous (from call to `conditional_pause`)"""
|
|
342
|
+
if not was_paused and not self.is_finished():
|
|
343
|
+
logger.debug('Performing conditional resume')
|
|
344
|
+
self.resume()
|
|
345
|
+
|
|
346
|
+
def pause(self):
|
|
347
|
+
"""Pause processing"""
|
|
348
|
+
logger.debug('Performing pause')
|
|
349
|
+
self._not_paused.clear()
|
|
350
|
+
|
|
351
|
+
def is_paused(self):
|
|
352
|
+
"""Is processing paused"""
|
|
353
|
+
return not self._not_paused.is_set()
|
|
354
|
+
|
|
355
|
+
def finish(self):
|
|
356
|
+
"""Finish processing"""
|
|
357
|
+
logger.debug('Performing finished')
|
|
358
|
+
self._finished.set()
|
|
359
|
+
|
|
360
|
+
def is_finished(self):
|
|
361
|
+
"""Is processing finished"""
|
|
362
|
+
return self._finished.is_set()
|
|
363
|
+
|
|
364
|
+
def resume(self):
|
|
365
|
+
"""Resume processing"""
|
|
366
|
+
logger.debug('Performing resume')
|
|
367
|
+
self._not_paused.set()
|
|
368
|
+
|
|
369
|
+
def running(self):
|
|
370
|
+
"""Currently running and loading hashes?"""
|
|
371
|
+
return not self.is_paused() and (not self.is_finished() or self.left_to_process)
|
|
372
|
+
|
|
373
|
+
def remove(self, path):
|
|
374
|
+
"""Remove a loaded path completely from the image matching system. Will not delete a file."""
|
|
375
|
+
# Pause things while we remove things...
|
|
376
|
+
logger.info('Removing %s from %s', path, self.__class__.__name__)
|
|
377
|
+
paused = self.conditional_pause()
|
|
378
|
+
|
|
379
|
+
hash_ = self._reverse_hashes.pop(path)
|
|
380
|
+
self._hashes[hash_].matches.remove(path)
|
|
381
|
+
if len(self._hashes[hash_].matches) == 1:
|
|
382
|
+
match_i = self._hashes[hash_].match_i
|
|
383
|
+
logger.debug('Unmatching match group %s', match_i)
|
|
384
|
+
self._hashes[hash_].match_i = None
|
|
385
|
+
|
|
386
|
+
del self.matches[match_i]
|
|
387
|
+
self.refresh_match_indexes(match_i)
|
|
388
|
+
self.duplicate_images -= 2
|
|
389
|
+
|
|
390
|
+
elif not self._hashes[hash_].matches:
|
|
391
|
+
logger.debug('Removing empty match group')
|
|
392
|
+
del self._hashes[hash_]
|
|
393
|
+
|
|
394
|
+
else:
|
|
395
|
+
logger.debug('Simple removal performed')
|
|
396
|
+
self.duplicate_images -= 1
|
|
397
|
+
|
|
398
|
+
self.processed_images -= 1
|
|
399
|
+
self.found_images -= 1
|
|
400
|
+
self.conditional_resume(paused)
|
|
401
|
+
|
|
402
|
+
def ignore(self, path):
|
|
403
|
+
"""Remove a path from the image matching service"""
|
|
404
|
+
self.remove(path)
|
|
405
|
+
|
|
406
|
+
if path.path_obj.suffix.lower() != '.zip':
|
|
407
|
+
self._ignored_files.add(path.path)
|
|
408
|
+
|
|
409
|
+
def refresh_match_indexes(self, start=0):
|
|
410
|
+
"""Update the match_i value for all the matches passed a certain point"""
|
|
411
|
+
for match_i, match in enumerate(self.matches[start:], start=start):
|
|
412
|
+
match.match_i = match_i
|
|
413
|
+
|
|
414
|
+
def _process_image_callback(self, result):
|
|
415
|
+
"""
|
|
416
|
+
Handle the result of hashing an image.
|
|
417
|
+
|
|
418
|
+
This needs to do quite a few things including sanitizing the results,
|
|
419
|
+
actually checking if the hash matches an existing image,
|
|
420
|
+
adding the image and any matches to the backend data structures, notify any listeners,
|
|
421
|
+
update the found and processed image counts,
|
|
422
|
+
and verify that this result wasn't added as a removed path since it was queued.
|
|
423
|
+
|
|
424
|
+
Args:
|
|
425
|
+
result: A tuple consisting of the path to the file, and the resultant hashes.
|
|
426
|
+
If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
|
|
427
|
+
the individual zip files will sanitized and re-ran through this callback.
|
|
428
|
+
"""
|
|
429
|
+
# TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
|
|
430
|
+
# Perhaps create a new queue/thread and queue up processing for zip results?
|
|
431
|
+
# I think the major slow point is adding to the data structures and I'm not sure if more threads will help
|
|
432
|
+
# Check for paused or finished signals
|
|
433
|
+
self._not_paused.wait()
|
|
434
|
+
if self.is_finished():
|
|
435
|
+
return
|
|
436
|
+
|
|
437
|
+
# region Sanitize results
|
|
438
|
+
path: Path | str | ZipPath
|
|
439
|
+
path, hashes = result
|
|
440
|
+
|
|
441
|
+
if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
|
|
442
|
+
# This image was removed AFTER it was queue'd! So decrement the found images count and just leave...
|
|
443
|
+
self.found_images -= 1
|
|
444
|
+
return
|
|
445
|
+
|
|
446
|
+
if isinstance(hashes, dict):
|
|
447
|
+
self.found_images -= 1
|
|
448
|
+
subpaths = []
|
|
449
|
+
for sub_path, sub_hashes in hashes.items():
|
|
450
|
+
self.found_images += 1
|
|
451
|
+
subpaths.append(ZipPath(str(path), sub_path))
|
|
452
|
+
self._process_image_callback((subpaths[-1], sub_hashes))
|
|
453
|
+
self._processed_zips[str(path)] = subpaths
|
|
454
|
+
return
|
|
455
|
+
|
|
456
|
+
initial_hash, extra_hashes = hashes
|
|
457
|
+
extra_hashes.add(initial_hash)
|
|
458
|
+
if not isinstance(path, ZipPath):
|
|
459
|
+
# From this point on, EVERYTHING should be a ZipPath
|
|
460
|
+
path = ZipPath(str(path), "")
|
|
461
|
+
# endregion
|
|
462
|
+
|
|
463
|
+
if path in self._reverse_hashes:
|
|
464
|
+
self.found_images -= 1
|
|
465
|
+
return
|
|
466
|
+
|
|
467
|
+
self.processed_images += 1
|
|
468
|
+
|
|
469
|
+
# From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
|
|
470
|
+
# iterate over a list and do an `is in` check for each hash
|
|
471
|
+
found_hashes = self._hashes.keys() & extra_hashes
|
|
472
|
+
if not found_hashes:
|
|
473
|
+
# This is a new image not matching any previous, so just add it to the hashmap and move on...
|
|
474
|
+
# Just use the initial orientation
|
|
475
|
+
hash_ = initial_hash
|
|
476
|
+
self._reverse_hashes[path] = hash_
|
|
477
|
+
self._hashes[hash_].matches.append(path)
|
|
478
|
+
return
|
|
479
|
+
|
|
480
|
+
# We have found a match!
|
|
481
|
+
hash_ = next(iter(found_hashes))
|
|
482
|
+
self._reverse_hashes[path] = hash_
|
|
483
|
+
self._hashes[hash_].matches.append(path)
|
|
484
|
+
|
|
485
|
+
if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
|
|
486
|
+
# This is a brand new match group!
|
|
487
|
+
self._hashes[hash_].match_i = len(self.matches)
|
|
488
|
+
self.matches.append(self._hashes[hash_])
|
|
489
|
+
self.duplicate_images += 2
|
|
490
|
+
self.events.put(NewGroup(self._hashes[hash_]))
|
|
491
|
+
logger.debug('New match group found: %s', self._hashes[hash_].matches)
|
|
492
|
+
else:
|
|
493
|
+
# Just another match for an existing group...
|
|
494
|
+
self.duplicate_images += 1
|
|
495
|
+
self.events.put(NewMatch(self._hashes[hash_], path))
|
|
496
|
+
logger.debug('New match found for group #%s: %s',
|
|
497
|
+
self._hashes[hash_].match_i,
|
|
498
|
+
self._hashes[hash_].matches)
|
|
499
|
+
|
|
500
|
+
def _process_image_error_callback(self, e):
|
|
501
|
+
"""Temporary for testing"""
|
|
502
|
+
self.processed_images += 1
|
|
503
|
+
logger.error("%s: %s (input path %s)", type(e).__name__, e, e.input_path)
|
|
504
|
+
|
|
505
|
+
def _root_stream(self):
|
|
506
|
+
"""This is to yield any paths for processing, then wait until processing is finished for any new paths"""
|
|
507
|
+
while not self._new_paths.empty() or self.left_to_process:
|
|
508
|
+
if self._new_paths.empty():
|
|
509
|
+
time.sleep(0.05)
|
|
510
|
+
continue
|
|
511
|
+
|
|
512
|
+
yield self._new_paths.get_nowait()
|
|
513
|
+
|
|
514
|
+
def run(self, paths: list[str | Path]):
|
|
515
|
+
"""Do the work of matching!"""
|
|
516
|
+
self._not_paused.set()
|
|
517
|
+
self._finished.clear()
|
|
518
|
+
|
|
519
|
+
for path in paths:
|
|
520
|
+
self.add_path(path)
|
|
521
|
+
|
|
522
|
+
with Pool(self.processes) as tp:
|
|
523
|
+
for path in self._root_stream():
|
|
524
|
+
path = Path(path)
|
|
525
|
+
if not path.is_dir():
|
|
526
|
+
logger.warning('A path was entered that was not a directory : %s', path)
|
|
527
|
+
continue
|
|
528
|
+
|
|
529
|
+
path = str(path.absolute())
|
|
530
|
+
if path in self._removed_paths:
|
|
531
|
+
continue
|
|
532
|
+
|
|
533
|
+
for root, dirs, files in os.walk(path):
|
|
534
|
+
if self.is_finished():
|
|
535
|
+
break
|
|
536
|
+
|
|
537
|
+
dirs.sort() # This actually works to ensure that os.walk goes in alphabetical order!
|
|
538
|
+
root = Path(root)
|
|
539
|
+
|
|
540
|
+
if any(_is_under(d, root) for d in self._removed_paths):
|
|
541
|
+
continue
|
|
542
|
+
|
|
543
|
+
for f in files:
|
|
544
|
+
self._not_paused.wait()
|
|
545
|
+
if self.is_finished():
|
|
546
|
+
break
|
|
547
|
+
|
|
548
|
+
f = root / f
|
|
549
|
+
|
|
550
|
+
if f.suffix.lower() not in self.extensions:
|
|
551
|
+
continue
|
|
552
|
+
|
|
553
|
+
if any(_is_under(d, f) for d in self._removed_paths):
|
|
554
|
+
continue
|
|
555
|
+
|
|
556
|
+
if str(f) in self._ignored_files:
|
|
557
|
+
continue
|
|
558
|
+
|
|
559
|
+
if f.suffix.lower() == '.zip':
|
|
560
|
+
if str(f.absolute()) in self._processed_zips:
|
|
561
|
+
continue
|
|
562
|
+
elif ZipPath(str(f), "") in self._reverse_hashes:
|
|
563
|
+
continue
|
|
564
|
+
|
|
565
|
+
self.found_images += 1
|
|
566
|
+
tp.apply_async(
|
|
567
|
+
_process_image,
|
|
568
|
+
args=(f, ),
|
|
569
|
+
kwds={
|
|
570
|
+
'strength': self.strength,
|
|
571
|
+
'supported_extensions': self.extensions,
|
|
572
|
+
'exact_match': self.exact_match,
|
|
573
|
+
},
|
|
574
|
+
callback=self._process_image_callback,
|
|
575
|
+
error_callback=self._process_image_error_callback,
|
|
576
|
+
)
|
|
577
|
+
|
|
578
|
+
tp.close()
|
|
579
|
+
|
|
580
|
+
if not self.is_finished():
|
|
581
|
+
tp.join()
|
|
582
|
+
|
|
583
|
+
if not self.is_finished():
|
|
584
|
+
self._finished.set()
|
|
585
|
+
self.events.put(Finished())
|
pixmatch/__main__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import platform
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from PySide6 import QtWidgets
|
|
8
|
+
|
|
9
|
+
from pixmatch.gui import MainWindow
|
|
10
|
+
|
|
11
|
+
# Entry point for `python -m pixmatch`: parse the command line, configure logging, then start the Qt GUI.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process zero or more file paths.",
    )
    parser.add_argument(
        "folders",
        nargs="*",
        type=Path,
        help="Folders to load into the selected file path display (to speed up testing).",
    )
    parser.add_argument('--verbose', action='store_true', help="More detailed logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(module)s::%(funcName)s::%(lineno)d %(levelname)s %(asctime)s - %(message)s',
    )

    if platform.system() == "Windows":
        # Need to tell Windows to not use the Python app icon and use the Window icon instead...
        #   I'm not sure on the specifics but calling this method with any string seems to do the trick....
        #   https://stackoverflow.com/questions/1551605
        import ctypes
        ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID('company.app.1')

    app = QtWidgets.QApplication([])
    # Basic stylesheet for subtle polish without complexity.
    app.setStyleSheet(
        """
QToolBar { spacing: 8px; }
QLabel#GroupTitle { padding: 4px 0; }
QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
""",
    )
    # The window receives the folders given on the command line
    w = MainWindow(args.folders)
    w.show()
    # Run the Qt event loop
    app.exec()
|
pixmatch/utils.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Iterable
|
|
2
|
+
|
|
3
|
+
def human_bytes(
    n: int,
    *,
    base: int = 1000,
    decimals: int = 0,
    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
) -> str:
    """
    Convert a byte count to a human-readable string.

    Args:
        n: Byte count (e.g., from os.stat().st_size).
        base: 1000 for SI (kb, mb, ...), 1024 for IEC-like step size.
        decimals: Maximum decimal places for non-byte units (0 -> '66kb', 1 -> '1.5gb').
            Trailing zeros are dropped, so a round value reads '1gb' rather than '1.0gb'.
        units: Unit suffixes to use. Defaults to lowercase ('kb'); swap for ('B','kB','MB',...) if preferred.

    Returns:
        A compact string like '66kb', '1mb', '1.5gb', or '999b'.

    Raises:
        ValueError: If n < 0
    """
    if n < 0:
        raise ValueError("Byte size cannot be negative")

    # Materialize once: `units` may be a one-shot iterator, which a second tuple() call would find empty.
    unit_names = tuple(units)

    i = 0
    max_i = len(unit_names) - 1
    while n >= base and i < max_i:
        n /= base
        i += 1

    if i == 0 or decimals == 0:
        # Bytes or integer formatting requested
        return f"{int(n)}{unit_names[i]}"

    # Trim trailing zeros from the number BEFORE the unit is appended; stripping the finished
    #   string would do nothing because it ends with the unit suffix, not with a digit.
    number = f"{n:.{decimals}f}".rstrip("0").rstrip(".")
    return f"{number}{unit_names[i]}"
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pixmatch
|
|
3
|
+
Version: 0.0.6
|
|
4
|
+
Summary: A modern VisiPics replacement.
|
|
5
|
+
Author-email: Ryan Heard <ryanwheard@gmail.com>
|
|
6
|
+
Project-URL: Repository, https://github.com/rheard/pixmatch
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: Pillow
|
|
18
|
+
Requires-Dist: imagehash
|
|
19
|
+
Provides-Extra: gui
|
|
20
|
+
Requires-Dist: PySide6; extra == "gui"
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# PixMatch
|
|
24
|
+
|
|
25
|
+
PixMatch is a modern, cross-platform duplicate-image finder inspired by VisiPics, built with PySide6.
|
|
26
|
+
|
|
27
|
+

|
|
28
|
+
|
|
29
|
+
PixMatch scans folders (and ZIP archives) for visually similar images, groups matches,
|
|
30
|
+
and lets you quickly keep, ignore, or delete files from a clean GUI.
|
|
31
|
+
Rotated, mirrored or recompressed images are no match for PixMatch!
|
|
32
|
+
PixMatch can even detect visually similar GIFs and animated WebP files.
|
|
33
|
+
Files inside ZIPs are treated as read-only “sources of truth”
|
|
34
|
+
—never deleted—so you can safely compare against archived libraries.
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
Supported extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.tif`, `.tiff`, `.bmp`, `.gif`, `.zip`.
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
PixMatch is a standard Python app (GUI via PySide6).
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
python -m pip install pixmatch[gui]
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Running
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
python -m pixmatch
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Usage
|
|
55
|
+
|
|
56
|
+
Simply select some folders to parse and then click begin.
|
|
57
|
+
|
|
58
|
+
Once duplicate groups begin to appear in the duplicates view,
|
|
59
|
+
you can start to select actions for them and then execute those actions.
|
|
60
|
+
Clicking on a tile will cycle through actions, with red being delete, yellow being ignore, and green being no action.
|
|
61
|
+
|
|
62
|
+
Images which are in zips and cannot be deleted will have a rar icon to denote such,
|
|
63
|
+
and they cannot be marked for deletion.
|
|
64
|
+
|
|
65
|
+
The status bar under each image shows the full path, the file size, the uncompressed file size,
|
|
66
|
+
the frames in the image if it is an animated image, the image dimensions and the last modified date.
|
|
67
|
+
|
|
68
|
+
Basic status bar example:
|
|
69
|
+
|
|
70
|
+

|
|
71
|
+
|
|
72
|
+
Animated image status bar example:
|
|
73
|
+
|
|
74
|
+

|
|
75
|
+
|
|
76
|
+
#### Notes
|
|
77
|
+
* An exact match checkbox is provided. If strength is 10 and this checkbox is checked,
|
|
78
|
+
SHA-256 file hashes will be used instead of perceptual hashes.
|
|
79
|
+
|
|
80
|
+
#### Optional Args:
|
|
81
|
+
```markdown
|
|
82
|
+
positional arguments:
|
|
83
|
+
folders Folders to load into the selected file path display (to speed up testing).
|
|
84
|
+
|
|
85
|
+
options:
|
|
86
|
+
--verbose More detailed logging
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Acknowledgements
|
|
90
|
+
|
|
91
|
+
* Thanks to anyone who supported this effort, including the teams behind PySide6, Pillow, PyPI, and many other projects.
|
|
92
|
+
* Thanks to Johannes Buchner and the team behind imagehash, which serves as a large backbone in this application and saved me a lot of time.
|
|
93
|
+
* Thanks to Guillaume Fouet (aka Ozone) for VisiPics and the inspiration. Please don't be mad, I just wanted some new features like better gif and zip support.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
pixmatch/__init__.py,sha256=-jOdBz0QudNjRZYeZK6qYSDU6Plnjjt3AnV3eiFJvEs,22081
|
|
2
|
+
pixmatch/__main__.py,sha256=DVd1-B2O-0PC2lPgl40xDN277SPSHwOiE6pFGxK-xO0,1548
|
|
3
|
+
pixmatch/utils.py,sha256=4dHALWtt9y3EIdRLiM3GfRUho3xfn3QErQ69R20A1Lw,1120
|
|
4
|
+
pixmatch-0.0.6.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
5
|
+
pixmatch-0.0.6.dist-info/METADATA,sha256=7DEGio2Z4AsTZWjdcKeq2Dvz2IYeuphxLP58A6_Dzvk,3540
|
|
6
|
+
pixmatch-0.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
pixmatch-0.0.6.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
|
|
8
|
+
pixmatch-0.0.6.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2018 The Python Packaging Authority
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pixmatch
|