pixmatch 0.0.1-py3-none-any.whl → 0.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixmatch might be problematic.
- pixmatch/__init__.py +444 -444
- pixmatch/__main__.py +48 -48
- pixmatch/utils.py +36 -36
- {pixmatch-0.0.1.dist-info → pixmatch-0.0.2.dist-info}/METADATA +93 -93
- pixmatch-0.0.2.dist-info/RECORD +8 -0
- {pixmatch-0.0.1.dist-info → pixmatch-0.0.2.dist-info}/licenses/LICENSE +18 -18
- pixmatch/gui/__init__.py +0 -837
- pixmatch/gui/pixmatch.ico +0 -0
- pixmatch/gui/utils.py +0 -13
- pixmatch/gui/widgets.py +0 -656
- pixmatch/gui/zip.png +0 -0
- pixmatch-0.0.1.dist-info/RECORD +0 -13
- {pixmatch-0.0.1.dist-info → pixmatch-0.0.2.dist-info}/WHEEL +0 -0
- {pixmatch-0.0.1.dist-info → pixmatch-0.0.2.dist-info}/top_level.txt +0 -0
pixmatch/__init__.py
CHANGED
@@ -1,444 +1,444 @@

Every line of the file is marked as changed, but the removed and added contents are textually identical in this rendering (most likely a whitespace or line-ending difference), so the file is shown once:

```python
import hashlib
import logging
import os
import time

from collections import defaultdict
from dataclasses import dataclass, field
from multiprocessing import Pool, Manager
from pathlib import Path
from threading import Event
from typing import Union
from zipfile import ZipFile

import imagehash
import numpy as np

from PIL import Image

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class ZipPath:
    path: str
    subpath: str

    @property
    def path_obj(self):
        return Path(self.path)

    @property
    def is_gif(self) -> bool:
        movie_extensions = {'.gif', '.webp'}
        return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
            or (bool(self.subpath) and Path(self.subpath).suffix.lower() in movie_extensions)

    def absolute(self):
        return ZipPath(str(self.path_obj.absolute()), self.subpath)


def _is_under(folder_abs: str, target: str | Path) -> bool:
    """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
    try:
        Path(target).absolute().relative_to(Path(folder_abs).absolute())
        return True
    except ValueError:
        return False


def phash_params_for_strength(strength: int) -> tuple[int, int]:
    # TODO: This sucks.
    strength = max(0, min(10, strength))
    if strength >= 10:
        return 16, 4  # 256-bit hash, strict
    elif strength >= 8:
        return 15, 4
    elif strength >= 7:
        return 13, 4
    elif strength >= 6:
        return 11, 4
    elif strength >= 5:
        return 9, 4
    elif strength >= 4:
        return 8, 4
    elif strength >= 3:
        return 8, 3
    elif strength >= 2:
        return 7, 3
    else:
        return 6, 3


def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
    """
    Calculate hashes for a given file.

    Args:
        f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
        is_gif (bool): Is this GIF data? Needed if passing an in-memory BytesIO object.
        strength (int): A number between 0 and 10 controlling the strength of the matching.
        exact_match (bool): Use exact SHA-256 hashes?
            If true, strength must be 10.
            If false, perceptual hashes will be used, even with high strength.

    Returns:
        list: The found hashes.
    """
    if exact_match:
        hasher = hashlib.sha256()
        block_size = 65536
        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
            for block in iter(lambda: file.read(block_size), b""):
                hasher.update(block)
        return [hasher.hexdigest()]

    hash_size, highfreq_factor = phash_params_for_strength(strength)
    with Image.open(f) as im:
        if is_gif:
            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
            # This is going to be a bit confusing, but basically imagehash produces weird hashes for some GIFs
            # because some GIFs have bad first frames consisting of nothing or only a single color...
            # To deal with that, we look for these bad hashes here and, if we hit one, advance to the next frame
            # and use THAT for imagehash instead.
            # The ones we need to be on the lookout for are:
            # 1. The hash is all 1111...
            # 2. The hash is all 0000...
            # 3. The hash is of the form 100000...
            # TODO: This is simply not good enough. I'm still getting bad matches for GIFs, though they are extremely rare
            val = initial_hash.hash[0][0]
            while all(all(x == val for x in r) for r in initial_hash.hash) \
                    or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
                           for r_i, r in enumerate(initial_hash.hash)):
                try:
                    im.seek(im.tell() + 1)
                except EOFError:
                    break
                else:
                    initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
                    val = initial_hash.hash[0][0]

            # For GIFs we'll look for mirrored versions, but that's it
            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]

        flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
        flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
        images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
                  flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
                  flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
        return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]


def _process_image(path: str | Path, strength=5, exact_match=False):
    path = Path(path)
    if path.suffix.lower() != '.zip':
        return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
                                      strength=strength, exact_match=exact_match)

    results = dict()
    with ZipFile(path) as zf:
        for f in zf.filelist:
            with zf.open(f) as zipped_file:
                results[f.filename] = calculate_hashes(zipped_file,
                                                       is_gif=Path(f.filename).suffix.lower() in {".gif", ".webp"},
                                                       strength=strength, exact_match=exact_match)

    return path, results


@dataclass
class ImageMatch:
    match_i: int | None = field(default=None)
    matches: list[ZipPath] = field(default_factory=list)


@dataclass(frozen=True)
class NewGroup:
    group: "ImageMatch"  # forward reference


@dataclass(frozen=True)
class NewMatch:
    group: "ImageMatch"
    path: ZipPath


@dataclass(frozen=True)
class Finished:
    pass


MatcherEvent = Union[NewGroup, NewMatch, Finished]


# TODO: FINISHED signal?
class ImageMatcher:
    SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}

    def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
                 extensions: set | None = None):
        if not (0 <= strength <= 10):
            raise ValueError("Strength must be between 0 and 10!")

        self.extensions = extensions or self.SUPPORTED_EXTS

        self.strength = strength
        self.exact_match = exact_match
        self.processes = processes
        self.found_images = 0
        self.processed_images = 0
        self.duplicate_images = 0

        m = Manager()
        self.events = m.Queue()
        self._new_paths = m.Queue()
        self._removed_paths = set()
        self._processed_paths = set()
        self._hashes = defaultdict(ImageMatch)
        self._reverse_hashes = dict()

        self._not_paused = Event()
        self._not_paused.set()
        self._finished = Event()
        self._finished.set()

        self.matches = []

    def add_path(self, path: str | Path):
        path = str(Path(path).absolute())
        self._removed_paths.discard(path)
        self._new_paths.put(path)

    def remove_path(self, folder: str | Path) -> None:
        """
        Mark a folder to be skipped going forward, and remove already-indexed files
        that live under it. Pauses briefly if not already paused to keep state sane.
        """
        folder = str(Path(folder).absolute())
        paused = self.conditional_pause()
        self._removed_paths.add(folder)
        self._processed_paths.discard(folder)

        # Remove anything we've already seen under that folder
        # (iterate over a copy because remove() mutates structures)
        to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
        for p in to_remove:
            self.remove(p)

        self.conditional_resume(paused)

    @property
    def left_to_process(self):
        return self.found_images - self.processed_images

    def pause(self):
        logger.debug('Performing pause')
        self._not_paused.clear()

    def conditional_pause(self):
        _conditional_pause = self.is_paused()
        if not _conditional_pause:
            logger.debug('Performing conditional pause')
            self.pause()

        return _conditional_pause

    def conditional_resume(self, was_paused):
        if not was_paused and not self.is_finished():
            logger.debug('Performing conditional resume')
            self.resume()

    def is_paused(self):
        return not self._not_paused.is_set()

    def finish(self):
        logger.debug('Performing finish')
        self._finished.set()

    def is_finished(self):
        return self._finished.is_set()

    def resume(self):
        logger.debug('Performing resume')
        self._not_paused.set()

    def running(self):
        return not self.is_paused() and (not self.is_finished() or self.left_to_process)

    def remove(self, path):
        # Pause things while we remove things...
        logger.info('Removing %s from %s', path, self.__class__.__name__)
        paused = self.conditional_pause()

        hash = self._reverse_hashes.pop(path)
        self._hashes[hash].matches.remove(path)
        if len(self._hashes[hash].matches) == 1:
            match_i = self._hashes[hash].match_i
            logger.debug('Unmatching match group %s', match_i)
            self._hashes[hash].match_i = None

            del self.matches[match_i]
            self.refresh_match_indexes(match_i)
            self.duplicate_images -= 2

        elif not self._hashes[hash].matches:
            logger.debug('Removing empty match group')
            del self._hashes[hash]

        else:
            logger.debug('Simple removal performed')
            self.duplicate_images -= 1

        self.processed_images -= 1
        self.found_images -= 1
        self.conditional_resume(paused)

    def refresh_match_indexes(self, start=0):
        for match_i, match in enumerate(self.matches[start:], start=start):
            match.match_i = match_i

    def _process_image_callback(self, result):
        self._not_paused.wait()
        if self.is_finished():
            return

        path: Path | str | ZipPath
        path, hashes = result

        if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
            self.found_images -= 1
            return

        if isinstance(hashes, dict):
            for sub_path, sub_hashes in hashes.items():
                self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
            return

        if not isinstance(path, ZipPath):
            path = ZipPath(str(path), "")

        if path in self._reverse_hashes:
            self.found_images -= 1
            return

        self.processed_images += 1
        for hash_ in hashes:
            if hash_ not in self._hashes:
                continue

            self._reverse_hashes[path] = hash_

            # This appears to be a new match!
            for match in self._hashes[hash_].matches:
                if path.absolute() == match.absolute():
                    # This appears to be a duplicate PATH...
                    logger.warning('Duplicate files entered! %s, %s', path, match)
                    return

            self._hashes[hash_].matches.append(path)
            if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
                # This is a brand new match group!
                self._hashes[hash_].match_i = len(self.matches)
                self.matches.append(self._hashes[hash_])
                self.duplicate_images += 2
                self.events.put(NewGroup(self._hashes[hash_]))
                logger.debug('New match group found: %s', self._hashes[hash_].matches)
            else:
                # Just another match for an existing group...
                self.duplicate_images += 1
                self.events.put(NewMatch(self._hashes[hash_], path))
                logger.debug('New match found for group #%s: %s',
                             self._hashes[hash_].match_i,
                             self._hashes[hash_].matches)

            break
        else:
            # This is a new hash, so just add it to the hashmap and move on...
            # Just use the initial orientation
            hash_ = hashes[0]
            self._reverse_hashes[path] = hash_
            self._hashes[hash_].matches.append(path)
        return

    def _process_image_error_callback(self, e):
        self.processed_images += 1
        print(str(e))

    def _root_stream(self):
        # Yield any paths that come up for processing, then wait until processing is finished for any new paths
        while not self._new_paths.empty() or self.left_to_process:
            if self._new_paths.empty():
                time.sleep(0.05)
                continue

            yield self._new_paths.get_nowait()

    def run(self, paths: list[str | Path]):
        # TODO: Verify none of the paths overlap
        # TODO: Verify none of the dirs have been deleted after we started

        self._not_paused.set()
        self._finished.clear()

        for path in paths:
            self.add_path(path)

        with Pool(self.processes) as tp:
            for path in self._root_stream():
                path = Path(path)
                if not path.is_dir():
                    logger.warning('A path was entered that was not a directory: %s', path)
                    continue

                path = str(path.absolute())
                if path in self._removed_paths or path in self._processed_paths:
                    continue

                for root, dirs, files in os.walk(path):
                    if self.is_finished():
                        break

                    root = Path(root)

                    if any(_is_under(d, root) for d in self._removed_paths):
                        continue

                    for f in files:
                        self._not_paused.wait()
                        if self.is_finished():
                            break

                        f = root / f

                        if f.suffix.lower() not in self.extensions:
                            continue

                        if any(_is_under(d, f) for d in self._removed_paths):
                            continue

                        # TODO: This sucks (for zips at least), but I can't iterate over the dict while it's changing...
                        if ZipPath(str(f), "") in self._reverse_hashes:
                            continue

                        self.found_images += 1
                        tp.apply_async(
                            _process_image,
                            args=(f,),
                            kwds={
                                'strength': self.strength,
                                'exact_match': self.exact_match,
                            },
                            callback=self._process_image_callback,
                            error_callback=self._process_image_error_callback,
                        )

                self._processed_paths.add(path)

            tp.close()

            if not self.is_finished():
                tp.join()

        if not self.is_finished():
            self._finished.set()
            self.events.put(Finished())
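```

For context, the module above is self-contained enough to drive directly. Below is a minimal usage sketch assembled from the API shown in the diff; it is not taken from pixmatch's own documentation, and the `photos` and `backup` directory names are hypothetical. Since `run()` blocks until the scan completes, the sketch runs it on a worker thread and drains `matcher.events` on the main thread.

```python
# Minimal usage sketch, assuming the 0.0.2 API shown in the diff above.
# "photos" and "backup" are hypothetical directories of images.
from threading import Thread

from pixmatch import Finished, ImageMatcher, NewGroup, NewMatch


def main():
    matcher = ImageMatcher(strength=7)  # 0-10; higher means stricter matching

    # run() blocks until scanning finishes, so drive it from a worker thread
    # and consume the event queue here.
    scan = Thread(target=matcher.run, args=(["photos", "backup"],))
    scan.start()

    while True:
        event = matcher.events.get()  # blocks on the Manager queue
        if isinstance(event, NewGroup):
            print("new duplicate group:", [m.path for m in event.group.matches])
        elif isinstance(event, NewMatch):
            print("another copy:", event.path.path)
        elif isinstance(event, Finished):
            break

    scan.join()
    print(f"{matcher.duplicate_images} duplicates in {len(matcher.matches)} groups")


if __name__ == "__main__":
    # The guard matters: ImageMatcher uses multiprocessing internally.
    main()
```

A consumer that wants to stop a scan early can call `matcher.finish()`, which sets the internal finished event and makes `run()` wind down instead of waiting for the pool to drain.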