pixmatch 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixmatch might be problematic. Click here for more details.

pixmatch/__init__.py CHANGED
@@ -1,444 +1,444 @@
1
- import hashlib
2
- import logging
3
- import os
4
- import time
5
-
6
- from collections import defaultdict
7
- from dataclasses import dataclass, field
8
- from multiprocessing import Pool, Manager
9
- from pathlib import Path
10
- from threading import Event
11
- from typing import Union
12
- from zipfile import ZipFile
13
-
14
- import imagehash
15
- import numpy as np
16
-
17
- from PIL import Image
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- @dataclass(frozen=True)
23
- class ZipPath:
24
- path: str
25
- subpath: str
26
-
27
- @property
28
- def path_obj(self):
29
- return Path(self.path)
30
-
31
- @property
32
- def is_gif(self) -> bool:
33
- movie_extensions = {'.gif', '.webp'}
34
- return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
35
- or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
36
-
37
- def absolute(self):
38
- return ZipPath(str(self.path_obj.absolute()), self.subpath)
39
-
40
-
41
- def _is_under(folder_abs: str, target: str | Path) -> bool:
42
- """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
43
- try:
44
- Path(target).absolute().relative_to(Path(folder_abs).absolute())
45
- return True
46
- except ValueError:
47
- return False
48
-
49
-
50
- def phash_params_for_strength(strength: int) -> tuple[int, int]:
51
- # TODO: This sucks.
52
- strength = max(0, min(10, strength))
53
- if strength >= 10:
54
- return 16, 4 # 256-bit hash, strict
55
- elif strength >= 8:
56
- return 15, 4
57
- elif strength >= 7:
58
- return 13, 4
59
- elif strength >= 6:
60
- return 11, 4
61
- elif strength >= 5:
62
- return 9, 4
63
- elif strength >= 4:
64
- return 8, 4
65
- elif strength >= 3:
66
- return 8, 3
67
- elif strength >= 2:
68
- return 7, 3
69
- else:
70
- return 6, 3
71
-
72
-
73
- def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
74
- """
75
- Calculate hashes for a given file.
76
-
77
- Args:
78
- f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
79
- is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
80
- strength (int): A number between 0 and 10 on the strength of the matches.
81
- exact_match (bool): Use exact SHA256 hashes?
82
- If true, strength must be 10.
83
- If false, perceptual hashes will be used, even with high strength.
84
-
85
- Returns:
86
- list: The found hashes.
87
- """
88
- if exact_match:
89
- hasher = hashlib.sha256()
90
- block_size = 65536
91
- with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
92
- for block in iter(lambda: file.read(block_size), b""):
93
- hasher.update(block)
94
- return [hasher.hexdigest()]
95
-
96
- hash_size, highfreq_factor = phash_params_for_strength(strength)
97
- with (Image.open(f) as im):
98
- if is_gif:
99
- initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
100
- # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
101
- # because some gifs have bad first frames consisting of nothing or only a single color...
102
- # To deal with that I'm looking for these bad hashes here and if it's one, we advance to the next frame
103
- # and use THAT for imagehash instead.
104
- # The ones we need to be on the lookout for are:
105
- # 1. The hash is all 1111...
106
- # 2. The hash is all 0000...
107
- # 3. The hash is of the form 100000...
108
- # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
109
- val = initial_hash.hash[0][0]
110
- while all(all(x == val for x in r) for r in initial_hash.hash) \
111
- or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
112
- for r_i, r in enumerate(initial_hash.hash)):
113
- try:
114
- im.seek(im.tell() + 1)
115
- except EOFError:
116
- break
117
- else:
118
- initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
119
- val = initial_hash.hash[0][0]
120
-
121
- # For GIFs we'll look for mirrored versions but thats it
122
- flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
123
- return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]
124
-
125
- flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
126
- flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
127
- images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
128
- flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
129
- flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
130
- return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
131
-
132
-
133
- def _process_image(path: str | Path, strength=5, exact_match=False):
134
- path = Path(path)
135
- if path.suffix.lower() != '.zip':
136
- return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
137
- strength=strength, exact_match=exact_match)
138
-
139
- results = dict()
140
- with ZipFile(path) as zf:
141
- for f in zf.filelist:
142
- with zf.open(f) as zipped_file:
143
- results[f.filename] = calculate_hashes(zipped_file, is_gif=f.filename[-4:].lower() in {".gif", ".webp"},
144
- strength=strength, exact_match=exact_match)
145
-
146
- return path, results
147
-
148
-
149
- @dataclass
150
- class ImageMatch:
151
- match_i: int | None = field(default=None)
152
- matches: list[ZipPath] = field(default_factory=list)
153
-
154
-
155
- @dataclass(frozen=True)
156
- class NewGroup:
157
- group: "ImageMatch" # forward-ref to your class
158
-
159
-
160
- @dataclass(frozen=True)
161
- class NewMatch:
162
- group: "ImageMatch"
163
- path: ZipPath
164
-
165
-
166
- @dataclass(frozen=True)
167
- class Finished:
168
- pass
169
-
170
-
171
- MatcherEvent = Union[NewGroup, NewMatch, Finished]
172
-
173
-
174
- # TODO: FINISHED signal?
175
- class ImageMatcher:
176
- SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
177
-
178
- def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
179
- extensions: set | None = None):
180
- if not (0 <= strength <= 10):
181
- raise ValueError("Strength must be between 0 and 10!")
182
-
183
- self.extensions = extensions or self.SUPPORTED_EXTS
184
-
185
- self.strength = strength
186
- self.exact_match = exact_match
187
- self.processes = processes
188
- self.found_images = 0
189
- self.processed_images = 0
190
- self.duplicate_images = 0
191
-
192
- m = Manager()
193
- self.events = m.Queue()
194
- self._new_paths = m.Queue()
195
- self._removed_paths = set()
196
- self._processed_paths = set()
197
- self._hashes = defaultdict(ImageMatch)
198
- self._reverse_hashes = dict()
199
-
200
- self._not_paused = Event()
201
- self._not_paused.set()
202
- self._finished = Event()
203
- self._finished.set()
204
-
205
- self.matches = []
206
-
207
- def add_path(self, path: str | Path):
208
- path = str(Path(path).absolute())
209
- self._removed_paths.discard(path)
210
- self._new_paths.put(path)
211
-
212
- def remove_path(self, folder: str | Path) -> None:
213
- """
214
- Mark a folder to be skipped going forward, and remove already-indexed files
215
- that live under it. Pauses briefly if not already paused to keep state sane.
216
- """
217
- folder = str(Path(folder).absolute())
218
- paused = self.conditional_pause()
219
- self._removed_paths.add(folder)
220
- self._processed_paths.discard(folder)
221
-
222
- # Remove anything we've already seen under that folder
223
- # (iterate over a copy because remove() mutates structures)
224
- to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
225
- for p in to_remove:
226
- self.remove(p)
227
-
228
- self.conditional_resume(paused)
229
-
230
- @property
231
- def left_to_process(self):
232
- return self.found_images - self.processed_images
233
-
234
- def pause(self):
235
- logger.debug('Performing pause')
236
- self._not_paused.clear()
237
-
238
- def conditional_pause(self):
239
- _conditional_pause = self.is_paused()
240
- if not _conditional_pause:
241
- logger.debug('Performing conditional pause')
242
- self.pause()
243
-
244
- return _conditional_pause
245
-
246
- def conditional_resume(self, was_paused):
247
- if not was_paused and not self.is_finished():
248
- logger.debug('Performing conditional resume')
249
- self.resume()
250
-
251
- def is_paused(self):
252
- return not self._not_paused.is_set()
253
-
254
- def finish(self):
255
- logger.debug('Performing finished')
256
- self._finished.set()
257
-
258
- def is_finished(self):
259
- return self._finished.is_set()
260
-
261
- def resume(self):
262
- logger.debug('Performing resume')
263
- self._not_paused.set()
264
-
265
- def running(self):
266
- return not self.is_paused() and (not self.is_finished() or self.left_to_process)
267
-
268
- def remove(self, path):
269
- # Pause things while we remove things...
270
- logger.info('Removing %s from %s', path, self.__class__.__name__)
271
- paused = self.conditional_pause()
272
-
273
- hash = self._reverse_hashes.pop(path)
274
- self._hashes[hash].matches.remove(path)
275
- if len(self._hashes[hash].matches) == 1:
276
- match_i = self._hashes[hash].match_i
277
- logger.debug('Unmatching match group %s', match_i)
278
- self._hashes[hash].match_i = None
279
-
280
- del self.matches[match_i]
281
- self.refresh_match_indexes(match_i)
282
- self.duplicate_images -= 2
283
-
284
- elif not self._hashes[hash].matches:
285
- logger.debug('Removing empty match group')
286
- del self._hashes[hash]
287
-
288
- else:
289
- logger.debug('Simple removal performed')
290
- self.duplicate_images -= 1
291
-
292
- self.processed_images -= 1
293
- self.found_images -= 1
294
- self.conditional_resume(paused)
295
-
296
- def refresh_match_indexes(self, start=0):
297
- for match_i, match in enumerate(self.matches[start:], start=start):
298
- match.match_i = match_i
299
-
300
- def _process_image_callback(self, result):
301
- self._not_paused.wait()
302
- if self.is_finished():
303
- return
304
-
305
- path: Path | str | ZipPath
306
- path, hashes = result
307
-
308
- if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
309
- self.found_images -= 1
310
- return
311
-
312
- if isinstance(hashes, dict):
313
- for sub_path, sub_hashes in hashes.items():
314
- self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
315
- return
316
-
317
- if not isinstance(path, ZipPath):
318
- path = ZipPath(str(path), "")
319
-
320
- if path in self._reverse_hashes:
321
- self.found_images -= 1
322
- return
323
-
324
- self.processed_images += 1
325
- for hash_ in hashes:
326
- if hash_ not in self._hashes:
327
- continue
328
-
329
- self._reverse_hashes[path] = hash_
330
-
331
- # This appears to be a new match!
332
- for match in self._hashes[hash_].matches:
333
- if path.absolute() == match.absolute():
334
- # This appears to be a duplicate PATH...
335
- logger.warning('Duplicate files entered! %s, %s', path, match)
336
- return
337
-
338
- self._hashes[hash_].matches.append(path)
339
- if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
340
- # This is a brand new match group!
341
- self._hashes[hash_].match_i = len(self.matches)
342
- self.matches.append(self._hashes[hash_])
343
- self.duplicate_images += 2
344
- self.events.put(NewGroup(self._hashes[hash_]))
345
- logger.debug('New match group found: %s', self._hashes[hash_].matches)
346
- else:
347
- # Just another match for an existing group...
348
- self.duplicate_images += 1
349
- self.events.put(NewMatch(self._hashes[hash_], path))
350
- logger.debug('New match found for group #%s: %s',
351
- self._hashes[hash_].match_i,
352
- self._hashes[hash_].matches)
353
-
354
- break
355
- else:
356
- # This is a new hash, so just add it to the hashmap and move on...
357
- # Just use the initial orientation
358
- hash_ = hashes[0]
359
- self._reverse_hashes[path] = hash_
360
- self._hashes[hash_].matches.append(path)
361
- return
362
-
363
- def _process_image_error_callback(self, e):
364
- self.processed_images += 1
365
- print(str(e))
366
-
367
- def _root_stream(self):
368
- # Yield any paths that come up for processing, then wait until processing is finished for any new paths
369
- while not self._new_paths.empty() or self.left_to_process:
370
- if self._new_paths.empty():
371
- time.sleep(0.05)
372
- continue
373
-
374
- yield self._new_paths.get_nowait()
375
-
376
- def run(self, paths: list[str | Path]):
377
- # TODO: Verify none of the paths overlap
378
- # TODO: Verify none of the dirs have been deleted after we started
379
-
380
- self._not_paused.set()
381
- self._finished.clear()
382
-
383
- for path in paths:
384
- self.add_path(path)
385
-
386
- with Pool(self.processes) as tp:
387
- for path in self._root_stream():
388
- path = Path(path)
389
- if not path.is_dir():
390
- logger.warning('A path was entered that was not a directory : %s', path)
391
- continue
392
-
393
- path = str(path.absolute())
394
- if path in self._removed_paths or path in self._processed_paths:
395
- continue
396
-
397
- for root, dirs, files in os.walk(path):
398
- if self.is_finished():
399
- break
400
-
401
- root = Path(root)
402
-
403
- if any(_is_under(d, root) for d in self._removed_paths):
404
- continue
405
-
406
- for f in files:
407
- self._not_paused.wait()
408
- if self.is_finished():
409
- break
410
-
411
- f = root / f
412
-
413
- if f.suffix.lower() not in self.extensions:
414
- continue
415
-
416
- if any(_is_under(d, f) for d in self._removed_paths):
417
- continue
418
-
419
- # TODO: This sucks (for zips at least), but I can't iterate over the dict while its changing...
420
- if ZipPath(str(f), "") in self._reverse_hashes:
421
- continue
422
-
423
- self.found_images += 1
424
- tp.apply_async(
425
- _process_image,
426
- args=(f, ),
427
- kwds={
428
- 'strength': self.strength,
429
- 'exact_match': self.exact_match,
430
- },
431
- callback=self._process_image_callback,
432
- error_callback=self._process_image_error_callback,
433
- )
434
-
435
- self._processed_paths.add(path)
436
-
437
- tp.close()
438
-
439
- if not self.is_finished():
440
- tp.join()
441
-
442
- if not self.is_finished():
443
- self._finished.set()
444
- self.events.put(Finished())
1
+ import hashlib
2
+ import logging
3
+ import os
4
+ import time
5
+
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass, field
8
+ from multiprocessing import Pool, Manager
9
+ from pathlib import Path
10
+ from threading import Event
11
+ from typing import Union
12
+ from zipfile import ZipFile
13
+
14
+ import imagehash
15
+ import numpy as np
16
+
17
+ from PIL import Image
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class ZipPath:
24
+ path: str
25
+ subpath: str
26
+
27
+ @property
28
+ def path_obj(self):
29
+ return Path(self.path)
30
+
31
+ @property
32
+ def is_gif(self) -> bool:
33
+ movie_extensions = {'.gif', '.webp'}
34
+ return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
35
+ or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
36
+
37
+ def absolute(self):
38
+ return ZipPath(str(self.path_obj.absolute()), self.subpath)
39
+
40
+
41
+ def _is_under(folder_abs: str, target: str | Path) -> bool:
42
+ """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
43
+ try:
44
+ Path(target).absolute().relative_to(Path(folder_abs).absolute())
45
+ return True
46
+ except ValueError:
47
+ return False
48
+
49
+
50
+ def phash_params_for_strength(strength: int) -> tuple[int, int]:
51
+ # TODO: This sucks.
52
+ strength = max(0, min(10, strength))
53
+ if strength >= 10:
54
+ return 16, 4 # 256-bit hash, strict
55
+ elif strength >= 8:
56
+ return 15, 4
57
+ elif strength >= 7:
58
+ return 13, 4
59
+ elif strength >= 6:
60
+ return 11, 4
61
+ elif strength >= 5:
62
+ return 9, 4
63
+ elif strength >= 4:
64
+ return 8, 4
65
+ elif strength >= 3:
66
+ return 8, 3
67
+ elif strength >= 2:
68
+ return 7, 3
69
+ else:
70
+ return 6, 3
71
+
72
+
73
+ def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
74
+ """
75
+ Calculate hashes for a given file.
76
+
77
+ Args:
78
+ f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
79
+ is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
80
+ strength (int): A number between 0 and 10 on the strength of the matches.
81
+ exact_match (bool): Use exact SHA256 hashes?
82
+ If true, strength must be 10.
83
+ If false, perceptual hashes will be used, even with high strength.
84
+
85
+ Returns:
86
+ list: The found hashes.
87
+ """
88
+ if exact_match:
89
+ hasher = hashlib.sha256()
90
+ block_size = 65536
91
+ with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
92
+ for block in iter(lambda: file.read(block_size), b""):
93
+ hasher.update(block)
94
+ return [hasher.hexdigest()]
95
+
96
+ hash_size, highfreq_factor = phash_params_for_strength(strength)
97
+ with (Image.open(f) as im):
98
+ if is_gif:
99
+ initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
100
+ # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
101
+ # because some gifs have bad first frames consisting of nothing or only a single color...
102
+ # To deal with that I'm looking for these bad hashes here and if it's one, we advance to the next frame
103
+ # and use THAT for imagehash instead.
104
+ # The ones we need to be on the lookout for are:
105
+ # 1. The hash is all 1111...
106
+ # 2. The hash is all 0000...
107
+ # 3. The hash is of the form 100000...
108
+ # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
109
+ val = initial_hash.hash[0][0]
110
+ while all(all(x == val for x in r) for r in initial_hash.hash) \
111
+ or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
112
+ for r_i, r in enumerate(initial_hash.hash)):
113
+ try:
114
+ im.seek(im.tell() + 1)
115
+ except EOFError:
116
+ break
117
+ else:
118
+ initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
119
+ val = initial_hash.hash[0][0]
120
+
121
+ # For GIFs we'll look for mirrored versions but thats it
122
+ flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
123
+ return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]
124
+
125
+ flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
126
+ flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
127
+ images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
128
+ flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
129
+ flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
130
+ return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
131
+
132
+
133
+ def _process_image(path: str | Path, strength=5, exact_match=False):
134
+ path = Path(path)
135
+ if path.suffix.lower() != '.zip':
136
+ return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
137
+ strength=strength, exact_match=exact_match)
138
+
139
+ results = dict()
140
+ with ZipFile(path) as zf:
141
+ for f in zf.filelist:
142
+ with zf.open(f) as zipped_file:
143
+ results[f.filename] = calculate_hashes(zipped_file, is_gif=f.filename[-4:].lower() in {".gif", ".webp"},
144
+ strength=strength, exact_match=exact_match)
145
+
146
+ return path, results
147
+
148
+
149
+ @dataclass
150
+ class ImageMatch:
151
+ match_i: int | None = field(default=None)
152
+ matches: list[ZipPath] = field(default_factory=list)
153
+
154
+
155
+ @dataclass(frozen=True)
156
+ class NewGroup:
157
+ group: "ImageMatch" # forward-ref to your class
158
+
159
+
160
+ @dataclass(frozen=True)
161
+ class NewMatch:
162
+ group: "ImageMatch"
163
+ path: ZipPath
164
+
165
+
166
+ @dataclass(frozen=True)
167
+ class Finished:
168
+ pass
169
+
170
+
171
+ MatcherEvent = Union[NewGroup, NewMatch, Finished]
172
+
173
+
174
+ # TODO: FINISHED signal?
175
+ class ImageMatcher:
176
+ SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
177
+
178
+ def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
179
+ extensions: set | None = None):
180
+ if not (0 <= strength <= 10):
181
+ raise ValueError("Strength must be between 0 and 10!")
182
+
183
+ self.extensions = extensions or self.SUPPORTED_EXTS
184
+
185
+ self.strength = strength
186
+ self.exact_match = exact_match
187
+ self.processes = processes
188
+ self.found_images = 0
189
+ self.processed_images = 0
190
+ self.duplicate_images = 0
191
+
192
+ m = Manager()
193
+ self.events = m.Queue()
194
+ self._new_paths = m.Queue()
195
+ self._removed_paths = set()
196
+ self._processed_paths = set()
197
+ self._hashes = defaultdict(ImageMatch)
198
+ self._reverse_hashes = dict()
199
+
200
+ self._not_paused = Event()
201
+ self._not_paused.set()
202
+ self._finished = Event()
203
+ self._finished.set()
204
+
205
+ self.matches = []
206
+
207
+ def add_path(self, path: str | Path):
208
+ path = str(Path(path).absolute())
209
+ self._removed_paths.discard(path)
210
+ self._new_paths.put(path)
211
+
212
+ def remove_path(self, folder: str | Path) -> None:
213
+ """
214
+ Mark a folder to be skipped going forward, and remove already-indexed files
215
+ that live under it. Pauses briefly if not already paused to keep state sane.
216
+ """
217
+ folder = str(Path(folder).absolute())
218
+ paused = self.conditional_pause()
219
+ self._removed_paths.add(folder)
220
+ self._processed_paths.discard(folder)
221
+
222
+ # Remove anything we've already seen under that folder
223
+ # (iterate over a copy because remove() mutates structures)
224
+ to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
225
+ for p in to_remove:
226
+ self.remove(p)
227
+
228
+ self.conditional_resume(paused)
229
+
230
+ @property
231
+ def left_to_process(self):
232
+ return self.found_images - self.processed_images
233
+
234
+ def pause(self):
235
+ logger.debug('Performing pause')
236
+ self._not_paused.clear()
237
+
238
+ def conditional_pause(self):
239
+ _conditional_pause = self.is_paused()
240
+ if not _conditional_pause:
241
+ logger.debug('Performing conditional pause')
242
+ self.pause()
243
+
244
+ return _conditional_pause
245
+
246
+ def conditional_resume(self, was_paused):
247
+ if not was_paused and not self.is_finished():
248
+ logger.debug('Performing conditional resume')
249
+ self.resume()
250
+
251
+ def is_paused(self):
252
+ return not self._not_paused.is_set()
253
+
254
+ def finish(self):
255
+ logger.debug('Performing finished')
256
+ self._finished.set()
257
+
258
+ def is_finished(self):
259
+ return self._finished.is_set()
260
+
261
+ def resume(self):
262
+ logger.debug('Performing resume')
263
+ self._not_paused.set()
264
+
265
+ def running(self):
266
+ return not self.is_paused() and (not self.is_finished() or self.left_to_process)
267
+
268
+ def remove(self, path):
269
+ # Pause things while we remove things...
270
+ logger.info('Removing %s from %s', path, self.__class__.__name__)
271
+ paused = self.conditional_pause()
272
+
273
+ hash = self._reverse_hashes.pop(path)
274
+ self._hashes[hash].matches.remove(path)
275
+ if len(self._hashes[hash].matches) == 1:
276
+ match_i = self._hashes[hash].match_i
277
+ logger.debug('Unmatching match group %s', match_i)
278
+ self._hashes[hash].match_i = None
279
+
280
+ del self.matches[match_i]
281
+ self.refresh_match_indexes(match_i)
282
+ self.duplicate_images -= 2
283
+
284
+ elif not self._hashes[hash].matches:
285
+ logger.debug('Removing empty match group')
286
+ del self._hashes[hash]
287
+
288
+ else:
289
+ logger.debug('Simple removal performed')
290
+ self.duplicate_images -= 1
291
+
292
+ self.processed_images -= 1
293
+ self.found_images -= 1
294
+ self.conditional_resume(paused)
295
+
296
+ def refresh_match_indexes(self, start=0):
297
+ for match_i, match in enumerate(self.matches[start:], start=start):
298
+ match.match_i = match_i
299
+
300
+ def _process_image_callback(self, result):
301
+ self._not_paused.wait()
302
+ if self.is_finished():
303
+ return
304
+
305
+ path: Path | str | ZipPath
306
+ path, hashes = result
307
+
308
+ if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
309
+ self.found_images -= 1
310
+ return
311
+
312
+ if isinstance(hashes, dict):
313
+ for sub_path, sub_hashes in hashes.items():
314
+ self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
315
+ return
316
+
317
+ if not isinstance(path, ZipPath):
318
+ path = ZipPath(str(path), "")
319
+
320
+ if path in self._reverse_hashes:
321
+ self.found_images -= 1
322
+ return
323
+
324
+ self.processed_images += 1
325
+ for hash_ in hashes:
326
+ if hash_ not in self._hashes:
327
+ continue
328
+
329
+ self._reverse_hashes[path] = hash_
330
+
331
+ # This appears to be a new match!
332
+ for match in self._hashes[hash_].matches:
333
+ if path.absolute() == match.absolute():
334
+ # This appears to be a duplicate PATH...
335
+ logger.warning('Duplicate files entered! %s, %s', path, match)
336
+ return
337
+
338
+ self._hashes[hash_].matches.append(path)
339
+ if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
340
+ # This is a brand new match group!
341
+ self._hashes[hash_].match_i = len(self.matches)
342
+ self.matches.append(self._hashes[hash_])
343
+ self.duplicate_images += 2
344
+ self.events.put(NewGroup(self._hashes[hash_]))
345
+ logger.debug('New match group found: %s', self._hashes[hash_].matches)
346
+ else:
347
+ # Just another match for an existing group...
348
+ self.duplicate_images += 1
349
+ self.events.put(NewMatch(self._hashes[hash_], path))
350
+ logger.debug('New match found for group #%s: %s',
351
+ self._hashes[hash_].match_i,
352
+ self._hashes[hash_].matches)
353
+
354
+ break
355
+ else:
356
+ # This is a new hash, so just add it to the hashmap and move on...
357
+ # Just use the initial orientation
358
+ hash_ = hashes[0]
359
+ self._reverse_hashes[path] = hash_
360
+ self._hashes[hash_].matches.append(path)
361
+ return
362
+
363
+ def _process_image_error_callback(self, e):
364
+ self.processed_images += 1
365
+ print(str(e))
366
+
367
+ def _root_stream(self):
368
+ # Yield any paths that come up for processing, then wait until processing is finished for any new paths
369
+ while not self._new_paths.empty() or self.left_to_process:
370
+ if self._new_paths.empty():
371
+ time.sleep(0.05)
372
+ continue
373
+
374
+ yield self._new_paths.get_nowait()
375
+
376
+ def run(self, paths: list[str | Path]):
377
+ # TODO: Verify none of the paths overlap
378
+ # TODO: Verify none of the dirs have been deleted after we started
379
+
380
+ self._not_paused.set()
381
+ self._finished.clear()
382
+
383
+ for path in paths:
384
+ self.add_path(path)
385
+
386
+ with Pool(self.processes) as tp:
387
+ for path in self._root_stream():
388
+ path = Path(path)
389
+ if not path.is_dir():
390
+ logger.warning('A path was entered that was not a directory : %s', path)
391
+ continue
392
+
393
+ path = str(path.absolute())
394
+ if path in self._removed_paths or path in self._processed_paths:
395
+ continue
396
+
397
+ for root, dirs, files in os.walk(path):
398
+ if self.is_finished():
399
+ break
400
+
401
+ root = Path(root)
402
+
403
+ if any(_is_under(d, root) for d in self._removed_paths):
404
+ continue
405
+
406
+ for f in files:
407
+ self._not_paused.wait()
408
+ if self.is_finished():
409
+ break
410
+
411
+ f = root / f
412
+
413
+ if f.suffix.lower() not in self.extensions:
414
+ continue
415
+
416
+ if any(_is_under(d, f) for d in self._removed_paths):
417
+ continue
418
+
419
+ # TODO: This sucks (for zips at least), but I can't iterate over the dict while its changing...
420
+ if ZipPath(str(f), "") in self._reverse_hashes:
421
+ continue
422
+
423
+ self.found_images += 1
424
+ tp.apply_async(
425
+ _process_image,
426
+ args=(f, ),
427
+ kwds={
428
+ 'strength': self.strength,
429
+ 'exact_match': self.exact_match,
430
+ },
431
+ callback=self._process_image_callback,
432
+ error_callback=self._process_image_error_callback,
433
+ )
434
+
435
+ self._processed_paths.add(path)
436
+
437
+ tp.close()
438
+
439
+ if not self.is_finished():
440
+ tp.join()
441
+
442
+ if not self.is_finished():
443
+ self._finished.set()
444
+ self.events.put(Finished())