pixmatch 0.0.4__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pixmatch
- Version: 0.0.4
+ Version: 0.0.5
  Summary: A modern VisiPics replacement.
  Author-email: Ryan Heard <ryanwheard@gmail.com>
  Project-URL: Repository, https://github.com/rheard/pixmatch
@@ -0,0 +1,585 @@
+ import hashlib
+ import logging
+ import os
+ import time
+
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from functools import wraps
+ from multiprocessing import Manager, Pool
+ from pathlib import Path
+ from threading import Event
+ from typing import ClassVar, Union
+ from zipfile import BadZipFile, ZipFile
+
+ import imagehash
+ import numpy as np
+
+ from PIL import Image, ImageFile, UnidentifiedImageError
+
+ ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow damaged images
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class ZipPath:
+     """
+     A general object describing a Path.
+
+     All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.
+
+     Attributes:
+         path (str): The path to the file.
+         subpath (str): The subpath in the zip if `path` is for a zip.
+     """
+     # TODO: At some point convert this to Path.
+     #   When I tried that last it introduced problems with inter-process communication
+     path: str
+     subpath: str
+
+     @property
+     def path_obj(self) -> Path:
+         """Get the path as a Path object"""
+         return Path(self.path)
+
+     @property
+     def is_gif(self) -> bool:
+         """Is this a path to an animated image?"""
+         movie_extensions = {'.gif', '.webp'}
+         return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
+             or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
+
+     @property
+     def is_zip(self) -> bool:
+         """Does this point to a file located in a zip?"""
+         return bool(self.subpath)
+
+     def absolute(self):
+         """Get the absolute version of this ZipPath"""
+         return ZipPath(str(self.path_obj.absolute()), self.subpath)
+
+
+ def _is_under(folder_abs: str, target: str | Path) -> bool:
+     """Return True if `target` is inside `folder_abs`."""
+     try:
+         Path(target).absolute().relative_to(Path(folder_abs).absolute())
+     except ValueError:
+         return False
+
+     return True
+
+
+ def phash_params_for_strength(strength: int) -> tuple[int, int]:
+     """
+     Convert a 0-10 strength to settings for imagehash
+
+     Returns:
+         tuple[int, int]: The hash size and the high frequency factor
+     """
+     # TODO: This sucks.
+     strength = max(0, min(10, strength))
+     if strength >= 10:
+         return 16, 4
+     if strength >= 8:
+         return 15, 4
+     if strength >= 7:
+         return 13, 4
+     if strength >= 6:
+         return 11, 4
+     if strength >= 5:
+         return 9, 4
+     if strength >= 4:
+         return 8, 4
+     if strength >= 3:
+         return 8, 3
+     if strength >= 2:
+         return 7, 3
+     return 6, 3
+
+
+ def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
+     """
+     Calculate hashes for a given file.
+
+     Args:
+         f (IO or str or Path): Either a file path to process, or an in-memory BytesIO object ready for reading.
+         strength (int): A number between 0 and 10 on the strength of the matches.
+         is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
+         exact_match (bool): Use exact SHA256 hashes?
+             If true, strength must be 10.
+             If false, perceptual hashes will be used, even with high strength.
+
+     Returns:
+         tuple[str, set]: The first element is the primary hash,
+             the second element is a set of any secondary hashes representing rotations, flips, etc...
+     """
+     if exact_match:
+         hasher = hashlib.sha256()
+         block_size = 65536
+         with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:  # noqa: PTH123
+             for block in iter(lambda: file.read(block_size), b""):
+                 hasher.update(block)
+         return hasher.hexdigest(), set()
+
+     hash_size, highfreq_factor = phash_params_for_strength(strength)
+     with Image.open(f) as im:
+         if is_gif:
+             initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+             # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
+             #   because some gifs have bad first frames consisting of nothing or only a single color...
+             #   To deal with that I'm looking for these bad hashes here, and if it's one, we advance to the next frame
+             #   and use THAT for imagehash instead.
+             #   The ones we need to be on the lookout for are:
+             #       1. The hash is all 1111...
+             #       2. The hash is all 0000...
+             #       3. The hash is of the form 100000...
+             # TODO: This is simply not good enough. I'm still getting bad matches for gifs, though they are extremely rare
+             val = initial_hash.hash[0][0]
+             while all(all(x == val for x in r) for r in initial_hash.hash) \
+                     or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
+                            for r_i, r in enumerate(initial_hash.hash)):
+                 try:
+                     im.seek(im.tell() + 1)
+                 except EOFError:  # noqa: PERF203
+                     break
+                 else:
+                     initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+                     val = initial_hash.hash[0][0]
+
+             # For GIFs we'll look for mirrored versions but that's it
+             flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+             extras = (flipped_h_image, )
+         else:
+             initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
+
+             flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+             flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+             extras = (im.rotate(90), im.rotate(180), im.rotate(270),
+                       flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
+                       flipped_h_image.rotate(270),
+                       flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
+                       flipped_v_image.rotate(270))
+
+     return str(initial_hash), {
+         str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
+     }
+
+
+ def thread_error_handler(func):
+     """An error handler for the thread to return information about where the error occurred"""
+
+     @wraps(func)
+     def wrapper(path, *args, **kwargs):  # noqa: ANN202
+         try:
+             return func(path, *args, **kwargs)
+         except Exception as e:
+             e.input_path = path
+             raise
+
+     return wrapper
+
+
+ @thread_error_handler
+ def _process_image(
+         path: str | Path,
+         supported_extensions: set | None = None,
+         strength: int = 5,
+         *,
+         exact_match: bool = False,
+ ) -> tuple[Path, tuple | dict[str, tuple]]:
+     """Get the hashes for a given path. Multiprocessing compatible."""
+     path = Path(path)
+     if path.suffix.lower() != '.zip':
+         return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
+                                       strength=strength, exact_match=exact_match)
+
+     if not supported_extensions:
+         supported_extensions = ImageMatcher.SUPPORTED_EXTS
+
+     results = {}
+     with ZipFile(path) as zf:
+         for f in zf.filelist:
+             f_ext = f.filename[-4:].lower()
+             if f_ext not in supported_extensions:
+                 continue
+
+             if f_ext == '.zip':
+                 logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
+                 continue
+
+             try:
+                 with zf.open(f) as zipped_file:
+                     results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in {".gif", ".webp"},
+                                                            strength=strength, exact_match=exact_match)
+             except BadZipFile as e:
+                 logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
+             except UnidentifiedImageError:
+                 logger.warning("Could not identify image %s in %s", f.filename, path)
+
+     return path, results
+
+
+ @dataclass
+ class ImageMatch:
+     """A match data structure containing the matches and where this match lies in the match list"""
+     match_i: int | None = field(default=None)
+     matches: list[ZipPath] = field(default_factory=list)
+
+
+ # region Events
+ @dataclass(frozen=True)
+ class NewGroup:
+     """A new group event"""
+     group: "ImageMatch"
+
+
+ @dataclass(frozen=True)
+ class NewMatch:
+     """A new match event"""
+     group: "ImageMatch"
+     path: ZipPath
+
+
+ @dataclass(frozen=True)
+ class Finished:
+     """A finished event"""
+
+
+ MatcherEvent = Union[NewGroup, NewMatch, Finished]
+ # endregion
+
+
+ class ImageMatcher:
+     """
+     An image matching SDK
+
+     Args:
+         strength (int): The 0-10 strength to use for matching. Defaults to 5.
+         exact_match (bool): Should SHA-256 hashes be used? If False, the default, perceptual hashes will be used.
+             If True, strength must be 10.
+         processes (int): The number of processes to use. Defaults to None.
+         extensions (set): The extensions to process. Optional.
+     """
+     SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
+
+     def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
+                  *, exact_match: bool = False):
+         if not (0 <= strength <= 10):
+             raise ValueError("Strength must be between 0 and 10!")
+
+         self.extensions = extensions or self.SUPPORTED_EXTS
+
+         self.strength = strength
+         self.exact_match = exact_match
+         self.processes = processes
+
+         self.found_images = 0
+         self.processed_images = 0
+         self.duplicate_images = 0
+         self.matches = []
+
+         m = Manager()
+         self.events = m.Queue()  # Events to go to higher level users
+         self._new_paths = m.Queue()  # Inbound queue for new paths that are added while processing is running
+         self._removed_paths = set()  # Paths that have been removed from processing after processing has been started
+         self._ignored_files = set()  # Files which have been ignored and should be skipped from processing if re-run
+         self._processed_zips = {}  # Zips that have been successfully processed
+         self._hashes = defaultdict(ImageMatch)  # Hash -> Paths
+         self._reverse_hashes = {}  # Path -> Hash
+
+         # Pausing and finished signaling...
+         self._not_paused = Event()
+         self._not_paused.set()
+         self._finished = Event()
+         self._finished.set()
+
+     @property
+     def left_to_process(self):
+         """Files that are left to process"""
+         return self.found_images - self.processed_images
+
+     def add_path(self, path: str | Path):
+         """Add a path for processing"""
+         path = str(Path(path).absolute())
+         self._removed_paths.discard(path)
+         self._new_paths.put(path)
+
+     def remove_path(self, folder: str | Path):
+         """
+         Mark a folder to be skipped going forward, and remove already-indexed files
+         that live under it. Pauses briefly if not already paused to keep state sane.
+         """
+         # TODO: This works, but the biggest problem with it is that it will not remove any images which are still
+         #   queued up for processing in the ThreadPool... I'm not sure how to fix that yet.
+         folder = str(Path(folder).absolute())
+         paused = self.conditional_pause()
+         self._removed_paths.add(folder)
+
+         # Remove anything we've already seen under that folder
+         #   (iterate over a copy because remove() mutates structures)
+         to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
+         for p in to_remove:
+             self.remove(p)
+
+         to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
+         for p in to_remove_zips:
+             self._processed_zips.pop(p)
+
+         self.conditional_resume(paused)
+
+     def conditional_pause(self) -> bool:
+         """Pause if not already paused, and return whether we were already paused"""
+         _conditional_pause = self.is_paused()
+         if not _conditional_pause:
+             logger.debug('Performing conditional pause')
+             self.pause()
+
+         return _conditional_pause
+
+     def conditional_resume(self, was_paused: bool):  # noqa: FBT001
+         """Resume if we were not previously paused (from a call to `conditional_pause`)"""
+         if not was_paused and not self.is_finished():
+             logger.debug('Performing conditional resume')
+             self.resume()
+
+     def pause(self):
+         """Pause processing"""
+         logger.debug('Performing pause')
+         self._not_paused.clear()
+
+     def is_paused(self):
+         """Is processing paused?"""
+         return not self._not_paused.is_set()
+
+     def finish(self):
+         """Finish processing"""
+         logger.debug('Performing finished')
+         self._finished.set()
+
+     def is_finished(self):
+         """Is processing finished?"""
+         return self._finished.is_set()
+
+     def resume(self):
+         """Resume processing"""
+         logger.debug('Performing resume')
+         self._not_paused.set()
+
+     def running(self):
+         """Currently running and loading hashes?"""
+         return not self.is_paused() and (not self.is_finished() or self.left_to_process)
+
+     def remove(self, path):
+         """Remove a loaded path completely from the image matching system. Will not delete a file."""
+         # Pause things while we remove things...
+         logger.info('Removing %s from %s', path, self.__class__.__name__)
+         paused = self.conditional_pause()
+
+         hash_ = self._reverse_hashes.pop(path)
+         self._hashes[hash_].matches.remove(path)
+         if len(self._hashes[hash_].matches) == 1:
+             match_i = self._hashes[hash_].match_i
+             logger.debug('Unmatching match group %s', match_i)
+             self._hashes[hash_].match_i = None
+
+             del self.matches[match_i]
+             self.refresh_match_indexes(match_i)
+             self.duplicate_images -= 2
+
+         elif not self._hashes[hash_].matches:
+             logger.debug('Removing empty match group')
+             del self._hashes[hash_]
+
+         else:
+             logger.debug('Simple removal performed')
+             self.duplicate_images -= 1
+
+         self.processed_images -= 1
+         self.found_images -= 1
+         self.conditional_resume(paused)
+
+     def ignore(self, path):
+         """Remove a path and mark it ignored so it is skipped if processing is re-run"""
+         self.remove(path)
+
+         if path.path_obj.suffix.lower() != '.zip':
+             self._ignored_files.add(path.path)
+
+     def refresh_match_indexes(self, start=0):
+         """Update the match_i value for all the matches past a certain point"""
+         for match_i, match in enumerate(self.matches[start:], start=start):
+             match.match_i = match_i
+
+     def _process_image_callback(self, result):
+         """
+         Handle the result of hashing an image.
+
+         This needs to do quite a few things, including sanitizing the results,
+         actually checking if the hash matches an existing image,
+         adding the image and any matches to the backend data structures, notifying any listeners,
+         updating the found and processed image counts,
+         and verifying that this result's path wasn't removed after it was queued.
+
+         Args:
+             result: A tuple consisting of the path to the file, and the resultant hashes.
+                 If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
+                 the individual zip files will be sanitized and re-run through this callback.
+         """
+         # TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
+         #   Perhaps create a new queue/thread and queue up processing for zip results?
+         #   I think the major slow point is adding to the data structures and I'm not sure if more threads will help
+         # Check for paused or finished signals
+         self._not_paused.wait()
+         if self.is_finished():
+             return
+
+         # region Sanitize results
+         path: Path | str | ZipPath
+         path, hashes = result
+
+         if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
+             # This image was removed AFTER it was queued! So decrement the found images count and just leave...
+             self.found_images -= 1
+             return
+
+         if isinstance(hashes, dict):
+             self.found_images -= 1
+             subpaths = []
+             for sub_path, sub_hashes in hashes.items():
+                 self.found_images += 1
+                 subpaths.append(ZipPath(str(path), sub_path))
+                 self._process_image_callback((subpaths[-1], sub_hashes))
+             self._processed_zips[str(path)] = subpaths
+             return
+
+         initial_hash, extra_hashes = hashes
+         extra_hashes.add(initial_hash)
+         if not isinstance(path, ZipPath):
+             # From this point on, EVERYTHING should be a ZipPath
+             path = ZipPath(str(path), "")
+         # endregion
+
+         if path in self._reverse_hashes:
+             self.found_images -= 1
+             return
+
+         self.processed_images += 1
+
+         # From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
+         #   iterate over a list and do an `in` check for each hash
+         found_hashes = self._hashes.keys() & extra_hashes
+         if not found_hashes:
+             # This is a new image not matching any previous, so just add it to the hashmap and move on...
+             # Just use the initial orientation
+             hash_ = initial_hash
+             self._reverse_hashes[path] = hash_
+             self._hashes[hash_].matches.append(path)
+             return
+
+         # We have found a match!
+         hash_ = next(iter(found_hashes))
+         self._reverse_hashes[path] = hash_
+         self._hashes[hash_].matches.append(path)
+
+         if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
+             # This is a brand new match group!
+             self._hashes[hash_].match_i = len(self.matches)
+             self.matches.append(self._hashes[hash_])
+             self.duplicate_images += 2
+             self.events.put(NewGroup(self._hashes[hash_]))
+             logger.debug('New match group found: %s', self._hashes[hash_].matches)
+         else:
+             # Just another match for an existing group...
+             self.duplicate_images += 1
+             self.events.put(NewMatch(self._hashes[hash_], path))
+             logger.debug('New match found for group #%s: %s',
+                          self._hashes[hash_].match_i,
+                          self._hashes[hash_].matches)
+
+     def _process_image_error_callback(self, e):
+         """Temporary for testing"""
+         self.processed_images += 1
+         logger.error("%s: %s (input path %s)", type(e), e, e.input_path)
+
+     def _root_stream(self):
+         """Yield any paths queued for processing, then wait until processing is finished in case new paths arrive"""
+         while not self._new_paths.empty() or self.left_to_process:
+             if self._new_paths.empty():
+                 time.sleep(0.05)
+                 continue
+
+             yield self._new_paths.get_nowait()
+
+     def run(self, paths: list[str | Path]):
+         """Do the work of matching!"""
+         self._not_paused.set()
+         self._finished.clear()
+
+         for path in paths:
+             self.add_path(path)
+
+         with Pool(self.processes) as tp:
+             for path in self._root_stream():
+                 path = Path(path)
+                 if not path.is_dir():
+                     logger.warning('A path was entered that was not a directory: %s', path)
+                     continue
+
+                 path = str(path.absolute())
+                 if path in self._removed_paths:
+                     continue
+
+                 for root, dirs, files in os.walk(path):
+                     if self.is_finished():
+                         break
+
+                     dirs.sort()  # This actually works to ensure that os.walk goes in alphabetical order!
+                     root = Path(root)
+
+                     if any(_is_under(d, root) for d in self._removed_paths):
+                         continue
+
+                     for f in files:
+                         self._not_paused.wait()
+                         if self.is_finished():
+                             break
+
+                         f = root / f
+
+                         if f.suffix.lower() not in self.extensions:
+                             continue
+
+                         if any(_is_under(d, f) for d in self._removed_paths):
+                             continue
+
+                         if str(f) in self._ignored_files:
+                             continue
+
+                         if f.suffix.lower() == '.zip':
+                             if str(f.absolute()) in self._processed_zips:
+                                 continue
+                         elif ZipPath(str(f), "") in self._reverse_hashes:
+                             continue
+
+                         self.found_images += 1
+                         tp.apply_async(
+                             _process_image,
+                             args=(f, ),
+                             kwds={
+                                 'strength': self.strength,
+                                 'supported_extensions': self.extensions,
+                                 'exact_match': self.exact_match,
+                             },
+                             callback=self._process_image_callback,
+                             error_callback=self._process_image_error_callback,
+                         )
+
+             tp.close()
+
+             if not self.is_finished():
+                 tp.join()
+
+         if not self.is_finished():
+             self._finished.set()
+             self.events.put(Finished())
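
For orientation, here is a minimal consumer sketch of the module added above. It is not part of the release: the import path and the "photos" folder are assumptions (pixmatch's real consumer is the Qt GUI). It drives ImageMatcher.run in a background thread and drains the events queue, which is one way the NewGroup/NewMatch/Finished events defined above can be observed:

    # Hypothetical usage sketch; the import path is assumed, not confirmed by this diff.
    from threading import Thread

    from pixmatch import Finished, ImageMatcher, NewGroup, NewMatch

    matcher = ImageMatcher(strength=5)
    # run() blocks until matching finishes, so drive it from a worker thread
    Thread(target=matcher.run, args=(["photos"],), daemon=True).start()

    while True:
        event = matcher.events.get()  # blocks until the matcher emits an event
        if isinstance(event, Finished):
            break
        if isinstance(event, NewGroup):
            print("New duplicate group:", [m.path for m in event.group.matches])
        elif isinstance(event, NewMatch):
            print("Another duplicate found:", event.path.path)
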
@@ -8,10 +8,9 @@ from PySide6 import QtWidgets
 
  from pixmatch.gui import MainWindow
 
-
  if __name__ == "__main__":
      parser = argparse.ArgumentParser(
-         description="Process zero or more file paths."
+         description="Process zero or more file paths.",
      )
      parser.add_argument(
          "folders",
@@ -41,7 +40,7 @@ if __name__ == "__main__":
          QToolBar { spacing: 8px; }
          QLabel#GroupTitle { padding: 4px 0; }
          QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
-     """
+     """,
      )
      w = MainWindow(args.folders)
      w.show()
@@ -1,12 +1,11 @@
  from typing import Iterable
 
-
  def human_bytes(
      n: int,
      *,
      base: int = 1000,
      decimals: int = 0,
-     units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb")
+     units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
  ) -> str:
      """
      Convert a byte count to a human-readable string.
@@ -19,6 +18,9 @@ def human_bytes(
 
      Returns:
          A compact string like '66kb', '1mb', '1.5gb', or '999b'.
+
+     Raises:
+         ValueError: If n < 0
      """
      if n < 0:
          raise ValueError("Byte size cannot be negative")
@@ -31,6 +33,6 @@ def human_bytes(
 
      if i == 0 or decimals == 0:
          # Bytes or integer formatting requested
-         return f"{int(n if i else n)}{tuple(units)[i]}"
+         return f"{int(n)}{tuple(units)[i]}"
 
      return f"{n:.{decimals}f}{tuple(units)[i]}".rstrip("0").rstrip(".")
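
As a quick sanity check on the edited return path, a few expected values derived by hand from the code above (the divide-by-base loop that sets i and n lives in unchanged lines omitted from this hunk, so the scaled values are inferred; base defaults to 1000):

    human_bytes(999)               # -> '999b'  (i == 0, integer formatting)
    human_bytes(1500)              # -> '1kb'   (decimals=0 truncates through int())
    human_bytes(1500, decimals=1)  # -> '1.5kb'
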
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pixmatch
- Version: 0.0.4
+ Version: 0.0.5
  Summary: A modern VisiPics replacement.
  Author-email: Ryan Heard <ryanwheard@gmail.com>
  Project-URL: Repository, https://github.com/rheard/pixmatch
@@ -5,7 +5,7 @@ requires = [
 
  [project]
  name = "pixmatch"
- version = "0.0.4"
+ version = "0.0.5"
  #dynamic = ["license"]
  authors = [
      { name="Ryan Heard", email="ryanwheard@gmail.com" },
@@ -87,16 +87,23 @@ extend-select = [
      "YTT", # flake8-2020
  ]
  ignore=[
+     "ANN001", # Missing type annotation for function argument X
+     "ANN002", # Missing type annotation for *args
+     "ANN003", # Missing type annotation for **kwargs
      "ANN201", # Missing return type
      "D100", # Missing docstring in public module
      "D104", # Missing docstring in public package
      "D105", # Missing docstring in magic method
+     "D107", # Missing docstring in __init__ method
      "D202", # Blank line after function docstring
+     "D205", # 1 blank line required between summary line and description
      "D212", # Multi-line docstring summary should start at the first line
      "D415", # Closing punctuation on docstrings
      "PT013", # Incorrect import of pytest
      "RUF023", # __slots__ is not sorted
+     "RUF052", # Local dummy variable X is accessed
      "TRY003", # Avoid specifying long messages outside the exception class
+     "UP035", # Import from `collections.abc` instead: `Iterable`
  ]
 
  [tool.ruff.lint.pydocstyle]
@@ -1,446 +0,0 @@
- import hashlib
- import logging
- import os
- import time
-
- from collections import defaultdict
- from dataclasses import dataclass, field
- from multiprocessing import Pool, Manager
- from pathlib import Path
- from threading import Event
- from typing import Union
- from zipfile import ZipFile
-
- import imagehash
- import numpy as np
-
- from PIL import Image
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass(frozen=True)
- class ZipPath:
-     path: str
-     subpath: str
-
-     @property
-     def path_obj(self):
-         return Path(self.path)
-
-     @property
-     def is_gif(self) -> bool:
-         movie_extensions = {'.gif', '.webp'}
-         return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
-             or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
-
-     def absolute(self):
-         return ZipPath(str(self.path_obj.absolute()), self.subpath)
-
-
- def _is_under(folder_abs: str, target: str | Path) -> bool:
-     """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
-     try:
-         Path(target).absolute().relative_to(Path(folder_abs).absolute())
-         return True
-     except ValueError:
-         return False
-
-
- def phash_params_for_strength(strength: int) -> tuple[int, int]:
-     # TODO: This sucks.
-     strength = max(0, min(10, strength))
-     if strength >= 10:
-         return 16, 4  # 256-bit hash, strict
-     elif strength >= 8:
-         return 15, 4
-     elif strength >= 7:
-         return 13, 4
-     elif strength >= 6:
-         return 11, 4
-     elif strength >= 5:
-         return 9, 4
-     elif strength >= 4:
-         return 8, 4
-     elif strength >= 3:
-         return 8, 3
-     elif strength >= 2:
-         return 7, 3
-     else:
-         return 6, 3
-
-
- def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
-     """
-     Calculate hashes for a given file.
-
-     Args:
-         f (IO or str or Path): Either a file path to process, or a in-memory BytesIO object ready for reading.
-         is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
-         strength (int): A number between 0 and 10 on the strength of the matches.
-         exact_match (bool): Use exact SHA256 hahes?
-             If true, strength must be 10.
-             If false, perceptual hashes will be used, even with high strength.
-
-     Returns:
-         list: The found hashes.
-     """
-     if exact_match:
-         hasher = hashlib.sha256()
-         block_size = 65536
-         with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
-             for block in iter(lambda: file.read(block_size), b""):
-                 hasher.update(block)
-         return [hasher.hexdigest()]
-
-     hash_size, highfreq_factor = phash_params_for_strength(strength)
-     with (Image.open(f) as im):
-         if is_gif:
-             initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
-             # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
-             # because some gifs have bad first frames consisting of nothing or only a single color...
-             # To deal with that I'm looking for these bad hashes here and if its one, we advance to the next frame
-             # and use THAT for imagehash instead.
-             # The ones we need to be on the lookout for are:
-             # 1. The hash is all 1111...
-             # 2. The hash is all 0000...
-             # 3. The hash is of the form 100000...
-             # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
-             val = initial_hash.hash[0][0]
-             while all(all(x == val for x in r) for r in initial_hash.hash) \
-                     or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
-                            for r_i, r in enumerate(initial_hash.hash)):
-                 try:
-                     im.seek(im.tell() + 1)
-                 except EOFError:
-                     break
-                 else:
-                     initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
-                     val = initial_hash.hash[0][0]
-
-             # For GIFs we'll look for mirrored versions but thats it
-             flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-             return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]
-
-         flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-         flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
-         images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
-                   flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
-                   flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
-         return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
-
-
- def _process_image(path: str | Path, strength=5, exact_match=False):
-     path = Path(path)
-     if path.suffix.lower() != '.zip':
-         return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
-                                       strength=strength, exact_match=exact_match)
-
-     results = dict()
-     with ZipFile(path) as zf:
-         for f in zf.filelist:
-             with zf.open(f) as zipped_file:
-                 results[f.filename] = calculate_hashes(zipped_file, is_gif=f.filename[-4:].lower() in {".gif", ".webp"},
-                                                        strength=strength, exact_match=exact_match)
-
-     return path, results
-
-
- @dataclass
- class ImageMatch:
-     match_i: int | None = field(default=None)
-     matches: list[ZipPath] = field(default_factory=list)
-
-
- @dataclass(frozen=True)
- class NewGroup:
-     group: "ImageMatch"  # forward-ref to your class
-
-
- @dataclass(frozen=True)
- class NewMatch:
-     group: "ImageMatch"
-     path: ZipPath
-
-
- @dataclass(frozen=True)
- class Finished:
-     pass
-
-
- MatcherEvent = Union[NewGroup, NewMatch, Finished]
-
-
- # TODO: FINISHED signal?
- class ImageMatcher:
-     SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
-
-     def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
-                  extensions: set | None = None):
-         if not (0 <= strength <= 10):
-             raise ValueError("Strength must be between 0 and 10!")
-
-         self.extensions = extensions or self.SUPPORTED_EXTS
-
-         self.strength = strength
-         self.exact_match = exact_match
-         self.processes = processes
-         self.found_images = 0
-         self.processed_images = 0
-         self.duplicate_images = 0
-
-         m = Manager()
-         self.events = m.Queue()
-         self._new_paths = m.Queue()
-         self._removed_paths = set()
-         self._processed_paths = set()
-         self._hashes = defaultdict(ImageMatch)
-         self._reverse_hashes = dict()
-
-         self._not_paused = Event()
-         self._not_paused.set()
-         self._finished = Event()
-         self._finished.set()
-
-         self.matches = []
-
-     def add_path(self, path: str | Path):
-         path = str(Path(path).absolute())
-         self._removed_paths.discard(path)
-         self._new_paths.put(path)
-
-     def remove_path(self, folder: str | Path) -> None:
-         """
-         Mark a folder to be skipped going forward, and remove already-indexed files
-         that live under it. Pauses briefly if not already paused to keep state sane.
-         """
-         folder = str(Path(folder).absolute())
-         paused = self.conditional_pause()
-         self._removed_paths.add(folder)
-         self._processed_paths.discard(folder)
-
-         # Remove anything we've already seen under that folder
-         # (iterate over a copy because remove() mutates structures)
-         to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
-         for p in to_remove:
-             self.remove(p)
-
-         self.conditional_resume(paused)
-
-     @property
-     def left_to_process(self):
-         return self.found_images - self.processed_images
-
-     def pause(self):
-         logger.debug('Performing pause')
-         self._not_paused.clear()
-
-     def conditional_pause(self):
-         _conditional_pause = self.is_paused()
-         if not _conditional_pause:
-             logger.debug('Performing conditional pause')
-             self.pause()
-
-         return _conditional_pause
-
-     def conditional_resume(self, was_paused):
-         if not was_paused and not self.is_finished():
-             logger.debug('Performing conditional resume')
-             self.resume()
-
-     def is_paused(self):
-         return not self._not_paused.is_set()
-
-     def finish(self):
-         logger.debug('Performing finished')
-         self._finished.set()
-
-     def is_finished(self):
-         return self._finished.is_set()
-
-     def resume(self):
-         logger.debug('Performing resume')
-         self._not_paused.set()
-
-     def running(self):
-         return not self.is_paused() and (not self.is_finished() or self.left_to_process)
-
-     def remove(self, path):
-         # Pause things while we remove things...
-         logger.info('Removing %s from %s', path, self.__class__.__name__)
-         paused = self.conditional_pause()
-
-         hash = self._reverse_hashes.pop(path)
-         self._hashes[hash].matches.remove(path)
-         if len(self._hashes[hash].matches) == 1:
-             match_i = self._hashes[hash].match_i
-             logger.debug('Unmatching match group %s', match_i)
-             self._hashes[hash].match_i = None
-
-             del self.matches[match_i]
-             self.refresh_match_indexes(match_i)
-             self.duplicate_images -= 2
-
-         elif not self._hashes[hash].matches:
-             logger.debug('Removing empty match group')
-             del self._hashes[hash]
-
-         else:
-             logger.debug('Simple removal performed')
-             self.duplicate_images -= 1
-
-         self.processed_images -= 1
-         self.found_images -= 1
-         self.conditional_resume(paused)
-
-     def refresh_match_indexes(self, start=0):
-         for match_i, match in enumerate(self.matches[start:], start=start):
-             match.match_i = match_i
-
-     def _process_image_callback(self, result):
-         self._not_paused.wait()
-         if self.is_finished():
-             return
-
-         path: Path | str | ZipPath
-         path, hashes = result
-
-         if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
-             self.found_images -= 1
-             return
-
-         if isinstance(hashes, dict):
-             self.found_images -= 1
-             for sub_path, sub_hashes in hashes.items():
-                 self.found_images += 1
-                 self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
-             return
-
-         if not isinstance(path, ZipPath):
-             path = ZipPath(str(path), "")
-
-         if path in self._reverse_hashes:
-             self.found_images -= 1
-             return
-
-         self.processed_images += 1
-         for hash_ in hashes:
-             if hash_ not in self._hashes:
-                 continue
-
-             self._reverse_hashes[path] = hash_
-
-             # This appears to be a new match!
-             for match in self._hashes[hash_].matches:
-                 if path.absolute() == match.absolute():
-                     # This appears to be a duplicate PATH...
-                     logger.warning('Duplicate files entered! %s, %s', path, match)
-                     return
-
-             self._hashes[hash_].matches.append(path)
-             if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
-                 # This is a brand new match group!
-                 self._hashes[hash_].match_i = len(self.matches)
-                 self.matches.append(self._hashes[hash_])
-                 self.duplicate_images += 2
-                 self.events.put(NewGroup(self._hashes[hash_]))
-                 logger.debug('New match group found: %s', self._hashes[hash_].matches)
-             else:
-                 # Just another match for an existing group...
-                 self.duplicate_images += 1
-                 self.events.put(NewMatch(self._hashes[hash_], path))
-                 logger.debug('New match found for group #%s: %s',
-                              self._hashes[hash_].match_i,
-                              self._hashes[hash_].matches)
-
-             break
-         else:
-             # This is a new hash, so just add it to the hashmap and move on...
-             # Just use the initial orientation
-             hash_ = hashes[0]
-             self._reverse_hashes[path] = hash_
-             self._hashes[hash_].matches.append(path)
-             return
-
-     def _process_image_error_callback(self, e):
-         self.processed_images += 1
-         print(str(e))
-
-     def _root_stream(self):
-         # Yield any paths that come up for processing, then wait until processing is finished for any new paths
-         while not self._new_paths.empty() or self.left_to_process:
-             if self._new_paths.empty():
-                 time.sleep(0.05)
-                 continue
-
-             yield self._new_paths.get_nowait()
-
-     def run(self, paths: list[str | Path]):
-         # TODO: Verify none of the paths overlap
-         # TODO: Verify none of the dirs have been deleted after we started
-
-         self._not_paused.set()
-         self._finished.clear()
-
-         for path in paths:
-             self.add_path(path)
-
-         with Pool(self.processes) as tp:
-             for path in self._root_stream():
-                 path = Path(path)
-                 if not path.is_dir():
-                     logger.warning('A path was entered that was not a directory : %s', path)
-                     continue
-
-                 path = str(path.absolute())
-                 if path in self._removed_paths or path in self._processed_paths:
-                     continue
-
-                 for root, dirs, files in os.walk(path):
-                     if self.is_finished():
-                         break
-
-                     root = Path(root)
-
-                     if any(_is_under(d, root) for d in self._removed_paths):
-                         continue
-
-                     for f in files:
-                         self._not_paused.wait()
-                         if self.is_finished():
-                             break
-
-                         f = root / f
-
-                         if f.suffix.lower() not in self.extensions:
-                             continue
-
-                         if any(_is_under(d, f) for d in self._removed_paths):
-                             continue
-
-                         # TODO: This sucks (for zips at least), but I can't iterate over the dict while its changing...
-                         if ZipPath(str(f), "") in self._reverse_hashes:
-                             continue
-
-                         self.found_images += 1
-                         tp.apply_async(
-                             _process_image,
-                             args=(f, ),
-                             kwds={
-                                 'strength': self.strength,
-                                 'exact_match': self.exact_match,
-                             },
-                             callback=self._process_image_callback,
-                             error_callback=self._process_image_error_callback,
-                         )
-
-                 self._processed_paths.add(path)
-
-             tp.close()
-
-             if not self.is_finished():
-                 tp.join()
-
-         if not self.is_finished():
-             self._finished.set()
-             self.events.put(Finished())