pixmatch 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pixmatch/__init__.py ADDED
@@ -0,0 +1,585 @@
1
+ import hashlib
2
+ import logging
3
+ import os
4
+ import time
5
+
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass, field
8
+ from functools import wraps
9
+ from multiprocessing import Manager, Pool
10
+ from pathlib import Path
11
+ from threading import Event
12
+ from typing import ClassVar, Union
13
+ from zipfile import BadZipFile, ZipFile
14
+
15
+ import imagehash
16
+ import numpy as np
17
+
18
+ from PIL import Image, ImageFile, UnidentifiedImageError
19
+
20
+ ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow damaged images
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass(frozen=True)
class ZipPath:
    """
    A general object describing a Path.

    All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.

    Attributes:
        path (str): The path to the file.
        subpath (str): The subpath in the zip if `path` is for a zip.
    """
    # TODO: At some point convert this to Path.
    #   When I tried that last it introduced problems with inter-process communication
    path: str
    subpath: str

    @property
    def path_obj(self) -> Path:
        """Get the path as a Path object"""
        return Path(self.path)

    @property
    def is_gif(self) -> bool:
        """Is this a path to an animated image?"""
        movie_extensions = {'.gif', '.webp'}
        # When the file lives inside a zip, the member name decides the type; otherwise the file itself does.
        # The real suffix is used (rather than the last 4 characters) so 5-character
        # extensions such as ".webp" are recognised for zip members as well.
        name = self.subpath or self.path
        return Path(name).suffix.lower() in movie_extensions

    @property
    def is_zip(self) -> bool:
        """Does this point to a file located in a zip?"""
        return bool(self.subpath)

    def absolute(self):
        """Get the absolute version of this ZipPath"""
        return ZipPath(str(self.path_obj.absolute()), self.subpath)
61
+
62
+
63
+ def _is_under(folder_abs: str, target: str | Path) -> bool:
64
+ """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
65
+ try:
66
+ Path(target).absolute().relative_to(Path(folder_abs).absolute())
67
+ except ValueError:
68
+ return False
69
+
70
+ return True
71
+
72
+
73
def phash_params_for_strength(strength: int) -> tuple[int, int]:
    """
    Map a 0-10 matching strength onto imagehash settings.

    Values outside of 0-10 are clamped into that range first.

    Returns:
        tuple<int, int>: The hash size and the high frequency factor
    """
    # TODO: This sucks.
    clamped = max(0, min(10, strength))

    # (minimum strength, (hash size, high frequency factor)), ordered strongest first
    tiers = (
        (10, (16, 4)),
        (8, (15, 4)),
        (7, (13, 4)),
        (6, (11, 4)),
        (5, (9, 4)),
        (4, (8, 4)),
        (3, (8, 3)),
        (2, (7, 3)),
    )
    for minimum, params in tiers:
        if clamped >= minimum:
            return params

    # Anything weaker than 2 gets the loosest settings
    return 6, 3
99
+
100
+
101
def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
    """
    Calculate hashes for a given file.

    Args:
        f (IO or str or Path): Either a file path to process, or a in-memory BytesIO object ready for reading.
        strength (int): A number between 0 and 10 on the strength of the matches.
            Only used for perceptual hashes; it is ignored when `exact_match` is set.
        is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
        exact_match (bool): Use exact SHA256 hashes?
            If true, strength must be 10 (this is not checked here).
            If false, perceptual hashes will be used, even with high strength.

    Returns:
        tuple[str, set]: The first element is the primary hash,
            the second element are any secondary hashes representing rotations, flips, etc...
            The set is empty for exact (SHA-256) hashes.
    """
    if exact_match:
        hasher = hashlib.sha256()
        block_size = 65536
        # NOTE: when `f` is an already-open file object it is used as the context manager itself,
        #   so it will be closed when this block exits.
        with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:  # noqa: PTH123
            # Read in fixed-size chunks until read() returns an empty bytes object
            for block in iter(lambda: file.read(block_size), b""):
                hasher.update(block)
        return hasher.hexdigest(), set()

    hash_size, highfreq_factor = phash_params_for_strength(strength)
    with Image.open(f) as im:
        if is_gif:
            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
            # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
            #   because some gifs have bad first frames consisting of nothing or only a single color...
            # To deal with that I'm looking for these bad hashes here and if its one, we advance to the next frame
            #   and use THAT for imagehash instead.
            # The ones we need to be on the lookout for are:
            #   1. The hash is all 1111...
            #   2. The hash is all 0000...
            #   3. The hash is of the form 100000...
            # TODO: This is simply not good enough. I'm still getting bad matches for gifs, tho they are extremely rare
            val = initial_hash.hash[0][0]
            # First clause: every bit equals the top-left bit (cases 1 and 2).
            # Second clause: every bit is False, except that the top-left bit may be anything (case 3).
            while all(all(x == val for x in r) for r in initial_hash.hash) \
                    or all(all(x == np.False_ or (x_i == 0 and r_i == 0) for x_i, x in enumerate(r))
                           for r_i, r in enumerate(initial_hash.hash)):
                try:
                    im.seek(im.tell() + 1)
                except EOFError:  # noqa: PERF203
                    # Ran out of frames; keep the last hash that was computed
                    break
                else:
                    initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
                    val = initial_hash.hash[0][0]

            # For GIFs we'll look for mirrored versions but thats it
            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            extras = (flipped_h_image, )
        else:
            initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)

            # Still images are additionally hashed in every rotated and mirrored orientation
            flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
            extras = (im.rotate(90), im.rotate(180), im.rotate(270),
                      flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
                      flipped_h_image.rotate(270),
                      flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
                      flipped_v_image.rotate(270))

        return str(initial_hash), {
            str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
        }
167
+
168
+
169
def thread_error_handler(func):
    """Decorate a worker so that any exception it raises carries the path that was being processed"""

    @wraps(func)
    def wrapper(path, *args, **kwargs):  # noqa: ANN202
        try:
            result = func(path, *args, **kwargs)
        except Exception as exc:
            # Tag the exception with the offending input, then let it propagate untouched
            exc.input_path = path
            raise
        return result

    return wrapper
181
+
182
+
183
@thread_error_handler
def _process_image(
    path: str | Path,
    supported_extensions: set | None = None,
    strength: int = 5,
    *,
    exact_match: bool = False,
) -> tuple[Path, tuple | dict[str, tuple]]:
    """
    Get the hashes for a given path. Is multiprocessing compatible

    Args:
        path: The file to hash. If it has a `.zip` suffix, its members are hashed instead.
        supported_extensions: The extensions to process inside a zip. Falls back to `ImageMatcher.SUPPORTED_EXTS`.
        strength: The 0-10 strength handed to `calculate_hashes`.
        exact_match: Passed through to `calculate_hashes`.

    Returns:
        The path, plus either the `calculate_hashes` result for a plain file,
            or for a zip a dict mapping each processed member's filename to its `calculate_hashes` result.
    """
    animated_extensions = {".gif", ".webp"}

    path = Path(path)
    if path.suffix.lower() != '.zip':
        return path, calculate_hashes(path, is_gif=path.suffix.lower() in animated_extensions,
                                      strength=strength, exact_match=exact_match)

    if not supported_extensions:
        supported_extensions = ImageMatcher.SUPPORTED_EXTS

    results = {}
    with ZipFile(path) as zf:
        for f in zf.filelist:
            if f.is_dir():
                # Directory entries hold no image data, whatever their name looks like
                continue

            # Use the real suffix: slicing the last 4 characters can never match
            #   5-character extensions such as ".jpeg", ".webp" or ".tiff".
            f_ext = Path(f.filename).suffix.lower()
            if f_ext not in supported_extensions:
                continue

            if f_ext == '.zip':
                logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
                continue

            try:
                with zf.open(f) as zipped_file:
                    results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in animated_extensions,
                                                           strength=strength, exact_match=exact_match)
            except BadZipFile as e:
                logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
            except UnidentifiedImageError:
                logger.warning("Could not identify image %s in %s", f.filename, path)

    return path, results
221
+
222
+
223
@dataclass
class ImageMatch:
    """A group of paths sharing a hash, along with the group's position in the match list (if it is in it)"""
    # Index of this group in the match list, None while it is not listed there
    match_i: int | None = None
    # Every path that hashed into this group
    matches: list[ZipPath] = field(default_factory=list)
228
+
229
+
230
# region Events
@dataclass(frozen=True)
class NewGroup:
    """A new group event, carrying the match group that was just created"""
    group: "ImageMatch"


@dataclass(frozen=True)
class NewMatch:
    """A new match event, carrying the existing group and the path that was added to it"""
    group: "ImageMatch"
    path: ZipPath


@dataclass(frozen=True)
class Finished:
    """A finished event"""


# Any of the event types that can be placed on the events queue
MatcherEvent = Union[NewGroup, NewMatch, Finished]
# endregion
251
+
252
+
253
+ class ImageMatcher:
254
+ """
255
+ An image matching SDK
256
+
257
+ Args:
258
+ strength (int): The 0-10 strength to use for matching. Defaults to 5.
259
+ exact_match (bool): Should use SHA-256 hashes? If False, the default, will use perceptual hashes.
260
+ If True, strength must be 10.
261
+ processes (int): The number of processes to use. Defaults to None.
262
+ extensions (set): The extensions to process. Optional.
263
+ """
264
+ SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
265
+
266
+ def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
267
+ *, exact_match: bool = False):
268
+ if not (0 <= strength <= 10):
269
+ raise ValueError("Strength must be between 0 and 10!")
270
+
271
+ self.extensions = extensions or self.SUPPORTED_EXTS
272
+
273
+ self.strength = strength
274
+ self.exact_match = exact_match
275
+ self.processes = processes
276
+
277
+ self.found_images = 0
278
+ self.processed_images = 0
279
+ self.duplicate_images = 0
280
+ self.matches = []
281
+
282
+ m = Manager()
283
+ self.events = m.Queue() # Events to go to higher level users
284
+ self._new_paths = m.Queue() # Inbound queue for new paths that are added while processing is running
285
+ self._removed_paths = set() # Paths that have been removed from processing after processing has been started
286
+ self._ignored_files = set() # Files which have been ignored and should be skipped from processing if re-ran
287
+ self._processed_zips = {} # Zips that have been successfully processed
288
+ self._hashes = defaultdict(ImageMatch) # Hash -> Paths
289
+ self._reverse_hashes = {} # Path -> Hash
290
+
291
+ # Pausing and finished signaling...
292
+ self._not_paused = Event()
293
+ self._not_paused.set()
294
+ self._finished = Event()
295
+ self._finished.set()
296
+
297
+ @property
298
+ def left_to_process(self):
299
+ """Files that are left to process"""
300
+ return self.found_images - self.processed_images
301
+
302
+ def add_path(self, path: str | Path):
303
+ """Add a path for processing"""
304
+ path = str(Path(path).absolute())
305
+ self._removed_paths.discard(path)
306
+ self._new_paths.put(path)
307
+
308
+ def remove_path(self, folder: str | Path):
309
+ """
310
+ Mark a folder to be skipped going forward, and remove already-indexed files
311
+ that live under it. Pauses briefly if not already paused to keep state sane.
312
+ """
313
+ # TODO: This works but the biggest problem with it is that it will not remove any images which are still
314
+ # queue'd up for processing in the ThreadPool... I'm not sure how to fix that yet.
315
+ folder = str(Path(folder).absolute())
316
+ paused = self.conditional_pause()
317
+ self._removed_paths.add(folder)
318
+
319
+ # Remove anything we've already seen under that folder
320
+ # (iterate over a copy because remove() mutates structures)
321
+ to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
322
+ for p in to_remove:
323
+ self.remove(p)
324
+
325
+ to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
326
+ for p in to_remove_zips:
327
+ self._processed_zips.pop(p)
328
+
329
+ self.conditional_resume(paused)
330
+
331
+ def conditional_pause(self) -> bool:
332
+ """Pause if not paused and return if was paused"""
333
+ _conditional_pause = self.is_paused()
334
+ if not _conditional_pause:
335
+ logger.debug('Performing conditional pause')
336
+ self.pause()
337
+
338
+ return _conditional_pause
339
+
340
+ def conditional_resume(self, was_paused: bool): # noqa: FBT001
341
+ """Resume if not paused previous (from call to `conditional_pause`)"""
342
+ if not was_paused and not self.is_finished():
343
+ logger.debug('Performing conditional resume')
344
+ self.resume()
345
+
346
+ def pause(self):
347
+ """Pause processing"""
348
+ logger.debug('Performing pause')
349
+ self._not_paused.clear()
350
+
351
+ def is_paused(self):
352
+ """Is processing paused"""
353
+ return not self._not_paused.is_set()
354
+
355
+ def finish(self):
356
+ """Finish processing"""
357
+ logger.debug('Performing finished')
358
+ self._finished.set()
359
+
360
+ def is_finished(self):
361
+ """Is processing finished"""
362
+ return self._finished.is_set()
363
+
364
+ def resume(self):
365
+ """Resume processing"""
366
+ logger.debug('Performing resume')
367
+ self._not_paused.set()
368
+
369
+ def running(self):
370
+ """Currently running and loading hashes?"""
371
+ return not self.is_paused() and (not self.is_finished() or self.left_to_process)
372
+
373
+ def remove(self, path):
374
+ """Remove a loaded path completely from the image matching system. Will not delete a file."""
375
+ # Pause things while we remove things...
376
+ logger.info('Removing %s from %s', path, self.__class__.__name__)
377
+ paused = self.conditional_pause()
378
+
379
+ hash_ = self._reverse_hashes.pop(path)
380
+ self._hashes[hash_].matches.remove(path)
381
+ if len(self._hashes[hash_].matches) == 1:
382
+ match_i = self._hashes[hash_].match_i
383
+ logger.debug('Unmatching match group %s', match_i)
384
+ self._hashes[hash_].match_i = None
385
+
386
+ del self.matches[match_i]
387
+ self.refresh_match_indexes(match_i)
388
+ self.duplicate_images -= 2
389
+
390
+ elif not self._hashes[hash_].matches:
391
+ logger.debug('Removing empty match group')
392
+ del self._hashes[hash_]
393
+
394
+ else:
395
+ logger.debug('Simple removal performed')
396
+ self.duplicate_images -= 1
397
+
398
+ self.processed_images -= 1
399
+ self.found_images -= 1
400
+ self.conditional_resume(paused)
401
+
402
+ def ignore(self, path):
403
+ """Remove a path from the image matching service"""
404
+ self.remove(path)
405
+
406
+ if path.path_obj.suffix.lower() != '.zip':
407
+ self._ignored_files.add(path.path)
408
+
409
+ def refresh_match_indexes(self, start=0):
410
+ """Update the match_i value for all the matches passed a certain point"""
411
+ for match_i, match in enumerate(self.matches[start:], start=start):
412
+ match.match_i = match_i
413
+
414
+ def _process_image_callback(self, result):
415
+ """
416
+ Handle the result of hashing an image.
417
+
418
+ This needs to do quite a few things including sanitizing the results,
419
+ actually checking if the hash matches an existing image,
420
+ adding the image and any matches to the backend data structures, notify any listeners,
421
+ update the found and processed image counts,
422
+ and verify that this result wasn't added as a removed path since it was queued.
423
+
424
+ Args:
425
+ result: A tuple consisting of the path to the file, and the resultant hashes.
426
+ If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
427
+ the individual zip files will sanitized and re-ran through this callback.
428
+ """
429
+ # TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
430
+ # Perhaps create a new queue/thread and queue up processing for zip results?
431
+ # I think the major slow point is adding to the data structures and I'm not sure if more threads will help
432
+ # Check for paused or finished signals
433
+ self._not_paused.wait()
434
+ if self.is_finished():
435
+ return
436
+
437
+ # region Sanitize results
438
+ path: Path | str | ZipPath
439
+ path, hashes = result
440
+
441
+ if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
442
+ # This image was removed AFTER it was queue'd! So decrement the found images count and just leave...
443
+ self.found_images -= 1
444
+ return
445
+
446
+ if isinstance(hashes, dict):
447
+ self.found_images -= 1
448
+ subpaths = []
449
+ for sub_path, sub_hashes in hashes.items():
450
+ self.found_images += 1
451
+ subpaths.append(ZipPath(str(path), sub_path))
452
+ self._process_image_callback((subpaths[-1], sub_hashes))
453
+ self._processed_zips[str(path)] = subpaths
454
+ return
455
+
456
+ initial_hash, extra_hashes = hashes
457
+ extra_hashes.add(initial_hash)
458
+ if not isinstance(path, ZipPath):
459
+ # From this point on, EVERYTHING should be a ZipPath
460
+ path = ZipPath(str(path), "")
461
+ # endregion
462
+
463
+ if path in self._reverse_hashes:
464
+ self.found_images -= 1
465
+ return
466
+
467
+ self.processed_images += 1
468
+
469
+ # From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
470
+ # iterate over a list and do an `is in` check for each hash
471
+ found_hashes = self._hashes.keys() & extra_hashes
472
+ if not found_hashes:
473
+ # This is a new image not matching any previous, so just add it to the hashmap and move on...
474
+ # Just use the initial orientation
475
+ hash_ = initial_hash
476
+ self._reverse_hashes[path] = hash_
477
+ self._hashes[hash_].matches.append(path)
478
+ return
479
+
480
+ # We have found a match!
481
+ hash_ = next(iter(found_hashes))
482
+ self._reverse_hashes[path] = hash_
483
+ self._hashes[hash_].matches.append(path)
484
+
485
+ if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
486
+ # This is a brand new match group!
487
+ self._hashes[hash_].match_i = len(self.matches)
488
+ self.matches.append(self._hashes[hash_])
489
+ self.duplicate_images += 2
490
+ self.events.put(NewGroup(self._hashes[hash_]))
491
+ logger.debug('New match group found: %s', self._hashes[hash_].matches)
492
+ else:
493
+ # Just another match for an existing group...
494
+ self.duplicate_images += 1
495
+ self.events.put(NewMatch(self._hashes[hash_], path))
496
+ logger.debug('New match found for group #%s: %s',
497
+ self._hashes[hash_].match_i,
498
+ self._hashes[hash_].matches)
499
+
500
+ def _process_image_error_callback(self, e):
501
+ """Temporary for testing"""
502
+ self.processed_images += 1
503
+ logger.error("%s: %s (input path %s)", type(e).__name__, e, e.input_path)
504
+
505
+ def _root_stream(self):
506
+ """This is to yield any paths for processing, then wait until processing is finished for any new paths"""
507
+ while not self._new_paths.empty() or self.left_to_process:
508
+ if self._new_paths.empty():
509
+ time.sleep(0.05)
510
+ continue
511
+
512
+ yield self._new_paths.get_nowait()
513
+
514
+ def run(self, paths: list[str | Path]):
515
+ """Do the work of matching!"""
516
+ self._not_paused.set()
517
+ self._finished.clear()
518
+
519
+ for path in paths:
520
+ self.add_path(path)
521
+
522
+ with Pool(self.processes) as tp:
523
+ for path in self._root_stream():
524
+ path = Path(path)
525
+ if not path.is_dir():
526
+ logger.warning('A path was entered that was not a directory : %s', path)
527
+ continue
528
+
529
+ path = str(path.absolute())
530
+ if path in self._removed_paths:
531
+ continue
532
+
533
+ for root, dirs, files in os.walk(path):
534
+ if self.is_finished():
535
+ break
536
+
537
+ dirs.sort() # This actually works to ensure that os.walk goes in alphabetical order!
538
+ root = Path(root)
539
+
540
+ if any(_is_under(d, root) for d in self._removed_paths):
541
+ continue
542
+
543
+ for f in files:
544
+ self._not_paused.wait()
545
+ if self.is_finished():
546
+ break
547
+
548
+ f = root / f
549
+
550
+ if f.suffix.lower() not in self.extensions:
551
+ continue
552
+
553
+ if any(_is_under(d, f) for d in self._removed_paths):
554
+ continue
555
+
556
+ if str(f) in self._ignored_files:
557
+ continue
558
+
559
+ if f.suffix.lower() == '.zip':
560
+ if str(f.absolute()) in self._processed_zips:
561
+ continue
562
+ elif ZipPath(str(f), "") in self._reverse_hashes:
563
+ continue
564
+
565
+ self.found_images += 1
566
+ tp.apply_async(
567
+ _process_image,
568
+ args=(f, ),
569
+ kwds={
570
+ 'strength': self.strength,
571
+ 'supported_extensions': self.extensions,
572
+ 'exact_match': self.exact_match,
573
+ },
574
+ callback=self._process_image_callback,
575
+ error_callback=self._process_image_error_callback,
576
+ )
577
+
578
+ tp.close()
579
+
580
+ if not self.is_finished():
581
+ tp.join()
582
+
583
+ if not self.is_finished():
584
+ self._finished.set()
585
+ self.events.put(Finished())
pixmatch/__main__.py ADDED
@@ -0,0 +1,47 @@
1
+ import argparse
2
+ import logging
3
+ import platform
4
+
5
+ from pathlib import Path
6
+
7
+ from PySide6 import QtWidgets
8
+
9
+ from pixmatch.gui import MainWindow
10
+
11
# Entry point for `python -m pixmatch`: parse the command line, configure logging and start the Qt GUI.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process zero or more file paths.",
    )
    parser.add_argument(
        "folders",
        nargs="*",
        type=Path,
        help="Folders to load into the selected file path display (to speed up testing).",
    )
    parser.add_argument('--verbose', action='store_true', help="More detailed logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(module)s::%(funcName)s::%(lineno)d %(levelname)s %(asctime)s - %(message)s',
    )

    if platform.system() == "Windows":
        # Need to tell Windows to not use the Python app icon and use the Window icon instead...
        #   I'm not sure on the specifics but calling this method with any string seems to do the trick....
        #   https://stackoverflow.com/questions/1551605
        import ctypes
        ctypes.windll.shell32.SetCurrentProcessExplicitAppUserModelID('company.app.1')

    app = QtWidgets.QApplication([])
    # Basic stylesheet for subtle polish without complexity.
    app.setStyleSheet(
        """
        QToolBar { spacing: 8px; }
        QLabel#GroupTitle { padding: 4px 0; }
        QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
        """,
    )
    # The folders given on the command line are handed to the main window
    w = MainWindow(args.folders)
    w.show()
    app.exec()
pixmatch/utils.py ADDED
@@ -0,0 +1,38 @@
1
+ from typing import Iterable
2
+
3
def human_bytes(
    n: int,
    *,
    base: int = 1000,
    decimals: int = 0,
    units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
) -> str:
    """
    Convert a byte count to a human-readable string.

    Args:
        n: Byte count (e.g., from os.stat().st_size).
        base: 1000 for SI (kb, mb, ...), 1024 for IEC-like step size.
        decimals: Decimal places for non-byte units (0 -> '66kb', 1 -> '1.5gb').
            Trailing zeros are dropped, so 2.0 is rendered as '2gb'.
            With 0 the value is truncated to an integer, not rounded.
        units: Unit suffixes to use. Defaults to lowercase ('kb'); swap for ('B','kB','MB',...) if preferred.
            Any iterable is accepted, including one-shot iterators.

    Returns:
        A compact string like '66kb', '1mb', '1.5gb', or '999b'.

    Raises:
        ValueError: If n < 0
    """
    if n < 0:
        raise ValueError("Byte size cannot be negative")

    # Materialize once: `units` may be a one-shot iterable, and it is needed several times below
    unit_names = tuple(units)

    i = 0
    max_i = len(unit_names) - 1
    while n >= base and i < max_i:
        n /= base
        i += 1

    if i == 0 or decimals == 0:
        # Bytes or integer formatting requested
        return f"{int(n)}{unit_names[i]}"

    # Strip the trailing zeros from the number BEFORE the unit is appended,
    #   otherwise the suffix shields them from rstrip
    number = f"{n:.{decimals}f}".rstrip("0").rstrip(".")
    return f"{number}{unit_names[i]}"
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.4
2
+ Name: pixmatch
3
+ Version: 0.0.6
4
+ Summary: A modern VisiPics replacement.
5
+ Author-email: Ryan Heard <ryanwheard@gmail.com>
6
+ Project-URL: Repository, https://github.com/rheard/pixmatch
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.9
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Programming Language :: Python :: Implementation :: CPython
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: Pillow
18
+ Requires-Dist: imagehash
19
+ Provides-Extra: gui
20
+ Requires-Dist: PySide6; extra == "gui"
21
+ Dynamic: license-file
22
+
23
+ # PixMatch
24
+
25
+ PixMatch is a modern, cross-platform duplicate-image finder inspired by VisiPics, built with PySide6.
26
+
27
+ ![Basic view of the application](https://github.com/rheard/markdown/blob/main/pixmatch/basic.jpg?raw=true)
28
+
29
+ PixMatch scans folders (and ZIP archives) for visually similar images, groups matches,
30
+ and lets you quickly keep, ignore, or delete files from a clean GUI.
31
+ Rotated, mirrored or recompressed images are no match for PixMatch!
32
+ PixMatch can even detect visually similar GIFs and animated WebP files.
33
+ Files inside ZIPs are treated as read-only “sources of truth”
34
+ —never deleted—so you can safely compare against archived libraries.
35
+
36
+
37
+ Supported extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.tif`, `.tiff`, `.bmp`, `.gif`, `.zip`.
38
+
39
+
40
+ ## Install
41
+
42
+ PixMatch is a standard Python app (GUI via PySide6).
43
+
44
+ ```bash
45
+ python -m pip install pixmatch[gui]
46
+ ```
47
+
48
+ ## Running
49
+
50
+ ```bash
51
+ python -m pixmatch
52
+ ```
53
+
54
+ ### Usage
55
+
56
+ Simply select some folders to parse and then click begin.
57
+
58
+ Once duplicate groups begin to appear in the duplicates view,
59
+ you can start to select actions for them and then execute those actions.
60
+ Clicking on a tile will cycle through actions, with red being delete, yellow being ignore, and green being no action.
61
+
62
+ Images which are in zips and cannot be deleted will have a rar icon to denote such,
63
+ and they cannot be marked for deletion.
64
+
65
+ The status bar under each image shows the full path, the file size, the uncompressed file size,
66
+ the frames in the image if it is an animated image, the image dimensions and the last modified date.
67
+
68
+ Basic status bar example:
69
+
70
+ ![Example of the status bar with a basic image loaded](https://github.com/rheard/markdown/blob/main/pixmatch/basic_status.jpg?raw=true)
71
+
72
+ Animated image status bar example:
73
+
74
+ ![Example of the status bar with an animated image loaded](https://github.com/rheard/markdown/blob/main/pixmatch/gif_status.jpg?raw=true)
75
+
76
+ #### Notes
77
+ * An exact match checkbox is provided. If strength is 10 and this checkbox is checked,
78
+ SHA-256 file hashes will be used instead of perceptual hashes.
79
+
80
+ #### Optional Args:
81
+ ```markdown
82
+ positional arguments:
83
+ folders Folders to load into the selected file path display (to speed up testing).
84
+
85
+ options:
86
+ --verbose More detailed logging
87
+ ```
88
+
89
+ ## Acknowledgements
90
+
91
+ * Thanks to anyone who supported this effort, including the teams behind PySide6, Pillow, PyPI, and many other projects.
92
+ * Thanks to Johannes Buchner and the team behind imagehash, which serves as a large backbone in this application and saved me a lot of time.
93
+ * Thanks to Guillaume Fouet (aka Ozone) for VisiPics and the inspiration. Please don't be mad, I just wanted some new features like better gif and zip support.
@@ -0,0 +1,8 @@
1
+ pixmatch/__init__.py,sha256=-jOdBz0QudNjRZYeZK6qYSDU6Plnjjt3AnV3eiFJvEs,22081
2
+ pixmatch/__main__.py,sha256=DVd1-B2O-0PC2lPgl40xDN277SPSHwOiE6pFGxK-xO0,1548
3
+ pixmatch/utils.py,sha256=4dHALWtt9y3EIdRLiM3GfRUho3xfn3QErQ69R20A1Lw,1120
4
+ pixmatch-0.0.6.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
5
+ pixmatch-0.0.6.dist-info/METADATA,sha256=7DEGio2Z4AsTZWjdcKeq2Dvz2IYeuphxLP58A6_Dzvk,3540
6
+ pixmatch-0.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ pixmatch-0.0.6.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
8
+ pixmatch-0.0.6.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2018 The Python Packaging Authority
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ pixmatch