pixmatch 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixmatch might be problematic. Click here for more details.

pixmatch/__init__.py CHANGED
@@ -5,36 +5,58 @@ import time
5
5
 
6
6
  from collections import defaultdict
7
7
  from dataclasses import dataclass, field
8
- from multiprocessing import Pool, Manager
8
+ from functools import wraps
9
+ from multiprocessing import Manager, Pool
9
10
  from pathlib import Path
10
11
  from threading import Event
11
- from typing import Union
12
- from zipfile import ZipFile
12
+ from typing import ClassVar, Union
13
+ from zipfile import BadZipFile, ZipFile
13
14
 
14
15
  import imagehash
15
16
  import numpy as np
16
17
 
17
- from PIL import Image
18
+ from PIL import Image, ImageFile, UnidentifiedImageError
19
+
20
+ ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow damaged images
18
21
 
19
22
  logger = logging.getLogger(__name__)
20
23
 
21
24
 
22
25
  @dataclass(frozen=True)
23
26
  class ZipPath:
27
+ """
28
+ A general object describing a Path.
29
+
30
+ All paths in pixmatch will be one of these. `subpath` will be empty for non-zip file paths.
31
+
32
+ Attributes:
33
+ path (str): The path to the file.
34
+ subpath (str): The subpath in the zip if `path` is for a zip.
35
+ """
36
+ # TODO: At some point convert this to Path.
37
+ # When I tried that last it introduced problems with inter-process communication
24
38
  path: str
25
39
  subpath: str
26
40
 
27
41
  @property
28
- def path_obj(self):
42
+ def path_obj(self) -> Path:
43
+ """Get the path as as Path object"""
29
44
  return Path(self.path)
30
45
 
31
46
  @property
32
47
  def is_gif(self) -> bool:
48
+ """Is this a path to an animated image?"""
33
49
  movie_extensions = {'.gif', '.webp'}
34
50
  return (not self.subpath and Path(self.path).suffix.lower() in movie_extensions) \
35
51
  or (self.subpath and self.subpath[-4:].lower() in movie_extensions)
36
52
 
53
+ @property
54
+ def is_zip(self) -> bool:
55
+ """Does this point to a file located in a zip?"""
56
+ return bool(self.subpath)
57
+
37
58
  def absolute(self):
59
+ """Get the absolute version of this ZipPath"""
38
60
  return ZipPath(str(self.path_obj.absolute()), self.subpath)
39
61
 
40
62
 
@@ -42,59 +64,66 @@ def _is_under(folder_abs: str, target: str | Path) -> bool:
42
64
  """Return True if the ZipPath's real file (zp.path) is inside folder_abs."""
43
65
  try:
44
66
  Path(target).absolute().relative_to(Path(folder_abs).absolute())
45
- return True
46
67
  except ValueError:
47
68
  return False
48
69
 
70
+ return True
71
+
49
72
 
50
73
  def phash_params_for_strength(strength: int) -> tuple[int, int]:
74
+ """
75
+ Convert a 0-10 strength to settings for imagehash
76
+
77
+ Returns:
78
+ tuple<int, int>: The hash size (in bytes) and the high frequency factor
79
+ """
51
80
  # TODO: This sucks.
52
81
  strength = max(0, min(10, strength))
53
82
  if strength >= 10:
54
- return 16, 4 # 256-bit hash, strict
55
- elif strength >= 8:
83
+ return 16, 4
84
+ if strength >= 8:
56
85
  return 15, 4
57
- elif strength >= 7:
86
+ if strength >= 7:
58
87
  return 13, 4
59
- elif strength >= 6:
88
+ if strength >= 6:
60
89
  return 11, 4
61
- elif strength >= 5:
90
+ if strength >= 5:
62
91
  return 9, 4
63
- elif strength >= 4:
92
+ if strength >= 4:
64
93
  return 8, 4
65
- elif strength >= 3:
94
+ if strength >= 3:
66
95
  return 8, 3
67
- elif strength >= 2:
96
+ if strength >= 2:
68
97
  return 7, 3
69
- else:
70
- return 6, 3
98
+ return 6, 3
71
99
 
72
100
 
73
- def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
101
+ def calculate_hashes(f, strength=5, *, is_gif=False, exact_match=False) -> tuple[str, set[str]]:
74
102
  """
75
103
  Calculate hashes for a given file.
76
104
 
77
105
  Args:
78
106
  f (IO or str or Path): Either a file path to process, or a in-memory BytesIO object ready for reading.
79
- is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
80
107
  strength (int): A number between 0 and 10 on the strength of the matches.
108
+ is_gif (bool): Is this gif data? Needed if passing an in-memory BytesIO object.
81
109
  exact_match (bool): Use exact SHA256 hahes?
82
110
  If true, strength must be 10.
83
111
  If false, perceptual hashes will be used, even with high strength.
84
112
 
85
113
  Returns:
86
- list: The found hashes.
114
+ tuple[str, set]: The first element is the primary hash,
115
+ the second element are any secondary hashes representing rotations, flips, etc...
87
116
  """
88
117
  if exact_match:
89
118
  hasher = hashlib.sha256()
90
119
  block_size = 65536
91
- with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file:
120
+ with (open(f, "rb") if isinstance(f, (str, Path)) else f) as file: # noqa: PTH123
92
121
  for block in iter(lambda: file.read(block_size), b""):
93
122
  hasher.update(block)
94
- return [hasher.hexdigest()]
123
+ return hasher.hexdigest(), set()
95
124
 
96
125
  hash_size, highfreq_factor = phash_params_for_strength(strength)
97
- with (Image.open(f) as im):
126
+ with Image.open(f) as im:
98
127
  if is_gif:
99
128
  initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
100
129
  # This is going to be a bit confusing but basically, imagehash produces weird hashes for some gifs
@@ -112,7 +141,7 @@ def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
112
141
  for r_i, r in enumerate(initial_hash.hash)):
113
142
  try:
114
143
  im.seek(im.tell() + 1)
115
- except EOFError:
144
+ except EOFError: # noqa: PERF203
116
145
  break
117
146
  else:
118
147
  initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
@@ -120,63 +149,122 @@ def calculate_hashes(f, is_gif=False, strength=5, exact_match=False):
120
149
 
121
150
  # For GIFs we'll look for mirrored versions but thats it
122
151
  flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
123
- return [initial_hash, imagehash.phash(flipped_h_image, hash_size=hash_size, highfreq_factor=highfreq_factor)]
124
-
125
- flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
126
- flipped_v_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
127
- images = (im, im.rotate(90), im.rotate(180), im.rotate(270),
128
- flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180), flipped_h_image.rotate(270),
129
- flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180), flipped_v_image.rotate(270))
130
- return [imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor) for image in images]
131
-
152
+ extras = (flipped_h_image, )
153
+ else:
154
+ initial_hash = imagehash.phash(im, hash_size=hash_size, highfreq_factor=highfreq_factor)
132
155
 
133
- def _process_image(path: str | Path, strength=5, exact_match=False):
156
+ flipped_h_image = im.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
157
+ flipped_v_image = im.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
158
+ extras = (im.rotate(90), im.rotate(180), im.rotate(270),
159
+ flipped_h_image, flipped_h_image.rotate(90), flipped_h_image.rotate(180),
160
+ flipped_h_image.rotate(270),
161
+ flipped_v_image, flipped_v_image.rotate(90), flipped_v_image.rotate(180),
162
+ flipped_v_image.rotate(270))
163
+
164
+ return str(initial_hash), {
165
+ str(imagehash.phash(image, hash_size=hash_size, highfreq_factor=highfreq_factor)) for image in extras
166
+ }
167
+
168
+
169
+ def thread_error_handler(func):
170
+ """An error handler for the thread to return information about where the error occurred"""
171
+
172
+ @wraps(func)
173
+ def wrapper(path, *args, **kwargs): # noqa: ANN202
174
+ try:
175
+ return func(path, *args, **kwargs)
176
+ except Exception as e:
177
+ e.input_path = path
178
+ raise
179
+
180
+ return wrapper
181
+
182
+
183
+ @thread_error_handler
184
+ def _process_image(
185
+ path: str | Path,
186
+ supported_extensions: set | None = None,
187
+ strength: int = 5,
188
+ *,
189
+ exact_match: bool = False,
190
+ ) -> tuple[Path, tuple | dict[str, tuple]]:
191
+ """Get the hashes for a given path. Is multiprocessing compatible"""
134
192
  path = Path(path)
135
193
  if path.suffix.lower() != '.zip':
136
194
  return path, calculate_hashes(path, is_gif=path.suffix.lower() in {".gif", ".webp"},
137
195
  strength=strength, exact_match=exact_match)
138
196
 
139
- results = dict()
197
+ if not supported_extensions:
198
+ supported_extensions = ImageMatcher.SUPPORTED_EXTS
199
+
200
+ results = {}
140
201
  with ZipFile(path) as zf:
141
202
  for f in zf.filelist:
142
- with zf.open(f) as zipped_file:
143
- results[f.filename] = calculate_hashes(zipped_file, is_gif=f.filename[-4:].lower() in {".gif", ".webp"},
144
- strength=strength, exact_match=exact_match)
203
+ f_ext = f.filename[-4:].lower()
204
+ if f_ext not in supported_extensions:
205
+ continue
206
+
207
+ if f_ext == '.zip':
208
+ logger.warning('Have not implemented nested zip support yet! Input file: %s (%s)', path, f)
209
+ continue
210
+
211
+ try:
212
+ with zf.open(f) as zipped_file:
213
+ results[f.filename] = calculate_hashes(zipped_file, is_gif=f_ext in {".gif", ".webp"},
214
+ strength=strength, exact_match=exact_match)
215
+ except BadZipFile as e:
216
+ logger.warning("Could not read %s in %s due to %s", f.filename, path, str(e))
217
+ except UnidentifiedImageError:
218
+ logger.warning("Could not identify image %s in %s", f.filename, path)
145
219
 
146
220
  return path, results
147
221
 
148
222
 
149
223
  @dataclass
150
224
  class ImageMatch:
225
+ """A match data structure containing the matches and where this match lies in the match list"""
151
226
  match_i: int | None = field(default=None)
152
227
  matches: list[ZipPath] = field(default_factory=list)
153
228
 
154
229
 
230
+ # region Events
155
231
  @dataclass(frozen=True)
156
232
  class NewGroup:
157
- group: "ImageMatch" # forward-ref to your class
233
+ """A new group event"""
234
+ group: "ImageMatch"
158
235
 
159
236
 
160
237
  @dataclass(frozen=True)
161
238
  class NewMatch:
239
+ """A new match event"""
162
240
  group: "ImageMatch"
163
241
  path: ZipPath
164
242
 
165
243
 
166
244
  @dataclass(frozen=True)
167
245
  class Finished:
168
- pass
246
+ """A finished event"""
169
247
 
170
248
 
171
249
  MatcherEvent = Union[NewGroup, NewMatch, Finished]
250
+ # endregion
172
251
 
173
252
 
174
- # TODO: FINISHED signal?
175
253
  class ImageMatcher:
176
- SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
254
+ """
255
+ An image matching SDK
256
+
257
+ Args:
258
+ strength (int): The 0-10 strength to use for matching. Defaults to 5.
259
+ exact_match (bool): Should use SHA-256 hashes? If False, the default, will use perceptual hashes.
260
+ If True, strength must be 10.
261
+ processes (int): The number of processes to use. Defaults to None.
262
+ extensions (set): The extensions to process. Optional.
263
+ """
264
+ SUPPORTED_EXTS: ClassVar = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp", ".gif", ".zip"}
177
265
 
178
- def __init__(self, strength: int = 5, exact_match: bool = False, processes: int | None = None,
179
- extensions: set | None = None):
266
+ def __init__(self, strength: int = 5, processes: int | None = None, extensions: set | None = None,
267
+ *, exact_match: bool = False):
180
268
  if not (0 <= strength <= 10):
181
269
  raise ValueError("Strength must be between 0 and 10!")
182
270
 
@@ -185,57 +273,63 @@ class ImageMatcher:
185
273
  self.strength = strength
186
274
  self.exact_match = exact_match
187
275
  self.processes = processes
276
+
188
277
  self.found_images = 0
189
278
  self.processed_images = 0
190
279
  self.duplicate_images = 0
280
+ self.matches = []
191
281
 
192
282
  m = Manager()
193
- self.events = m.Queue()
194
- self._new_paths = m.Queue()
195
- self._removed_paths = set()
196
- self._processed_paths = set()
197
- self._hashes = defaultdict(ImageMatch)
198
- self._reverse_hashes = dict()
199
-
283
+ self.events = m.Queue() # Events to go to higher level users
284
+ self._new_paths = m.Queue() # Inbound queue for new paths that are added while processing is running
285
+ self._removed_paths = set() # Paths that have been removed from processing after processing has been started
286
+ self._ignored_files = set() # Files which have been ignored and should be skipped from processing if re-ran
287
+ self._processed_zips = {} # Zips that have been successfully processed
288
+ self._hashes = defaultdict(ImageMatch) # Hash -> Paths
289
+ self._reverse_hashes = {} # Path -> Hash
290
+
291
+ # Pausing and finished signaling...
200
292
  self._not_paused = Event()
201
293
  self._not_paused.set()
202
294
  self._finished = Event()
203
295
  self._finished.set()
204
296
 
205
- self.matches = []
297
+ @property
298
+ def left_to_process(self):
299
+ """Files that are left to process"""
300
+ return self.found_images - self.processed_images
206
301
 
207
302
  def add_path(self, path: str | Path):
303
+ """Add a path for processing"""
208
304
  path = str(Path(path).absolute())
209
305
  self._removed_paths.discard(path)
210
306
  self._new_paths.put(path)
211
307
 
212
- def remove_path(self, folder: str | Path) -> None:
308
+ def remove_path(self, folder: str | Path):
213
309
  """
214
310
  Mark a folder to be skipped going forward, and remove already-indexed files
215
311
  that live under it. Pauses briefly if not already paused to keep state sane.
216
312
  """
313
+ # TODO: This works but the biggest problem with it is that it will not remove any images which are still
314
+ # queue'd up for processing in the ThreadPool... I'm not sure how to fix that yet.
217
315
  folder = str(Path(folder).absolute())
218
316
  paused = self.conditional_pause()
219
317
  self._removed_paths.add(folder)
220
- self._processed_paths.discard(folder)
221
318
 
222
319
  # Remove anything we've already seen under that folder
223
320
  # (iterate over a copy because remove() mutates structures)
224
- to_remove = [p for p in self._reverse_hashes.keys() if _is_under(folder, p.path)]
321
+ to_remove = [p for p in self._reverse_hashes if _is_under(folder, p.path)]
225
322
  for p in to_remove:
226
323
  self.remove(p)
227
324
 
228
- self.conditional_resume(paused)
229
-
230
- @property
231
- def left_to_process(self):
232
- return self.found_images - self.processed_images
325
+ to_remove_zips = [p for p in self._processed_zips if _is_under(folder, p)]
326
+ for p in to_remove_zips:
327
+ self._processed_zips.pop(p)
233
328
 
234
- def pause(self):
235
- logger.debug('Performing pause')
236
- self._not_paused.clear()
329
+ self.conditional_resume(paused)
237
330
 
238
- def conditional_pause(self):
331
+ def conditional_pause(self) -> bool:
332
+ """Pause if not paused and return if was paused"""
239
333
  _conditional_pause = self.is_paused()
240
334
  if not _conditional_pause:
241
335
  logger.debug('Performing conditional pause')
@@ -243,47 +337,59 @@ class ImageMatcher:
243
337
 
244
338
  return _conditional_pause
245
339
 
246
- def conditional_resume(self, was_paused):
340
+ def conditional_resume(self, was_paused: bool): # noqa: FBT001
341
+ """Resume if not paused previous (from call to `conditional_pause`)"""
247
342
  if not was_paused and not self.is_finished():
248
343
  logger.debug('Performing conditional resume')
249
344
  self.resume()
250
345
 
346
+ def pause(self):
347
+ """Pause processing"""
348
+ logger.debug('Performing pause')
349
+ self._not_paused.clear()
350
+
251
351
  def is_paused(self):
352
+ """Is processing paused"""
252
353
  return not self._not_paused.is_set()
253
354
 
254
355
  def finish(self):
356
+ """Finish processing"""
255
357
  logger.debug('Performing finished')
256
358
  self._finished.set()
257
359
 
258
360
  def is_finished(self):
361
+ """Is processing finished"""
259
362
  return self._finished.is_set()
260
363
 
261
364
  def resume(self):
365
+ """Resume processing"""
262
366
  logger.debug('Performing resume')
263
367
  self._not_paused.set()
264
368
 
265
369
  def running(self):
370
+ """Currently running and loading hashes?"""
266
371
  return not self.is_paused() and (not self.is_finished() or self.left_to_process)
267
372
 
268
373
  def remove(self, path):
374
+ """Remove a loaded path completely from the image matching system. Will not delete a file."""
269
375
  # Pause things while we remove things...
270
376
  logger.info('Removing %s from %s', path, self.__class__.__name__)
271
377
  paused = self.conditional_pause()
272
378
 
273
- hash = self._reverse_hashes.pop(path)
274
- self._hashes[hash].matches.remove(path)
275
- if len(self._hashes[hash].matches) == 1:
276
- match_i = self._hashes[hash].match_i
379
+ hash_ = self._reverse_hashes.pop(path)
380
+ self._hashes[hash_].matches.remove(path)
381
+ if len(self._hashes[hash_].matches) == 1:
382
+ match_i = self._hashes[hash_].match_i
277
383
  logger.debug('Unmatching match group %s', match_i)
278
- self._hashes[hash].match_i = None
384
+ self._hashes[hash_].match_i = None
279
385
 
280
386
  del self.matches[match_i]
281
387
  self.refresh_match_indexes(match_i)
282
388
  self.duplicate_images -= 2
283
389
 
284
- elif not self._hashes[hash].matches:
390
+ elif not self._hashes[hash_].matches:
285
391
  logger.debug('Removing empty match group')
286
- del self._hashes[hash]
392
+ del self._hashes[hash_]
287
393
 
288
394
  else:
289
395
  logger.debug('Simple removal performed')
@@ -293,81 +399,111 @@ class ImageMatcher:
293
399
  self.found_images -= 1
294
400
  self.conditional_resume(paused)
295
401
 
402
+ def ignore(self, path):
403
+ """Remove a path from the image matching service"""
404
+ self.remove(path)
405
+
406
+ if path.path_obj.suffix.lower() != '.zip':
407
+ self._ignored_files.add(path.path)
408
+
296
409
  def refresh_match_indexes(self, start=0):
410
+ """Update the match_i value for all the matches passed a certain point"""
297
411
  for match_i, match in enumerate(self.matches[start:], start=start):
298
412
  match.match_i = match_i
299
413
 
300
414
  def _process_image_callback(self, result):
415
+ """
416
+ Handle the result of hashing an image.
417
+
418
+ This needs to do quite a few things including sanitizing the results,
419
+ actually checking if the hash matches an existing image,
420
+ adding the image and any matches to the backend data structures, notify any listeners,
421
+ update the found and processed image counts,
422
+ and verify that this result wasn't added as a removed path since it was queued.
423
+
424
+ Args:
425
+ result: A tuple consisting of the path to the file, and the resultant hashes.
426
+ If the hashes are a dict, then it is assumed that the path is for a zip. In that case,
427
+ the individual zip files will sanitized and re-ran through this callback.
428
+ """
429
+ # TODO: This callback must return IMMEDIATELY and is currently too slow for large amounts of zips.
430
+ # Perhaps create a new queue/thread and queue up processing for zip results?
431
+ # I think the major slow point is adding to the data structures and I'm not sure if more threads will help
432
+ # Check for paused or finished signals
301
433
  self._not_paused.wait()
302
434
  if self.is_finished():
303
435
  return
304
436
 
437
+ # region Sanitize results
305
438
  path: Path | str | ZipPath
306
439
  path, hashes = result
307
440
 
308
441
  if any(_is_under(d, path.path if isinstance(path, ZipPath) else path) for d in self._removed_paths):
442
+ # This image was removed AFTER it was queue'd! So decrement the found images count and just leave...
309
443
  self.found_images -= 1
310
444
  return
311
445
 
312
446
  if isinstance(hashes, dict):
313
447
  self.found_images -= 1
448
+ subpaths = []
314
449
  for sub_path, sub_hashes in hashes.items():
315
450
  self.found_images += 1
316
- self._process_image_callback((ZipPath(str(path), sub_path), sub_hashes))
451
+ subpaths.append(ZipPath(str(path), sub_path))
452
+ self._process_image_callback((subpaths[-1], sub_hashes))
453
+ self._processed_zips[str(path)] = subpaths
317
454
  return
318
455
 
456
+ initial_hash, extra_hashes = hashes
457
+ extra_hashes.add(initial_hash)
319
458
  if not isinstance(path, ZipPath):
459
+ # From this point on, EVERYTHING should be a ZipPath
320
460
  path = ZipPath(str(path), "")
461
+ # endregion
321
462
 
322
463
  if path in self._reverse_hashes:
323
464
  self.found_images -= 1
324
465
  return
325
466
 
326
467
  self.processed_images += 1
327
- for hash_ in hashes:
328
- if hash_ not in self._hashes:
329
- continue
330
-
331
- self._reverse_hashes[path] = hash_
332
-
333
- # This appears to be a new match!
334
- for match in self._hashes[hash_].matches:
335
- if path.absolute() == match.absolute():
336
- # This appears to be a duplicate PATH...
337
- logger.warning('Duplicate files entered! %s, %s', path, match)
338
- return
339
468
 
340
- self._hashes[hash_].matches.append(path)
341
- if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
342
- # This is a brand new match group!
343
- self._hashes[hash_].match_i = len(self.matches)
344
- self.matches.append(self._hashes[hash_])
345
- self.duplicate_images += 2
346
- self.events.put(NewGroup(self._hashes[hash_]))
347
- logger.debug('New match group found: %s', self._hashes[hash_].matches)
348
- else:
349
- # Just another match for an existing group...
350
- self.duplicate_images += 1
351
- self.events.put(NewMatch(self._hashes[hash_], path))
352
- logger.debug('New match found for group #%s: %s',
353
- self._hashes[hash_].match_i,
354
- self._hashes[hash_].matches)
355
-
356
- break
357
- else:
358
- # This is a new hash, so just add it to the hashmap and move on...
469
+ # From testing at ~1.5m loaded images: it is ~10% faster to return a set and do this than it is to
470
+ # iterate over a list and do an `is in` check for each hash
471
+ found_hashes = self._hashes.keys() & extra_hashes
472
+ if not found_hashes:
473
+ # This is a new image not matching any previous, so just add it to the hashmap and move on...
359
474
  # Just use the initial orientation
360
- hash_ = hashes[0]
475
+ hash_ = initial_hash
361
476
  self._reverse_hashes[path] = hash_
362
477
  self._hashes[hash_].matches.append(path)
363
478
  return
364
479
 
480
+ # We have found a match!
481
+ hash_ = next(iter(found_hashes))
482
+ self._reverse_hashes[path] = hash_
483
+ self._hashes[hash_].matches.append(path)
484
+
485
+ if self._hashes[hash_].match_i is None and len(self._hashes[hash_].matches) >= 2:
486
+ # This is a brand new match group!
487
+ self._hashes[hash_].match_i = len(self.matches)
488
+ self.matches.append(self._hashes[hash_])
489
+ self.duplicate_images += 2
490
+ self.events.put(NewGroup(self._hashes[hash_]))
491
+ logger.debug('New match group found: %s', self._hashes[hash_].matches)
492
+ else:
493
+ # Just another match for an existing group...
494
+ self.duplicate_images += 1
495
+ self.events.put(NewMatch(self._hashes[hash_], path))
496
+ logger.debug('New match found for group #%s: %s',
497
+ self._hashes[hash_].match_i,
498
+ self._hashes[hash_].matches)
499
+
365
500
  def _process_image_error_callback(self, e):
501
+ """Temporary for testing"""
366
502
  self.processed_images += 1
367
- print(str(e))
503
+ logger.error("%s: %s (input path %s)", type(e), e, e.input_path)
368
504
 
369
505
  def _root_stream(self):
370
- # Yield any paths that come up for processing, then wait until processing is finished for any new paths
506
+ """This is to yield any paths for processing, then wait until processing is finished for any new paths"""
371
507
  while not self._new_paths.empty() or self.left_to_process:
372
508
  if self._new_paths.empty():
373
509
  time.sleep(0.05)
@@ -376,9 +512,7 @@ class ImageMatcher:
376
512
  yield self._new_paths.get_nowait()
377
513
 
378
514
  def run(self, paths: list[str | Path]):
379
- # TODO: Verify none of the paths overlap
380
- # TODO: Verify none of the dirs have been deleted after we started
381
-
515
+ """Do the work of matching!"""
382
516
  self._not_paused.set()
383
517
  self._finished.clear()
384
518
 
@@ -393,13 +527,14 @@ class ImageMatcher:
393
527
  continue
394
528
 
395
529
  path = str(path.absolute())
396
- if path in self._removed_paths or path in self._processed_paths:
530
+ if path in self._removed_paths:
397
531
  continue
398
532
 
399
533
  for root, dirs, files in os.walk(path):
400
534
  if self.is_finished():
401
535
  break
402
536
 
537
+ dirs.sort() # This actually works to ensure that os.walk goes in alphabetical order!
403
538
  root = Path(root)
404
539
 
405
540
  if any(_is_under(d, root) for d in self._removed_paths):
@@ -418,8 +553,13 @@ class ImageMatcher:
418
553
  if any(_is_under(d, f) for d in self._removed_paths):
419
554
  continue
420
555
 
421
- # TODO: This sucks (for zips at least), but I can't iterate over the dict while its changing...
422
- if ZipPath(str(f), "") in self._reverse_hashes:
556
+ if str(f) in self._ignored_files:
557
+ continue
558
+
559
+ if f.suffix.lower() == '.zip':
560
+ if str(f.absolute()) in self._processed_zips:
561
+ continue
562
+ elif ZipPath(str(f), "") in self._reverse_hashes:
423
563
  continue
424
564
 
425
565
  self.found_images += 1
@@ -428,14 +568,13 @@ class ImageMatcher:
428
568
  args=(f, ),
429
569
  kwds={
430
570
  'strength': self.strength,
571
+ 'supported_extensions': self.extensions,
431
572
  'exact_match': self.exact_match,
432
573
  },
433
574
  callback=self._process_image_callback,
434
575
  error_callback=self._process_image_error_callback,
435
576
  )
436
577
 
437
- self._processed_paths.add(path)
438
-
439
578
  tp.close()
440
579
 
441
580
  if not self.is_finished():
pixmatch/__main__.py CHANGED
@@ -8,10 +8,9 @@ from PySide6 import QtWidgets
8
8
 
9
9
  from pixmatch.gui import MainWindow
10
10
 
11
-
12
11
  if __name__ == "__main__":
13
12
  parser = argparse.ArgumentParser(
14
- description="Process zero or more file paths."
13
+ description="Process zero or more file paths.",
15
14
  )
16
15
  parser.add_argument(
17
16
  "folders",
@@ -41,7 +40,7 @@ if __name__ == "__main__":
41
40
  QToolBar { spacing: 8px; }
42
41
  QLabel#GroupTitle { padding: 4px 0; }
43
42
  QFrame#ImageTile { border: 1px solid #444; border-radius: 6px; padding: 6px; }
44
- """
43
+ """,
45
44
  )
46
45
  w = MainWindow(args.folders)
47
46
  w.show()
pixmatch/utils.py CHANGED
@@ -1,12 +1,11 @@
1
1
  from typing import Iterable
2
2
 
3
-
4
3
  def human_bytes(
5
4
  n: int,
6
5
  *,
7
6
  base: int = 1000,
8
7
  decimals: int = 0,
9
- units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb")
8
+ units: Iterable[str] = ("b", "kb", "mb", "gb", "tb", "pb", "eb", "zb", "yb"),
10
9
  ) -> str:
11
10
  """
12
11
  Convert a byte count to a human-readable string.
@@ -19,6 +18,9 @@ def human_bytes(
19
18
 
20
19
  Returns:
21
20
  A compact string like '66kb', '1mb', '1.5gb', or '999b'.
21
+
22
+ Raises:
23
+ ValueError: If n < 0
22
24
  """
23
25
  if n < 0:
24
26
  raise ValueError("Byte size cannot be negative")
@@ -31,6 +33,6 @@ def human_bytes(
31
33
 
32
34
  if i == 0 or decimals == 0:
33
35
  # Bytes or integer formatting requested
34
- return f"{int(n if i else n)}{tuple(units)[i]}"
36
+ return f"{int(n)}{tuple(units)[i]}"
35
37
 
36
38
  return f"{n:.{decimals}f}{tuple(units)[i]}".rstrip("0").rstrip(".")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixmatch
3
- Version: 0.0.4
3
+ Version: 0.0.5
4
4
  Summary: A modern VisiPics replacement.
5
5
  Author-email: Ryan Heard <ryanwheard@gmail.com>
6
6
  Project-URL: Repository, https://github.com/rheard/pixmatch
@@ -0,0 +1,8 @@
1
+ pixmatch/__init__.py,sha256=U-puf9cK1V5Ooz9xRt1e4lMgQvYnCNmS_vmyUoSXIgw,22072
2
+ pixmatch/__main__.py,sha256=DVd1-B2O-0PC2lPgl40xDN277SPSHwOiE6pFGxK-xO0,1548
3
+ pixmatch/utils.py,sha256=4dHALWtt9y3EIdRLiM3GfRUho3xfn3QErQ69R20A1Lw,1120
4
+ pixmatch-0.0.5.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
5
+ pixmatch-0.0.5.dist-info/METADATA,sha256=Q_9R3LJVmD94mE0F8k6TlsbJymnVMpmu2xLRzomT348,3540
6
+ pixmatch-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ pixmatch-0.0.5.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
8
+ pixmatch-0.0.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- pixmatch/__init__.py,sha256=3Q8jnIN4vXq0zJxlrgNXnpw0rdFZmwgsgxtTmcyzfyo,15682
2
- pixmatch/__main__.py,sha256=cLcDXW228kPcAH5b66MP5eIEFHz6WNuOgqDpPchUke0,1547
3
- pixmatch/utils.py,sha256=TLYFeMg35B62EUafErq3yaA9YC0O6Kcd3Ao4fSpTwoE,1090
4
- pixmatch-0.0.4.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
5
- pixmatch-0.0.4.dist-info/METADATA,sha256=Q_YzWyT6Iduiady1dAM_k0vd6VP8I_GLrC4bB5DuKiM,3540
6
- pixmatch-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- pixmatch-0.0.4.dist-info/top_level.txt,sha256=u-67zafU4VFT-oIM4mdGvf9KrHZvD64QjjtNzVxBj7E,9
8
- pixmatch-0.0.4.dist-info/RECORD,,