imgduptective 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.1
2
+ Name: imgduptective
3
+ Version: 0.1.0
4
+ Summary: Image Duplicates Detective
5
+ Author-Email: sacha <sachahony@gmail.com>, Sacha Hony <zazahohonini@gmail.com>
6
+ License: MIT
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3
10
+ Project-URL: Homepage, https://github.com/zazaho/imgduptective
11
+ Requires-Python: >=3.10
12
+ Requires-Dist: Pillow
13
+ Description-Content-Type: text/markdown
14
+
15
+ # Image Duplicate Detective (imgduptective)
16
+
17
+ # Description
@@ -0,0 +1,3 @@
1
+ # Image Duplicate Detective (imgduptective)
2
+
3
+ # Description
File without changes
@@ -0,0 +1,530 @@
1
+ #!/usr/bin/env python3
2
+ """imgduptective - non-interactive duplicate image detector using gradient horizontal hash."""
3
+ import argparse
4
+ import hashlib
5
+ import sqlite3
6
+ import sys
7
+ from collections import Counter
8
+ from multiprocessing import Pool
9
+ from pathlib import Path
10
+
11
+ from PIL import Image
12
+
13
+ # --- Image handling ---
14
+
15
# EXIF tag id looked up to find the orientation of an image.
_EXIF_ORIENTATION_CODE = 274

# Orientation value read from EXIF -> transpose operation applied by
# image_open().  Values missing from this table leave the image untouched.
_EXIF_ORIENTATION_CORRECTION_MAPPING = {
    2: Image.FLIP_LEFT_RIGHT,   # mirrored horizontally
    3: Image.ROTATE_180,
    4: Image.FLIP_TOP_BOTTOM,   # mirrored vertically
    5: Image.TRANSPOSE,
    6: Image.ROTATE_270,
    7: Image.TRANSVERSE,
    8: Image.ROTATE_90,
}

# Lookup table handed to Image.point(): entry i holds i / 256, one entry for
# each of the 65536 possible 16-bit sample values.
_TABLE16 = [sample / 256 for sample in range(1 << 16)]
26
+
27
+
28
def image_open(fn):
    """Open an image file and apply its EXIF orientation.

    Args:
        fn: Path of the file to open; passed straight to ``Image.open``.

    Returns:
        The opened image, transposed according to its EXIF orientation tag
        when one is present, or ``None`` when the file cannot be opened.
    """
    try:
        img = Image.open(fn)
        # PNGs in mode "I" holding samples above 255 are mapped through the
        # 16-bit lookup table into an 8-bit "L" image.
        if img.format == "PNG" and img.mode == "I" and max(img.getdata()) > 255:
            img = img.point(_TABLE16, "L")
    except Exception:
        return None
    try:
        # Use the public getexif() accessor: the private _getexif() is not
        # defined for every image class, which silently skipped the
        # orientation fix for those formats.
        code = img.getexif().get(_EXIF_ORIENTATION_CODE)
        correction = _EXIF_ORIENTATION_CORRECTION_MAPPING.get(code)
        if correction is not None:
            img = img.transpose(correction)
    except Exception:
        # Orientation handling is best effort: unreadable EXIF data must not
        # make an otherwise readable image unusable.
        pass
    return img
44
+
45
+
46
+ # --- Hash functions ---
47
+
48
def dhash_horizontal(img):
    """Compute the 64-bit horizontal gradient hash of an image.

    The image is converted to greyscale and shrunk to 9x8 pixels.  Bit
    ``y * 8 + x`` of the result is set when the pixel at ``(x + 1, y)`` is
    brighter than the pixel at ``(x, y)``.

    Args:
        img: Image object providing ``convert`` and ``resize``.

    Returns:
        The hash as a non-negative int below ``2 ** 64``, or ``None`` when
        the image cannot be converted or resized.
    """
    try:
        i9x8 = img.convert("L").resize((9, 8), Image.BOX)
    except Exception:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit
        # and keep worker processes alive after Ctrl-C.
        return None
    h = 0
    for y in range(8):
        for x in range(8):
            if i9x8.getpixel((x + 1, y)) > i9x8.getpixel((x, y)):
                h |= 1 << (y * 8 + x)
    return h
59
+
60
+ # --- Worker functions for multiprocessing ---
61
+
62
+ def _compute_filehash(filepath_str):
63
+ hasher = hashlib.sha1()
64
+ with open(filepath_str, "rb") as f:
65
+ while chunk := f.read(65536):
66
+ hasher.update(chunk)
67
+ return (filepath_str, hasher.hexdigest())
68
+
69
+
70
def _compute_imagehash(filepath_str):
    """Return ``(filepath_str, image_hash)`` for one file.

    The hash is ``None`` when ``image_open`` cannot open the file; otherwise
    it is whatever ``dhash_horizontal`` returns for the opened image.
    """
    opened = image_open(filepath_str)
    imagehash = None if opened is None else dhash_horizontal(opened)
    return filepath_str, imagehash
75
+
76
+
77
+ def _compare_range(args):
78
+ """Compare a range of the triangular matrix."""
79
+ row_start, row_end, hashes, threshold = args
80
+ matches = []
81
+ for i in range(row_start, row_end):
82
+ h_i = hashes[i]
83
+ for j in range(i + 1, len(hashes)):
84
+ if (h_i ^ hashes[j]).bit_count() <= threshold:
85
+ matches.append((i, j))
86
+ return matches
87
+
88
+
89
+ # --- Database ---
90
+
91
# Default location of the hash database.
DB_PATH = Path.home() / ".config" / "imgduptective" / "imgduptective.db"


def get_db(db_path=None):
    """Open the hash database, creating the file and tables when missing.

    Args:
        db_path: Filesystem path of the SQLite file.  Defaults to ``DB_PATH``
            so existing callers keep their behaviour.

    Returns:
        An open ``sqlite3.Connection`` with ``HashValueTable`` (file hash ->
        image hash) and ``FileTable`` (file path -> hashes) available.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or initialised.
    """
    path = Path(DB_PATH if db_path is None else db_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(path)
    try:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS HashValueTable "
            "(id integer PRIMARY KEY, FileHash text NOT NULL UNIQUE, "
            "ImageHashValue text NOT NULL)"
        )
        conn.execute(
            "CREATE TABLE IF NOT EXISTS FileTable "
            "(id integer PRIMARY KEY, FilePath text NOT NULL UNIQUE, "
            "FileHash text NOT NULL, ImageHashValue text)"
        )
        conn.commit()
    except Exception:
        # Do not leak the connection when the schema cannot be created.
        conn.close()
        raise
    return conn
109
+
110
+
111
def db_get_imagehashes(conn, filehashes):
    """Look up stored image hashes for a collection of file hashes.

    Args:
        conn: Open database connection containing ``HashValueTable``.
        filehashes: Iterable of file hash strings to look up.

    Returns:
        Dict mapping each file hash found in the table to its image hash,
        parsed from the stored hexadecimal text into an int.  File hashes
        without a row are simply absent.
    """
    wanted = list(filehashes)
    batch_size = 500  # number of "?" parameters bound per SELECT
    found = {}
    for offset in range(0, len(wanted), batch_size):
        batch = wanted[offset:offset + batch_size]
        placeholders = ",".join("?" for _ in batch)
        cursor = conn.execute(
            f"SELECT FileHash, ImageHashValue FROM HashValueTable WHERE FileHash IN ({placeholders})",
            batch,
        )
        found.update(
            (filehash, int(imagehash_hex, 16))
            for filehash, imagehash_hex in cursor.fetchall()
        )
    return found
126
+
127
+
128
def db_set_imagehashes(conn, pairs):
    """Store new ``(filehash, imagehash_int)`` pairs in ``HashValueTable``.

    Image hashes are written as 16-digit zero-padded lowercase hex text.
    Pairs that conflict with an existing row are ignored (INSERT OR IGNORE),
    and the transaction is committed before returning.
    """
    records = [(filehash, f"{imagehash:016x}") for filehash, imagehash in pairs]
    conn.executemany(
        "INSERT OR IGNORE INTO HashValueTable (FileHash, ImageHashValue) VALUES(?, ?)",
        records,
    )
    conn.commit()
135
+
136
+
137
def db_upsert_files(conn, rows):
    """Insert or refresh ``FileTable`` rows, then commit.

    Args:
        conn: Open database connection containing ``FileTable``.
        rows: Iterable of ``(filepath, filehash, imagehash_hex_or_none)``.
            A row whose path is already stored has its file hash and image
            hash overwritten with the new values.
    """
    statement = (
        "INSERT INTO FileTable (FilePath, FileHash, ImageHashValue) VALUES(?, ?, ?) "
        "ON CONFLICT(FilePath) DO UPDATE SET FileHash=excluded.FileHash, ImageHashValue=excluded.ImageHashValue"
    )
    conn.executemany(statement, rows)
    conn.commit()
145
+
146
+
147
def db_prune_missing(conn):
    """Delete ``FileTable`` rows whose file path no longer exists on disk.

    Rows are fetched 1000 at a time.  The ids of missing files are collected
    first and deleted in one batch afterwards; a commit is issued only when
    something was deleted.
    """
    cursor = conn.execute("SELECT id, FilePath FROM FileTable")
    stale_ids = []
    # fetchmany() returns an empty list once the cursor is exhausted.
    for batch in iter(lambda: cursor.fetchmany(1000), []):
        for row_id, file_path in batch:
            if not Path(file_path).exists():
                stale_ids.append((row_id,))
    if stale_ids:
        conn.executemany("DELETE FROM FileTable WHERE id=?", stale_ids)
        conn.commit()
159
+
160
+
161
+ # --- Main logic ---
162
+
163
# Lower-case file suffixes accepted when only photo formats are requested.
PHOTO_EXTENSIONS = {".jpg", ".jpeg", ".png", ".heic", ".heif", ".webp", ".tiff", ".tif", ".bmp", ".gif"}


def find_files(root, photos_only=False):
    """Yield every regular, non-symlink file found recursively below root.

    Args:
        root: Directory to search.
        photos_only: When true, skip files whose lower-cased suffix is not
            in ``PHOTO_EXTENSIONS``.

    Yields:
        ``Path`` objects, in the order ``Path.rglob`` produces them.
    """
    for candidate in Path(root).rglob("*"):
        if not candidate.is_file() or candidate.is_symlink():
            continue
        if photos_only and candidate.suffix.lower() not in PHOTO_EXTENSIONS:
            continue
        yield candidate
173
+
174
+
175
def progress(current, total, label="Progress"):
    """Write an in-place percentage indicator to stderr.

    Args:
        current: Number of items processed so far.
        total: Total number of items.  A total of zero or less is reported
            as 100% instead of raising ``ZeroDivisionError``.
        label: Text printed in front of the percentage.

    The line starts with a carriage return so successive calls overwrite each
    other; a newline is emitted once ``current == total``.
    """
    pct = current * 100 // total if total > 0 else 100
    print(f"\r{label}: {pct}%", end="", file=sys.stderr, flush=True)
    if current == total:
        print(file=sys.stderr)
180
+
181
+
182
def load_entries(files, conn):
    """Hash the given files and return the ones that have an image hash.

    Args:
        files: Iterable of file paths to process.
        conn: Open database connection used as the hash cache.

    Returns:
        Dict mapping file path (str) to ``(filehash, imagehash)`` for every
        file whose image hash is known.  Files without an image hash are
        left out of the result but are still recorded in ``FileTable``.

    Side effects: writes progress messages to stderr, stores newly computed
    image hashes in ``HashValueTable`` and upserts every file into
    ``FileTable``.
    """
    file_strs = list(map(str, files))
    total = len(file_strs)
    print(f"Found {total} files", file=sys.stderr)

    # Step 1: hash file contents in worker processes.
    print("\rHashing files...", end="", file=sys.stderr, flush=True)
    with Pool() as pool:
        filehash_results = pool.map(_compute_filehash, file_strs)
    print("\rHashing files... done", file=sys.stderr)

    # filepath_str -> filehash
    filehash_map = dict(filehash_results)

    # Step 2: one batched lookup for image hashes already in the database.
    known = db_get_imagehashes(conn, set(filehash_map.values()))

    # Step 3: compute image hashes only for content not seen before.
    need_compute = [
        filepath_str
        for filepath_str, filehash in filehash_map.items()
        if filehash not in known
    ]
    print(f"{len(need_compute)} files need image hash computation", file=sys.stderr)
    if need_compute:
        print(f"\rComputing image hashes ({len(need_compute)} files)...", end="", file=sys.stderr, flush=True)
        with Pool() as pool:
            imagehash_results = pool.map(_compute_imagehash, need_compute)
        print(f"\rComputing image hashes ({len(need_compute)} files)... done", file=sys.stderr)

        # Remember the fresh hashes locally and persist them in one batch.
        new_pairs = [
            (filehash_map[filepath_str], imagehash)
            for filepath_str, imagehash in imagehash_results
            if imagehash is not None
        ]
        known.update(new_pairs)
        if new_pairs:
            db_set_imagehashes(conn, new_pairs)

    # Step 4: build the result and record every file in FileTable.
    entries = {}
    upsert_rows = []
    for filepath_str, filehash in filehash_map.items():
        imagehash = known.get(filehash)
        if imagehash is None:
            upsert_rows.append((filepath_str, filehash, None))
            continue
        upsert_rows.append((filepath_str, filehash, format(imagehash, "016x")))
        entries[filepath_str] = (filehash, imagehash)
    db_upsert_files(conn, upsert_rows)

    return entries
232
+
233
+
234
def find_duplicates(entries, threshold):
    """Group files whose image hashes are within ``threshold`` bits.

    Args:
        entries: Dict mapping file path to ``(filehash, imagehash)``.
        threshold: Maximum hamming distance for two hashes to match.

    Returns:
        List of sets of file paths.  Each set holds one file together with
        all files matching it; sets contained in an already kept set are
        dropped.  Returns an empty list for fewer than two entries.
    """
    filepaths = list(entries)
    n = len(filepaths)
    if n < 2:
        return []

    # Hashes in the same order as filepaths, so indices identify files.
    hashes = [imagehash for _filehash, imagehash in entries.values()]

    total_pairs = n * (n - 1) // 2
    print(f"\rComparing {total_pairs} pairs...", end="", file=sys.stderr, flush=True)

    # Hand every worker task a contiguous range of row indices.
    num_workers = min(8, n)
    chunk_size = max(1, n // num_workers)
    ranges = [
        (start, min(start + chunk_size, n), hashes, threshold)
        for start in range(0, n, chunk_size)
    ]

    with Pool() as pool:
        results = pool.map(_compare_range, ranges)
    print(f"\rComparing {total_pairs} pairs... done", file=sys.stderr)

    # For each matched file collect the set {file itself + its matches}.
    groups = {}
    for matches in results:
        for i, j in matches:
            first, second = filepaths[i], filepaths[j]
            for owner, partner in ((first, second), (second, first)):
                groups.setdefault(owner, {owner}).add(partner)

    # Largest sets first, so smaller sets can be recognised as subsets.
    unique_groups = []
    for candidate in sorted(groups.values(), key=len, reverse=True):
        if any(candidate <= kept for kept in unique_groups):
            continue
        unique_groups.append(candidate)

    return unique_groups
274
+
275
+
276
def view_groups(groups):
    """Open a tkinter window to browse duplicate groups side by side.

    Args:
        groups: Iterable of groups, each an iterable of file path strings.
            Nothing is shown when it is empty.

    Keys: Right/n/space next group, Left/p previous group, d/Delete gzip and
    remove the selected files, q/Escape quit.  Clicking an image toggles its
    selection.  Blocks until the window is closed.
    """
    import gzip
    import shutil
    import tkinter as tk

    from PIL import ImageTk

    groups = [sorted(g) for g in groups]
    if not groups:
        # Nothing to display; showing group 0 would raise IndexError.
        return
    cwd = Path.cwd()
    state = {"idx": 0, "photos": [], "frames": [], "selected": set()}

    def rel_path(filepath):
        """Return filepath relative to cwd, or unchanged when outside it."""
        # relative_to() compares whole path components, so a sibling such as
        # "<cwd>2/x.jpg" is not mistaken for a file below cwd.
        try:
            return str(Path(filepath).relative_to(cwd))
        except ValueError:
            return filepath

    def gzip_delete(filepath):
        """Compress file with gzip and remove the original."""
        with open(filepath, "rb") as f_in:
            with gzip.open(filepath + ".gz", "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        Path(filepath).unlink()

    root = tk.Tk()
    root.title("imgduptective viewer")
    root.geometry("1200x700")

    top_frame = tk.Frame(root)
    top_frame.pack(fill="x")
    label = tk.Label(top_frame, text="", font=("", 12))
    label.pack(side="left", padx=10)
    help_label = tk.Label(top_frame, text="←/→:navigate d:delete(gzip) q:quit", font=("", 9))
    help_label.pack(side="right", padx=10)

    canvas_frame = tk.Frame(root)
    canvas_frame.pack(fill="both", expand=True)

    def show_group():
        """Rebuild the image row for the current group and clear selection."""
        for w in canvas_frame.winfo_children():
            w.destroy()

        idx = state["idx"]
        group = groups[idx]
        label.config(text=f"Group {idx + 1}/{len(groups)} ({len(group)} files)")

        root.update_idletasks()
        n = len(group)
        max_w = max(100, (root.winfo_width() - 20) // n - 10)
        # Keep the height budget positive as well; a negative value would
        # produce a negative scale ratio and an invalid resize.
        max_h = max(100, root.winfo_height() - 100)

        # PhotoImage objects are kept in state so they stay referenced while
        # their labels are on screen.
        state["photos"] = []
        state["frames"] = []
        state["selected"] = set()

        for i, filepath in enumerate(group):
            frame = tk.Frame(canvas_frame, borderwidth=2, relief="groove")
            frame.pack(side="left", padx=5, pady=5, fill="both", expand=True)
            state["frames"].append(frame)

            img = image_open(filepath)
            if img is not None:
                ratio = min(max_w / img.size[0], max_h / img.size[1])
                if ratio < 1:
                    # Only ever shrink; never collapse a dimension to zero.
                    new_size = (
                        max(1, int(img.size[0] * ratio)),
                        max(1, int(img.size[1] * ratio)),
                    )
                    img = img.resize(new_size, Image.LANCZOS)
                photo = ImageTk.PhotoImage(img)
                state["photos"].append(photo)
                img_label = tk.Label(frame, image=photo)
                img_label.pack()
                # Bind i as a default so each label toggles its own index.
                img_label.bind("<Button-1>", lambda e, idx=i: toggle_select(idx))

            tk.Label(frame, text=rel_path(filepath), wraplength=max_w, font=("", 9)).pack()

    def toggle_select(i):
        """Flip the selection state of the i-th file of the current group."""
        if i in state["selected"]:
            state["selected"].discard(i)
            state["frames"][i].config(relief="groove", bg="#d9d9d9")
        else:
            state["selected"].add(i)
            state["frames"][i].config(relief="solid", bg="#ffcccc")

    def delete_selected(event=None):
        """Gzip and remove the selected files, then refresh the view."""
        idx = state["idx"]
        group = groups[idx]
        if not state["selected"]:
            return
        removed = set()
        for i in sorted(state["selected"]):
            filepath = group[i]
            try:
                gzip_delete(filepath)
            except Exception as exc:
                # Keep the file in the group: it is still on disk, and the
                # user should see that the deletion did not happen.
                print(f"Could not delete {filepath}: {exc}", file=sys.stderr)
            else:
                removed.add(i)
        # Drop only the files that were really removed.
        groups[idx] = [f for i, f in enumerate(group) if i not in removed]
        # A group with fewer than two files is no longer a duplicate group.
        if len(groups[idx]) < 2:
            groups.pop(idx)
        if not groups:
            root.destroy()
            return
        state["idx"] = state["idx"] % len(groups)
        show_group()

    def next_group(event=None):
        state["idx"] = (state["idx"] + 1) % len(groups)
        show_group()

    def prev_group(event=None):
        state["idx"] = (state["idx"] - 1) % len(groups)
        show_group()

    root.bind("<Right>", next_group)
    root.bind("<n>", next_group)
    root.bind("<space>", next_group)
    root.bind("<Left>", prev_group)
    root.bind("<p>", prev_group)
    root.bind("<d>", delete_selected)
    root.bind("<Delete>", delete_selected)
    root.bind("<q>", lambda e: root.destroy())
    root.bind("<Escape>", lambda e: root.destroy())

    show_group()
    root.mainloop()
400
+
401
+
402
def main():
    """Command-line entry point: find duplicate images below the current directory.

    Modes, selected by flags:
        --check  compare files below the current directory against the paths
                 already stored in FileTable and print the matches; this
                 branch only reads hashes from the database.
        --add    hash the files and store them without comparing.
        --exact  report files with identical content hashes.
        --stats  print per-directory duplicate counts instead of groups.
        --view   open the builtin viewer on the groups found.
    Without a mode flag, near-duplicate groups are printed, one group per
    line, tab separated.

    A threshold (maximum hamming distance) is required for every mode that
    compares image hashes, which includes --check in any combination.
    """
    parser = argparse.ArgumentParser(description="Detect near-duplicate images.")
    parser.add_argument("threshold", type=int, nargs="?", help="Maximum hamming distance to consider a match")
    parser.add_argument("--view", action="store_true", help="Open the builtin viewer to inspect duplicate groups")
    parser.add_argument("--stats", action="store_true", help="Show per-directory duplicate statistics")
    parser.add_argument("--check", action="store_true", help="Check what duplicates would be added without modifying the database")
    parser.add_argument("--add", action="store_true", help="Add hashes to the database without comparing")
    parser.add_argument("--photos", action="store_true", help="Only process common photo file formats")
    parser.add_argument("--exact", action="store_true", help="Show only exact file matches instead of similar")
    args = parser.parse_args()

    if args.threshold is None:
        # --check always compares hamming distances, so --add/--exact do not
        # lift the requirement; without this the comparison below would fail
        # with a TypeError on ``int <= None``.
        if args.check:
            parser.error("threshold is required with --check")
        if not args.add and not args.exact:
            parser.error("threshold is required unless --add or --exact is used")

    threshold = args.threshold
    conn = get_db()

    if args.check:
        # Image hashes of the paths already recorded in the database.
        existing = {}
        for row in conn.execute("SELECT FilePath, ImageHashValue FROM FileTable WHERE ImageHashValue IS NOT NULL").fetchall():
            existing[row[0]] = int(row[1], 16)

        files = list(find_files(Path.cwd(), args.photos))
        file_strs = [str(f) for f in files]

        with Pool() as pool:
            filehash_results = pool.map(_compute_filehash, file_strs)

        filehash_map = dict(filehash_results)
        known = db_get_imagehashes(conn, set(filehash_map.values()))

        # Hashes computed here stay in memory only; nothing is stored.
        need_compute = [fp for fp, fh in filehash_map.items() if fh not in known]
        if need_compute:
            with Pool() as pool:
                imagehash_results = pool.map(_compute_imagehash, need_compute)
            for fp, ih in imagehash_results:
                if ih is not None:
                    known[filehash_map[fp]] = ih

        new_files = {}
        for fp, fh in filehash_map.items():
            ih = known.get(fh)
            if ih is not None:
                new_files[fp] = ih

        found = False
        for new_path, new_hash in new_files.items():
            matches = [ex_path for ex_path, ex_hash in existing.items()
                       if ex_path != new_path and (new_hash ^ ex_hash).bit_count() <= threshold]
            if matches:
                if not found:
                    print("=== New duplicates if added ===")
                    found = True
                print(f"{new_path}\t->\t" + "\t".join(sorted(matches)))

        conn.close()
        return

    if args.add:
        files = list(find_files(Path.cwd(), args.photos))
        load_entries(files, conn)
        db_prune_missing(conn)
        conn.close()
        return

    files = list(find_files(Path.cwd(), args.photos))
    entries = load_entries(files, conn)
    db_prune_missing(conn)

    # Exact duplicates: files sharing the same content hash.
    hash_to_files = {}
    for filepath, (filehash, imagehash) in entries.items():
        hash_to_files.setdefault(filehash, []).append(filepath)
    exact_dupes = [sorted(v) for v in hash_to_files.values() if len(v) > 1]

    # Near-duplicate groups are skipped entirely in --exact mode.
    unique_groups = find_duplicates(entries, threshold) if not args.exact else []

    # --stats: only show statistics
    if args.stats:
        total = len(entries)
        dir_total = Counter()
        for filepath in entries:
            dir_total[str(Path(filepath).parent)] += 1

        if args.exact:
            dupes = sum(len(g) for g in exact_dupes)
            print(f"Exact duplicates: {dupes}/{total}")
            dir_dupes = Counter()
            for group in exact_dupes:
                for filepath in group:
                    dir_dupes[str(Path(filepath).parent)] += 1
        else:
            # A file can sit in several groups; count each file once.
            all_dupes = {fp for group in unique_groups for fp in group}
            print(f"Near-duplicates: {len(all_dupes)}/{total}")
            dir_dupes = Counter()
            for filepath in all_dupes:
                dir_dupes[str(Path(filepath).parent)] += 1
        if dir_dupes:
            for dirname, count in dir_dupes.most_common():
                print(f"{count}/{dir_total[dirname]}\t{dirname}")
        conn.close()
        return

    # --view: open builtin viewer
    groups_to_view = exact_dupes if args.exact else unique_groups
    if args.view:
        if groups_to_view:
            view_groups(groups_to_view)
        conn.close()
        return

    # Output groups
    if args.exact:
        if exact_dupes:
            print("=== Exact duplicates ===")
            for files in sorted(exact_dupes, key=lambda x: x[0]):
                print("\t".join(files))
    else:
        if unique_groups:
            print("=== Near-duplicate groups ===")
            for group in unique_groups:
                print("\t".join(sorted(group)))

    conn.close()


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "imgduptective"
3
+ version = "0.1.0"
4
+ description = "Image Duplicates Detective"
5
+ authors = [
6
+ { name = "sacha", email = "sachahony@gmail.com" },
7
+ { name = "Sacha Hony", email = "zazahohonini@gmail.com" },
8
+ ]
9
+ dependencies = [
10
+ "Pillow",
11
+ ]
12
+ requires-python = ">=3.10"
13
+ readme = "README.md"
14
+ classifiers = [
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Programming Language :: Python :: 3",
18
+ ]
19
+
20
+ [project.license]
21
+ text = "MIT"
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/zazaho/imgduptective"
25
+
26
+ [project.scripts]
27
+ imgduptective = "imgduptective.imgduptective:main"
28
+
29
+ [build-system]
30
+ requires = [
31
+ "pdm-backend",
32
+ ]
33
+ build-backend = "pdm.backend"
34
+
35
+ [tool]