imgduptective 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: imgduptective
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Image Duplicates Detective
|
|
5
|
+
Author-Email: sacha <sachahony@gmail.com>, Sacha Hony <zazahohonini@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Project-URL: Homepage, https://github.com/zazaho/imgduptective
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: Pillow
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Image Duplicates Detective (imgduptective)
|
|
16
|
+
|
|
17
|
+
# Description
|
|
File without changes
|
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""imgduptective - non-interactive duplicate image detector using gradient horizontal hash."""
|
|
3
|
+
import argparse
|
|
4
|
+
import hashlib
|
|
5
|
+
import sqlite3
|
|
6
|
+
import sys
|
|
7
|
+
from collections import Counter
|
|
8
|
+
from multiprocessing import Pool
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from PIL import Image
|
|
12
|
+
|
|
13
|
+
# --- Image handling ---
|
|
14
|
+
|
|
15
|
+
# Key under which image_open() looks up the orientation in the EXIF data.
_EXIF_ORIENTATION_CODE = 274

# Orientation value -> Pillow transpose operation that image_open() applies.
# Values 2..8 are covered; any other value leaves the image untouched.
_EXIF_ORIENTATION_CORRECTION_MAPPING = dict(
    zip(
        range(2, 9),
        (
            Image.FLIP_LEFT_RIGHT,
            Image.ROTATE_180,
            Image.FLIP_TOP_BOTTOM,
            Image.TRANSPOSE,
            Image.ROTATE_270,
            Image.TRANSVERSE,
            Image.ROTATE_90,
        ),
    )
)

# Lookup table with one entry per 16-bit sample value (value / 256); used by
# image_open() to bring mode "I" PNG data into the 8-bit "L" range.
_TABLE16 = [level / 256 for level in range(1 << 16)]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def image_open(fn):
    """Open an image with Pillow and apply its EXIF orientation.

    Args:
        fn: Path (or anything ``Image.open`` accepts) of the file to open.

    Returns:
        The opened image, transposed according to its EXIF orientation when
        one is present and listed in ``_EXIF_ORIENTATION_CORRECTION_MAPPING``,
        or ``None`` when the file cannot be opened as an image.
    """
    try:
        img = Image.open(fn)
        # Mode "I" PNGs holding samples above 255 are scaled through _TABLE16
        # into an 8-bit "L" image. getextrema() yields (min, max) for a
        # single-band image without materialising every pixel in Python.
        if img.format == "PNG" and img.mode == "I" and img.getextrema()[1] > 255:
            img = img.point(_TABLE16, "L")
    except Exception:
        # Callers feed arbitrary files (not only images) through this function,
        # and Pillow signals unreadable input with many unrelated exception
        # types, so any failure simply means "not an image we can use".
        return None
    try:
        # getexif() is the public accessor and exists on every image class;
        # the private _getexif() is only defined by some format plugins.
        orientation = img.getexif().get(_EXIF_ORIENTATION_CODE)
        correction = _EXIF_ORIENTATION_CORRECTION_MAPPING.get(orientation)
        # Compare against None: a transpose constant may itself be 0.
        if correction is not None:
            img = img.transpose(correction)
    except Exception:
        # Best effort: malformed EXIF data must not prevent hashing the
        # image, so fall back to the orientation as stored.
        pass
    return img
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# --- Hash functions ---
|
|
47
|
+
|
|
48
|
+
def dhash_horizontal(img):
    """Compute a 64-bit horizontal difference hash of an image.

    The image is converted to greyscale and box-resized to 9x8 pixels. Bit
    ``y * 8 + x`` of the result is set when pixel ``(x + 1, y)`` is brighter
    than pixel ``(x, y)``.

    Args:
        img: A Pillow image.

    Returns:
        The hash as an int, or ``None`` when the image cannot be converted
        or resized.
    """
    try:
        i9x8 = img.convert("L").resize((9, 8), Image.BOX)
    except Exception:
        # Only ordinary errors mean "unhashable image"; KeyboardInterrupt and
        # SystemExit must still propagate so the run can be aborted.
        return None
    h = 0
    for y in range(8):
        for x in range(8):
            if i9x8.getpixel((x + 1, y)) > i9x8.getpixel((x, y)):
                h |= 1 << (y * 8 + x)
    return h
|
|
59
|
+
|
|
60
|
+
# --- Worker functions for multiprocessing ---
|
|
61
|
+
|
|
62
|
+
def _compute_filehash(filepath_str):
    """Return ``(filepath_str, sha1_hex)`` for the file's raw bytes.

    The file is read in 64 KiB pieces so large files are never held in
    memory as a whole. The path is echoed back so results coming out of a
    worker pool can be turned into a mapping directly.
    """
    digest = hashlib.sha1()
    with open(filepath_str, "rb") as stream:
        for piece in iter(lambda: stream.read(65536), b""):
            digest.update(piece)
    return filepath_str, digest.hexdigest()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _compute_imagehash(filepath_str):
    """Return ``(filepath_str, image_hash)`` for one file.

    The hash is ``None`` when ``image_open`` cannot open the file or when
    ``dhash_horizontal`` cannot hash the opened image.
    """
    opened = image_open(filepath_str)
    imagehash = dhash_horizontal(opened) if opened is not None else None
    return filepath_str, imagehash
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _compare_range(args):
|
|
78
|
+
"""Compare a range of the triangular matrix."""
|
|
79
|
+
row_start, row_end, hashes, threshold = args
|
|
80
|
+
matches = []
|
|
81
|
+
for i in range(row_start, row_end):
|
|
82
|
+
h_i = hashes[i]
|
|
83
|
+
for j in range(i + 1, len(hashes)):
|
|
84
|
+
if (h_i ^ hashes[j]).bit_count() <= threshold:
|
|
85
|
+
matches.append((i, j))
|
|
86
|
+
return matches
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# --- Database ---
|
|
90
|
+
|
|
91
|
+
# Location of the SQLite cache, below the user's home directory.
DB_PATH = Path.home().joinpath(".config", "imgduptective", "imgduptective.db")


def get_db():
    """Open the SQLite database at ``DB_PATH`` and return the connection.

    The parent directory and both tables are created when missing:
    ``HashValueTable`` (one row per unique ``FileHash`` with its
    ``ImageHashValue``) and ``FileTable`` (one row per unique ``FilePath``
    with its ``FileHash`` and an optional ``ImageHashValue``).
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    schema = (
        "CREATE TABLE IF NOT EXISTS HashValueTable "
        "(id integer PRIMARY KEY, FileHash text NOT NULL UNIQUE, "
        "ImageHashValue text NOT NULL)",
        "CREATE TABLE IF NOT EXISTS FileTable "
        "(id integer PRIMARY KEY, FilePath text NOT NULL UNIQUE, "
        "FileHash text NOT NULL, ImageHashValue text)",
    )
    for statement in schema:
        conn.execute(statement)
    conn.commit()
    return conn
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def db_get_imagehashes(conn, filehashes):
    """Look up stored image hashes for many file hashes at once.

    Args:
        conn: Open SQLite connection containing ``HashValueTable``.
        filehashes: Iterable of file hash strings.

    Returns:
        Dict mapping each file hash found in the table to its image hash,
        parsed from the stored hexadecimal text into an int. File hashes
        without a row are simply absent.
    """
    pending = list(filehashes)
    found = {}
    # Query in slices of 500 -- presumably to stay below SQLite's limit on
    # bound parameters per statement.
    step = 500
    for offset in range(0, len(pending), step):
        window = pending[offset:offset + step]
        marks = ",".join(["?"] * len(window))
        query = f"SELECT FileHash, ImageHashValue FROM HashValueTable WHERE FileHash IN ({marks})"
        for filehash, imagehash_hex in conn.execute(query, window):
            found[filehash] = int(imagehash_hex, 16)
    return found
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def db_set_imagehashes(conn, pairs):
    """Store ``(filehash, imagehash_int)`` pairs in ``HashValueTable``.

    Image hashes are written as 16-digit zero-padded lowercase hex text.
    A file hash that already has a row keeps its existing value
    (``INSERT OR IGNORE``). The transaction is committed before returning.
    """
    records = []
    for filehash, imagehash in pairs:
        records.append((filehash, f"{imagehash:016x}"))
    conn.executemany(
        "INSERT OR IGNORE INTO HashValueTable (FileHash, ImageHashValue) VALUES(?, ?)",
        records,
    )
    conn.commit()
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def db_upsert_files(conn, rows):
    """Insert or refresh ``FileTable`` rows in one batch.

    ``rows`` is a list of ``(filepath, filehash, imagehash_hex_or_none)``
    tuples. A path that is already present has its ``FileHash`` and
    ``ImageHashValue`` overwritten with the new values. The transaction is
    committed before returning.
    """
    statement = (
        "INSERT INTO FileTable (FilePath, FileHash, ImageHashValue) VALUES(?, ?, ?) "
        "ON CONFLICT(FilePath) DO UPDATE SET FileHash=excluded.FileHash, ImageHashValue=excluded.ImageHashValue"
    )
    conn.executemany(statement, rows)
    conn.commit()
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def db_prune_missing(conn):
    """Delete ``FileTable`` rows whose ``FilePath`` no longer exists on disk.

    Rows are read 1000 at a time; the ids of vanished files are collected
    first and removed afterwards in a single batch, followed by a commit.
    Nothing is written when every recorded file still exists.
    """
    cursor = conn.execute("SELECT id, FilePath FROM FileTable")
    stale_ids = []
    # fetchmany() returns an empty list once the cursor is exhausted.
    for batch in iter(lambda: cursor.fetchmany(1000), []):
        for row_id, filepath in batch:
            if not Path(filepath).exists():
                stale_ids.append(row_id)
    if stale_ids:
        conn.executemany("DELETE FROM FileTable WHERE id=?", [(row_id,) for row_id in stale_ids])
        conn.commit()
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# --- Main logic ---
|
|
162
|
+
|
|
163
|
+
# Lower-case file suffixes accepted when only photo formats are requested.
PHOTO_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".heic",
    ".heif",
    ".webp",
    ".tiff",
    ".tif",
    ".bmp",
    ".gif",
}


def find_files(root, photos_only=False):
    """Yield every regular, non-symlink file below ``root``, recursively.

    With ``photos_only`` set, files whose lower-cased suffix is not listed
    in ``PHOTO_EXTENSIONS`` are left out.
    """
    for candidate in Path(root).rglob("*"):
        if candidate.is_symlink() or not candidate.is_file():
            continue
        if not photos_only or candidate.suffix.lower() in PHOTO_EXTENSIONS:
            yield candidate
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def progress(current, total, label="Progress"):
    """Write a one-line percentage indicator to stderr.

    The output starts with a carriage return so successive calls overwrite
    each other on a terminal; a newline is written once ``current`` equals
    ``total``.

    Args:
        current: Number of items handled so far.
        total: Number of items overall. Zero or a negative value is
            reported as 100%.
        label: Text printed in front of the percentage.
    """
    # An empty job has nothing left to do: show it as complete rather than
    # dividing by zero.
    pct = current * 100 // total if total > 0 else 100
    print(f"\r{label}: {pct}%", end="", file=sys.stderr, flush=True)
    if current == total:
        print(file=sys.stderr)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def load_entries(files, conn):
    """Hash the given files and return those that have an image hash.

    File hashes are computed in a worker pool. Image hashes already stored
    in ``HashValueTable`` are reused; the rest are computed in a second
    worker pool and stored. Every file is then upserted into ``FileTable``,
    with a NULL image hash when none could be computed.

    Args:
        files: Iterable of paths to process.
        conn: Open SQLite connection as returned by ``get_db``.

    Returns:
        Dict mapping ``filepath_str`` to ``(filehash, imagehash)`` for every
        file that has an image hash. Files without one are omitted.
    """
    file_strs = [str(f) for f in files]
    total = len(file_strs)
    print(f"Found {total} files", file=sys.stderr)

    # Step 1: compute file hashes in parallel
    print("\rHashing files...", end="", file=sys.stderr, flush=True)
    with Pool() as pool:
        filehash_results = pool.map(_compute_filehash, file_strs)
    print("\rHashing files... done", file=sys.stderr)

    # filepath_str -> filehash
    filehash_map = dict(filehash_results)

    # Step 2: batch DB lookup for existing imagehashes
    known = db_get_imagehashes(conn, set(filehash_map.values()))

    # Step 3: compute missing imagehashes in parallel
    need_compute = [fp for fp, fh in filehash_map.items() if fh not in known]
    print(f"{len(need_compute)} files need image hash computation", file=sys.stderr)
    if need_compute:
        # Byte-identical files necessarily share one image hash, so decode
        # only one representative path per file hash. Exact copies are the
        # common case for this tool and image decoding is the costly step.
        representatives = {}
        for fp in need_compute:
            representatives.setdefault(filehash_map[fp], fp)

        print(f"\rComputing image hashes ({len(need_compute)} files)...", end="", file=sys.stderr, flush=True)
        with Pool() as pool:
            imagehash_results = pool.map(_compute_imagehash, list(representatives.values()))
        print(f"\rComputing image hashes ({len(need_compute)} files)... done", file=sys.stderr)

        # Store new hashes in DB (batch); files that could not be hashed as
        # images come back as None and are skipped.
        new_pairs = [
            (filehash_map[fp], imagehash)
            for fp, imagehash in imagehash_results
            if imagehash is not None
        ]
        known.update(new_pairs)
        if new_pairs:
            db_set_imagehashes(conn, new_pairs)

    # Step 4: build entries and batch upsert to FileTable
    entries = {}
    upsert_rows = []
    for filepath_str, filehash in filehash_map.items():
        imagehash = known.get(filehash)
        imagehash_hex = format(imagehash, "016x") if imagehash is not None else None
        upsert_rows.append((filepath_str, filehash, imagehash_hex))
        if imagehash is not None:
            entries[filepath_str] = (filehash, imagehash)
    db_upsert_files(conn, upsert_rows)

    return entries
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def find_duplicates(entries, threshold):
    """Group files whose image hashes differ in at most ``threshold`` bits.

    ``entries`` maps a file path to ``(filehash, imagehash)``. All pairs are
    compared in a worker pool. For each file that has at least one match,
    the set made of the file and its direct matches forms a candidate group;
    candidates contained in an already kept (larger or equal) group are
    dropped.

    Returns:
        List of sets of file paths, largest first. Empty when fewer than two
        entries are given.
    """
    filepaths = list(entries)
    n = len(filepaths)
    # Image hashes in the same order as filepaths, so indices line up.
    hashes = [value[1] for value in entries.values()]

    if n < 2:
        return []

    total_pairs = n * (n - 1) // 2
    print(f"\rComparing {total_pairs} pairs...", end="", file=sys.stderr, flush=True)

    # Cut the row indices into roughly equal slices, one task per slice.
    num_workers = min(8, n)
    chunk_size = max(1, n // num_workers)
    tasks = [
        (start, min(start + chunk_size, n), hashes, threshold)
        for start in range(0, n, chunk_size)
    ]

    with Pool() as pool:
        results = pool.map(_compare_range, tasks)
    print(f"\rComparing {total_pairs} pairs... done", file=sys.stderr)

    # Translate index pairs back to paths: each file collects itself plus
    # every file it matched directly.
    neighbours = {}
    for first, second in (pair for matches in results for pair in matches):
        a, b = filepaths[first], filepaths[second]
        neighbours.setdefault(a, {a}).add(b)
        neighbours.setdefault(b, {b}).add(a)

    # Keep a candidate only when no kept group already contains it.
    unique_groups = []
    for candidate in sorted(neighbours.values(), key=len, reverse=True):
        if any(candidate <= kept for kept in unique_groups):
            continue
        unique_groups.append(candidate)

    return unique_groups
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def view_groups(groups):
    """Open a simple tkinter viewer to browse duplicate groups side by side.

    Each group is shown as a row of thumbnails with the file path below.
    Clicking a thumbnail toggles its selection; ``d``/``Delete`` gzips the
    selected files next to the originals (``<name>.gz``) and removes the
    originals. Groups left with fewer than two files are dropped, and the
    window closes when no group remains.

    Args:
        groups: Iterable of groups, each an iterable of file path strings.
            Nothing is shown when it is empty.
    """
    import gzip
    import shutil
    import tkinter as tk

    from PIL import ImageTk

    groups = [sorted(g) for g in groups]
    if not groups:
        # Nothing to browse; opening the window would fail on groups[0].
        return
    cwd = Path.cwd()
    state = {"idx": 0}

    def rel_path(filepath):
        """Return the path relative to cwd, or unchanged if outside it."""
        # relative_to() compares whole path components, so a sibling such as
        # "/a/photos2" is not mistaken for a child of cwd "/a/photos".
        try:
            return str(Path(filepath).relative_to(cwd))
        except ValueError:
            return filepath

    def gzip_delete(filepath):
        """Compress file with gzip and remove the original."""
        with open(filepath, "rb") as f_in, gzip.open(filepath + ".gz", "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Only reached when the archive was written without error.
        Path(filepath).unlink()

    root = tk.Tk()
    root.title("imgduptective viewer")
    root.geometry("1200x700")

    top_frame = tk.Frame(root)
    top_frame.pack(fill="x")
    label = tk.Label(top_frame, text="", font=("", 12))
    label.pack(side="left", padx=10)
    help_label = tk.Label(top_frame, text="←/→:navigate d:delete(gzip) q:quit", font=("", 9))
    help_label.pack(side="right", padx=10)

    canvas_frame = tk.Frame(root)
    canvas_frame.pack(fill="both", expand=True)

    def show_group():
        """Rebuild the thumbnail row for the current group."""
        for w in canvas_frame.winfo_children():
            w.destroy()

        idx = state["idx"]
        group = groups[idx]
        label.config(text=f"Group {idx + 1}/{len(groups)} ({len(group)} files)")

        # Make sure the geometry is up to date before reading the window size.
        root.update_idletasks()
        n = len(group)
        max_w = max(100, (root.winfo_width() - 20) // n - 10)
        max_h = root.winfo_height() - 100

        # Keep references to the PhotoImage objects so they are not garbage
        # collected while displayed.
        state["photos"] = []
        state["frames"] = []
        state["selected"] = set()

        for i, filepath in enumerate(group):
            frame = tk.Frame(canvas_frame, borderwidth=2, relief="groove")
            frame.pack(side="left", padx=5, pady=5, fill="both", expand=True)
            state["frames"].append(frame)

            img = image_open(filepath)
            if img:
                # Only ever shrink; small images are shown at native size.
                ratio = min(max_w / img.size[0], max_h / img.size[1])
                if ratio < 1:
                    img = img.resize((int(img.size[0] * ratio), int(img.size[1] * ratio)), Image.LANCZOS)
                photo = ImageTk.PhotoImage(img)
                state["photos"].append(photo)
                img_label = tk.Label(frame, image=photo)
                img_label.pack()
                # Bind i as a default so every label toggles its own index.
                img_label.bind("<Button-1>", lambda e, idx=i: toggle_select(idx))

            tk.Label(frame, text=rel_path(filepath), wraplength=max_w, font=("", 9)).pack()

    def toggle_select(i):
        """Flip the selection state of the i-th file in the current group."""
        if i in state["selected"]:
            state["selected"].discard(i)
            state["frames"][i].config(relief="groove", bg="#d9d9d9")
        else:
            state["selected"].add(i)
            state["frames"][i].config(relief="solid", bg="#ffcccc")

    def delete_selected(event=None):
        """Gzip-and-remove the selected files, then refresh the view."""
        idx = state["idx"]
        group = groups[idx]
        if not state["selected"]:
            return
        removed = set()
        for i in sorted(state["selected"]):
            filepath = group[i]
            try:
                gzip_delete(filepath)
            except OSError as err:
                # Keep the file in the group: it still exists on disk, and
                # the user should see that the deletion did not happen.
                print(f"Could not delete {filepath}: {err}", file=sys.stderr)
            else:
                removed.add(i)
        # Remove deleted from group
        groups[idx] = [f for i, f in enumerate(group) if i not in removed]
        # Remove groups that no longer contain a pair
        if len(groups[idx]) < 2:
            groups.pop(idx)
        if not groups:
            root.destroy()
            return
        state["idx"] = state["idx"] % len(groups)
        show_group()

    def next_group(event=None):
        state["idx"] = (state["idx"] + 1) % len(groups)
        show_group()

    def prev_group(event=None):
        state["idx"] = (state["idx"] - 1) % len(groups)
        show_group()

    root.bind("<Right>", next_group)
    root.bind("<n>", next_group)
    root.bind("<space>", next_group)
    root.bind("<Left>", prev_group)
    root.bind("<p>", prev_group)
    root.bind("<d>", delete_selected)
    root.bind("<Delete>", delete_selected)
    root.bind("<q>", lambda e: root.destroy())
    root.bind("<Escape>", lambda e: root.destroy())

    show_group()
    root.mainloop()
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _check_new_duplicates(conn, threshold, photos_only):
    """Print files below cwd that are near-duplicates of files in FileTable.

    Hashes are computed (or read from ``HashValueTable``) but nothing is
    written to the database.
    """
    existing = {
        filepath: int(imagehash_hex, 16)
        for filepath, imagehash_hex in conn.execute(
            "SELECT FilePath, ImageHashValue FROM FileTable WHERE ImageHashValue IS NOT NULL"
        )
    }

    file_strs = [str(f) for f in find_files(Path.cwd(), photos_only)]
    with Pool() as pool:
        filehash_results = pool.map(_compute_filehash, file_strs)

    filehash_map = dict(filehash_results)
    known = db_get_imagehashes(conn, set(filehash_map.values()))

    need_compute = [fp for fp, fh in filehash_map.items() if fh not in known]
    if need_compute:
        with Pool() as pool:
            imagehash_results = pool.map(_compute_imagehash, need_compute)
        for fp, ih in imagehash_results:
            if ih is not None:
                known[filehash_map[fp]] = ih

    found = False
    for new_path, fh in filehash_map.items():
        new_hash = known.get(fh)
        if new_hash is None:
            continue
        matches = [
            ex_path
            for ex_path, ex_hash in existing.items()
            if ex_path != new_path and (new_hash ^ ex_hash).bit_count() <= threshold
        ]
        if matches:
            if not found:
                print("=== New duplicates if added ===")
                found = True
            print(f"{new_path}\t->\t" + "\t".join(sorted(matches)))


def _print_stats(entries, duplicate_paths, label):
    """Print the overall duplicate count and a per-directory breakdown."""
    dir_total = Counter(str(Path(fp).parent) for fp in entries)
    dir_dupes = Counter(str(Path(fp).parent) for fp in duplicate_paths)
    print(f"{label}: {len(duplicate_paths)}/{len(entries)}")
    for dirname, count in dir_dupes.most_common():
        print(f"{count}/{dir_total[dirname]}\t{dirname}")


def _run_command(args, conn):
    """Execute the mode selected on the command line using an open DB."""
    if args.check:
        _check_new_duplicates(conn, args.threshold, args.photos)
        return

    files = list(find_files(Path.cwd(), args.photos))
    entries = load_entries(files, conn)
    db_prune_missing(conn)
    if args.add:
        # --add only fills the database; no comparison or report.
        return

    # Exact duplicates: files sharing the same filehash
    hash_to_files = {}
    for filepath, (filehash, _imagehash) in entries.items():
        hash_to_files.setdefault(filehash, []).append(filepath)
    exact_dupes = [sorted(v) for v in hash_to_files.values() if len(v) > 1]

    # Near-duplicate groups (skipped for --exact, which needs no threshold)
    unique_groups = [] if args.exact else find_duplicates(entries, args.threshold)

    # --stats: only show statistics
    if args.stats:
        if args.exact:
            # Exact groups are disjoint, so the flat list has no repeats.
            _print_stats(entries, [fp for group in exact_dupes for fp in group], "Exact duplicates")
        else:
            _print_stats(entries, {fp for group in unique_groups for fp in group}, "Near-duplicates")
        return

    # --view: open builtin viewer
    if args.view:
        groups_to_view = exact_dupes if args.exact else unique_groups
        if groups_to_view:
            view_groups(groups_to_view)
        return

    # Output groups
    if args.exact:
        if exact_dupes:
            print("=== Exact duplicates ===")
            for files in sorted(exact_dupes, key=lambda x: x[0]):
                print("\t".join(files))
    elif unique_groups:
        print("=== Near-duplicate groups ===")
        for group in unique_groups:
            print("\t".join(sorted(group)))


def main():
    """Command-line entry point: parse arguments and run the chosen mode.

    Works on the files below the current working directory. The database
    connection is closed on every exit path, including errors.
    """
    parser = argparse.ArgumentParser(description="Detect near-duplicate images.")
    parser.add_argument("threshold", type=int, nargs="?", help="Maximum hamming distance to consider a match")
    parser.add_argument("--view", action="store_true", help="Open the builtin viewer to inspect duplicate groups")
    parser.add_argument("--stats", action="store_true", help="Show per-directory duplicate statistics")
    parser.add_argument("--check", action="store_true", help="Check what duplicates would be added without modifying the database")
    parser.add_argument("--add", action="store_true", help="Add hashes to the database without comparing")
    parser.add_argument("--photos", action="store_true", help="Only process common photo file formats")
    parser.add_argument("--exact", action="store_true", help="Show only exact file matches instead of similar")
    args = parser.parse_args()

    if args.threshold is None:
        # --check always compares hamming distances, so it needs a threshold
        # even when combined with --add or --exact.
        if args.check:
            parser.error("threshold is required with --check")
        if not args.add and not args.exact:
            parser.error("threshold is required unless --add or --exact is used")

    conn = get_db()
    try:
        _run_command(args, conn)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "imgduptective"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Image Duplicates Detective"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "sacha", email = "sachahony@gmail.com" },
|
|
7
|
+
{ name = "Sacha Hony", email = "zazahohonini@gmail.com" },
|
|
8
|
+
]
|
|
9
|
+
dependencies = [
|
|
10
|
+
"Pillow",
|
|
11
|
+
]
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.license]
|
|
21
|
+
text = "MIT"
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://github.com/zazaho/imgduptective"
|
|
25
|
+
|
|
26
|
+
[project.scripts]
|
|
27
|
+
imgduptective = "imgduptective.imgduptective:main"
|
|
28
|
+
|
|
29
|
+
[build-system]
|
|
30
|
+
requires = [
|
|
31
|
+
"pdm-backend",
|
|
32
|
+
]
|
|
33
|
+
build-backend = "pdm.backend"
|
|
34
|
+
|
|
35
|
+
[tool]
|