skilleter-thingy 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skilleter-thingy might be problematic. Click here for more details.

@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env python3
2
+ """Hash photos to find closely-similar images and report them"""
3
+
4
+ # TODO: Configurable hash type and stored in pickle data
5
+ # TODO: Option to pickle duplicates
6
+ # TODO: Check for files that are no longer present after scanning and remove them from the data
7
+ # TODO: Display duplicates - borrow code from imagedup
8
+ # TODO: Note that if a==b, we get reports for a==b and b==a so we need to do some filtering and decide on what's likely to be an original and what is a duplicate
9
+ # TODO: Maybe store file date, size and image dimensions in 'updated' for better comparisons
10
+ # TODO: Pickle data named for directories
11
+ # TODO: Use database instead of pickling - very slow to load big pickles - would facilitate scanning/checking specific directories
12
+ # TODO: If directories are specified, then only scan and compare within those directories (but keep everything in the pickle file (yes, it will get biggerer, and biggererer))
13
+
14
+ import threading
15
+ import queue
16
+ import sys
17
+ import os
18
+ import pickle
19
+ import argparse
20
+ from pathlib import PurePath
21
+ from typing import List, Union
22
+
23
+ from imagededup.methods import PHash
24
+
25
+ import matplotlib.gridspec as gridspec
26
+ import matplotlib.pyplot as plt
27
+ from matplotlib import figure
28
+
29
+ import numpy as np
30
+ from PIL import Image
31
+
32
+ ################################################################################
33
+
34
+ PICKLE_FILE = 'imagedupe.pickle'
35
+
36
+ IMAGE_EXT_LIST = ('.jpeg', '.jpg', '.png', '.bmp', '.mpo', '.ppm', '.tif', '.tiff', '.gif', '.svg', '.pgm', '.pbm', '.webp')
37
+
38
+ PICKLE_VERSION = 1
39
+
40
+ NUM_THREADS = 24
41
+
42
+ ################################################################################
43
+
44
def queue_files_to_hash(file_queue, directories):
    """Walk every specified directory tree and put each file found onto file_queue."""

    for directory in directories:
        print(f'Scanning directory tree {directory}')

        for root, _, names in os.walk(directory):
            print(f'Scanning directory {root}')

            # Queue the full path of every file in this directory
            for name in names:
                file_queue.put(os.path.join(root, name))
57
+
58
+ ################################################################################
59
+
60
def hasher_thread(method, file_queue, hash_queue, hashes, updated):
    """Worker thread: drain file_queue, hashing any image file that is new or whose
    modification time has changed, and post the result on hash_queue.

    Args:
        method: hashing backend providing encode_image(image_file=...) (e.g. imagededup PHash).
        file_queue: queue.Queue of file paths to examine.
        hash_queue: queue.Queue receiving dicts with 'filepath', 'encoding' and 'updated' keys.
        hashes: mapping of filepath -> previously-calculated hash.
        updated: mapping of filepath -> modification time when the hash was calculated.
    """

    while True:
        # Use get_nowait() rather than the empty()/get() pair: with several worker
        # threads, another worker can take the last item between the empty() check
        # and the blocking get(), leaving this thread hung forever (and the caller's
        # join() with it). queue.Empty cleanly signals that the work is done.
        try:
            filepath = file_queue.get_nowait()
        except queue.Empty:
            break

        fileext = os.path.splitext(filepath)[1]
        mod_time = os.path.getmtime(filepath)

        # Only hash image files that are new or whose modification time no longer
        # matches the one recorded when they were last hashed

        if fileext.lower() in IMAGE_EXT_LIST and (filepath not in hashes or mod_time != updated[filepath]):
            print(f'Calculating hash for {filepath}')
            encoding = method.encode_image(image_file=filepath)

            if encoding:
                hash_queue.put({'filepath': filepath, 'encoding': encoding, 'updated': mod_time})
            else:
                # encode_image returned a falsy value - unreadable/invalid image
                print(f'Invalid image {filepath}')

        file_queue.task_done()
85
+
86
+ ################################################################################
87
+
88
def hash_directories(directories, method, hashes, updated):
    """Scan the given directory trees and add or refresh hashes for their images.

    Args:
        directories: iterable of directory paths to walk.
        method: hashing backend passed through to the worker threads.
        hashes: filepath -> hash mapping, updated in place.
        updated: filepath -> modification-time mapping, updated in place.
    """

    # Work queues: file paths in, hash results out
    file_queue = queue.Queue()
    hash_queue = queue.Queue()

    queue_files_to_hash(file_queue, directories)

    # Spin up the pool of hashing workers
    workers = [
        threading.Thread(target=hasher_thread, daemon=True,
                         args=(method, file_queue, hash_queue, hashes, updated))
        for _ in range(NUM_THREADS)
    ]

    for worker in workers:
        worker.start()

    print('Waiting for threads to terminate')

    for worker in workers:
        worker.join()

    # Fold the results back into the caller's tables
    while not hash_queue.empty():
        result = hash_queue.get()
        hashes[result['filepath']] = result['encoding']
        updated[result['filepath']] = result['updated']
123
+
124
+ ################################################################################
125
+
126
def plot_images(
    orig: str,
    image_list: List,
    scores: bool = False,
    outfile: Union[str, None] = None,
) -> None:
    """
    Plot an original image on the top row with its duplicates in a grid below it.

    Args:
        orig: filename for which duplicates are to be plotted.
        image_list: List of duplicate filenames, could also be with scores (filename, score).
        scores: Whether only filenames are present in the image_list or scores as well.
        outfile: Name of the file to save the plot, or None to only display it.
    """

    def formatter(val: Union[int, np.float32]):
        """
        For printing floats only upto 3rd precision. Ints are unchanged.
        """
        if isinstance(val, np.float32):
            return f'{val:.3f}'

        return val

    n_ims = len(image_list)
    ncols = 4  # fixed for a consistent layout
    # One extra row on top of the grid for the original image
    nrows = int(np.ceil(n_ims / ncols)) + 1
    # NOTE(review): this Figure object is only passed to gs.tight_layout() below;
    # the axes are created on pyplot's implicit current figure via plt.subplot(),
    # so the two may refer to different figures - confirm before restructuring.
    fig = figure.Figure(figsize=(10, 14))

    gs = gridspec.GridSpec(nrows=nrows, ncols=ncols)
    ax = plt.subplot(
        gs[0, 1:3]
    )  # Always plot the original image in the middle of top row
    ax.imshow(Image.open(orig))
    ax.set_title(f'Original Image: {format(orig)}')
    ax.axis('off')

    for i in range(0, n_ims):
        # Duplicates fill the grid left-to-right, top-to-bottom, below row 0
        row_num = (i // ncols) + 1
        col_num = i % ncols

        ax = plt.subplot(gs[row_num, col_num])
        if scores:
            # image_list entries are (filename, score) pairs
            ax.imshow(Image.open(image_list[i][0]))
            val = formatter(image_list[i][1])
            title = ' '.join([image_list[i][0], f'({val})'])
        else:
            ax.imshow(Image.open(image_list[i]))
            title = image_list[i]

        ax.set_title(title, fontsize=6)
        ax.axis('off')
        gs.tight_layout(fig)

    if outfile:
        plt.savefig(outfile)

    plt.show()
    plt.close()
186
+
187
+ ################################################################################
188
+
189
def main():
    """Scan for image hashes, find duplicates and report them in a vaguely civilised way."""

    # Hashing and comparison method

    method = PHash()

    # Handle command line arguments

    parser = argparse.ArgumentParser(description='Search for similar images')
    parser.add_argument('--no-scan', action='store_true', help='Use pickled scan data without updating it')
    parser.add_argument('--no-compare', action='store_true', help='Use pickled comparison data without updating it')
    parser.add_argument('--show', action='store_true', help='Show duplicate images')
    parser.add_argument('directories', nargs='*', action='store', help='Directories to search')

    args = parser.parse_args()

    # (Removed a stray breakpoint() left over from debugging - it dropped every
    # run into the interactive debugger.)

    if not args.no_scan and not args.directories:
        print('You must specify at least one directory in order to perform a scan')
        sys.exit(1)

    # We pickle the current set of files, hashes and comparisons.
    # NOTE: only ever unpickle our own cache file - unpickling untrusted data
    # can execute arbitrary code.

    try:
        print('Loading cached data')

        with open(PICKLE_FILE, 'rb') as pickles:
            data = pickle.load(pickles)

        if data['version'] != PICKLE_VERSION:
            print(f'WARNING: Current version is {PICKLE_VERSION} but saved data is from version {data["version"]}. Interesting things could happen....')

        hashes = data['hashes']
        updated = data['updated']
        duplicates = data.get('duplicates', None)

    except (FileNotFoundError, EOFError):
        # No (or truncated) cache - start from scratch, unless the user asked
        # for cached data only

        if args.no_scan:
            print('ERROR: Cannot use no-scan option as no cached scan data is available')
            sys.exit(1)

        hashes = {}
        updated = {}
        duplicates = {}

    if args.no_compare and not duplicates:
        print('ERROR: Cannot use no-compare option as no cached comparison data is available')
        sys.exit(1)

    if not args.no_scan:
        # Scan for new files and calculate hashes. hash_directories() takes the
        # directory list first - the original call passed (method, directories)
        # in the wrong order.

        hash_directories(args.directories, method, hashes, updated)

    if not args.no_compare:
        # Look for duplicates

        duplicates = method.find_duplicates(encoding_map=hashes)

    # Pickle the updated results

    with open(PICKLE_FILE, 'wb') as pickles:
        dupe_data = {'hashes': hashes, 'updated': updated, 'version': PICKLE_VERSION, 'duplicates': duplicates}
        pickle.dump(dupe_data, pickles)

    # Report them

    for entry in duplicates:
        if duplicates[entry]:
            print(f'{entry}: {duplicates[entry]}')
            if args.show:
                # plot_images() is the plotting helper defined above; the
                # original called the non-existent plot_duplicates() here,
                # which raised NameError as soon as --show was used.
                plot_images(entry, duplicates[entry])
263
+
264
+ ################################################################################
265
+
266
def photodupe():
    """Console-script entry point: run main(), mapping interrupts and broken
    pipes onto conventional non-zero exit codes."""

    try:
        main()
    except (KeyboardInterrupt, BrokenPipeError) as exc:
        # Ctrl-C exits with 1, a closed output pipe (e.g. piping into head) with 2
        sys.exit(1 if isinstance(exc, KeyboardInterrupt) else 2)
277
+
278
+ ################################################################################
279
+
280
# Allow the script to be run directly as well as via the installed entry point
if __name__ == '__main__':
    photodupe()
@@ -5,6 +5,7 @@ import sys
5
5
  import os
6
6
  import sys
7
7
  import pickle
8
+ import argparse
8
9
 
9
10
  import PIL
10
11
 
@@ -15,28 +16,42 @@ import imagehash
15
16
 
16
17
  ################################################################################
17
18
 
18
- def read_image_hashes():
19
+ def read_image_hashes(directories):
19
20
  """Read all the specfied directories and hash every picture therein"""
20
21
 
21
22
  hashes = defaultdict(list)
22
23
 
23
- for directory in sys.argv[1:]:
24
+ # Walk each directory tree
25
+
26
+ for directory in directories:
27
+ print(f'Scanning directory tree {directory}')
28
+
24
29
  for root, _, files in os.walk(directory):
30
+ print(f'Scanning directory {root}')
31
+
25
32
  for file in files:
26
33
  filepath = os.path.join(root, file)
27
-
28
- try:
29
- with Image.open(filepath) as image:
30
- hash_value = imagehash.average_hash(image, hash_size=12)
31
-
32
- size = os.stat(filepath).st_size
33
- hashes[hash_value].append({'path': filepath, 'width': image.width, 'height': image.height, 'size': size})
34
-
35
- except PIL.UnidentifiedImageError:
36
- sys.stderr.write(f'ERROR: Unrecognized format {filepath}\n')
37
-
38
- except OSError:
39
- sys.stderr.write(f'ERROR: Unable to read {filepath}\n')
34
+
35
+ fileext = os.path.splitext(file)[1]
36
+
37
+ if fileext.lower() not in ('.jbf', '.ini', '.xml', '.ffs_db'):
38
+ # Calculate the hash and store path, dimensions and file size under the hash entry in the hashes table
39
+
40
+ try:
41
+ with Image.open(filepath) as image:
42
+ hash_value = imagehash.average_hash(image, hash_size=12)
43
+
44
+ size = os.stat(filepath).st_size
45
+ hashes[hash_value].append({'path': filepath, 'width': image.width, 'height': image.height, 'size': size})
46
+
47
+ except PIL.UnidentifiedImageError:
48
+ sys.stderr.write(f'ERROR: Unrecognized format {filepath} (size={size})\n')
49
+
50
+ except OSError:
51
+ sys.stderr.write(f'ERROR: Unable to read {filepath} (size={size})\n')
52
+
53
+ # Return the hash table
54
+
40
55
  return hashes
41
56
 
42
57
  ################################################################################
@@ -44,6 +59,15 @@ def read_image_hashes():
44
59
  def main():
45
60
  """Read the hashes and report duplicates in a vaguely civilised way"""
46
61
 
62
+ parser = argparse.ArgumentParser(description='Search for similar images')
63
+ parser.add_argument('directories', nargs='*', action='store', help='Directories to search')
64
+
65
+ args = parser.parse_args()
66
+
67
+ if not args.directories:
68
+ print('You must be specify at least one directory')
69
+ sys.exit(1)
70
+
47
71
  try:
48
72
  print('Loading cached data')
49
73
 
@@ -52,8 +76,10 @@ def main():
52
76
  except (FileNotFoundError, EOFError):
53
77
  print('Scanning directories')
54
78
 
55
- hashes = read_image_hashes()
79
+ hashes = read_image_hashes(args.directories)
56
80
 
81
+ # Sort the list of hashes so that we can easily find close matches
82
+
57
83
  print('Sorting hashes')
58
84
 
59
85
  hash_values = sorted([str(hashval) for hashval in hashes])
@@ -217,7 +217,7 @@ def show_cpu_load(scr, first, w, h, x, y):
217
217
  yo += 1
218
218
 
219
219
  if yo > h-2:
220
- xo += w/3
220
+ xo += w//3
221
221
  yo = 0
222
222
 
223
223
  x += w//2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skilleter_thingy
3
- Version: 0.0.23
3
+ Version: 0.0.25
4
4
  Summary: A collection of useful utilities, mainly aimed at making Git more friendly
5
5
  Author-email: John Skilleter <john@skilleter.org.uk>
6
6
  Project-URL: Home, https://skilleter.org.uk
@@ -10,7 +10,6 @@ Classifier: Operating System :: OS Independent
10
10
  Requires-Python: >=3.6
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
- Requires-Dist: boto3
14
13
  Requires-Dist: imagehash
15
14
  Requires-Dist: inotify
16
15
  Requires-Dist: pillow
@@ -19,6 +18,7 @@ Requires-Dist: pyaml
19
18
  Requires-Dist: pygit2
20
19
  Requires-Dist: python-dateutil
21
20
  Requires-Dist: requests
21
+ Requires-Dist: numpy
22
22
 
23
23
  # Thingy
24
24
 
@@ -1,6 +1,5 @@
1
1
  skilleter_thingy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  skilleter_thingy/addpath.py,sha256=4Yhhgjjz1XDI98j0dAiQpNA2ejLefeWUTeSg3nIXQq0,3842
3
- skilleter_thingy/aws.py,sha256=k08MT1866KuMjgG7ECr7LCuqcgWx78iPDcS9CmemIoA,18614
4
3
  skilleter_thingy/borger.py,sha256=AQX7OHeGXcUjkgyXEWE2h_oOey9eczZBbKjrreXvRAs,7832
5
4
  skilleter_thingy/colour.py,sha256=D-RTYsND6Xm6m3xl0mOe9QSrTNYsyY0K_a8x3id2gvg,7031
6
5
  skilleter_thingy/console_colours.py,sha256=dT5qc_B62VJaWs92yrFt1izoB7hs3a22t3lfrZFybG4,1786
@@ -33,11 +32,12 @@ skilleter_thingy/gitlab.py,sha256=TKKkrQ3CfbWViEf_fnroQJSW8Nh-YwUeX6zo7GmzndI,60
33
32
  skilleter_thingy/gitprompt.py,sha256=m3WbeEd0WgNIyKbtx-wp-kHnOG83cWjq_KA8W6RmD4g,8925
34
33
  skilleter_thingy/gl.py,sha256=fb9OkxfPSuqyfnLYauN0F1jZYIJNLXhNwlU8PRjJ4j4,5964
35
34
  skilleter_thingy/gphotosync.py,sha256=-LbNXxi34sMSufmZ-NoStIe-6cKFKK2zvcDH7quI8W8,22040
35
+ skilleter_thingy/imagedupe.py,sha256=60iSz4unS3REAWYHcWa4fnX0-kOrKVYcGuV2xBksj_E,9120
36
36
  skilleter_thingy/linecount.py,sha256=lw3vuXUUnMwrUihY6bHfZJsRKe6ZMCRz3952Z9N-ogI,4316
37
37
  skilleter_thingy/logger.py,sha256=xKgPAq8KGXmtaXIFjFs1AmZJXtYrXJn2sqL3oxHZjfQ,3107
38
38
  skilleter_thingy/moviemover.py,sha256=j_Xb9_jFdgpFBAXcF4tEqbnKH_FonlnUU39LiCK980k,4470
39
39
  skilleter_thingy/path.py,sha256=3ba_e-QwYpAs-jFVWoV8sfjVjs_59uc5JZt-87Hqn6g,4737
40
- skilleter_thingy/photodupe.py,sha256=EiWTDLI4tFZp8ruwkbCayMTiwRfMI7O_kmbWgXwZGVQ,3178
40
+ skilleter_thingy/photodupe.py,sha256=FssLgbLnqHPuSvMGtRyOa7bRdowufJOQaJJ56f9ybxk,4195
41
41
  skilleter_thingy/phototidier.py,sha256=1uj1XbUemJOalNC3PwTG2-_yUQp4WMAcmwRr7QXPn1w,7823
42
42
  skilleter_thingy/popup.py,sha256=jW-nbpdeswqEMTli7OmBv1J8XQsvFoMI0J33O6dOeu8,2529
43
43
  skilleter_thingy/process.py,sha256=RmXj2RrzwqP_mugeVejtjgF_T3PD7M10XsWuAnc99t0,3565
@@ -47,10 +47,9 @@ skilleter_thingy/remdir.py,sha256=zp5Nr0IMGXQ-b5iT48O5arqWoSjW65Xnr-SpKuav1Ac,46
47
47
  skilleter_thingy/rmdupe.py,sha256=tcX3w8XvliGwBMdSt9BUu07kuDtQEc0IiU8sCxmgzHA,17117
48
48
  skilleter_thingy/rpylint.py,sha256=na39x0yNXDwDkG9yP48BoM5FeTut-OS4AVsYixE0YZU,2639
49
49
  skilleter_thingy/run.py,sha256=EGYJSuMcOmUca6dpfVUFE41vG9C6ZNK8hzZlJCJE6Rs,12619
50
- skilleter_thingy/s3_sync.py,sha256=TITptjua_B-iwPlgTniuoxPvuEnQjyTKfs6l9CKHbXc,13849
51
50
  skilleter_thingy/splitpics.py,sha256=qRlJrqet7TEI6SodS4bkuKXQUpOdMaqmjE4c1CR7ouo,3266
52
51
  skilleter_thingy/strreplace.py,sha256=xsIWw0hc452rYEBtNEQFKIzmV03xjm_Taz-eDTmFFKI,2539
53
- skilleter_thingy/sysmon.py,sha256=zSnR9oqGr1TwOhwn7Mcofq2fcnoVjlg5gRH56l2w1N0,11347
52
+ skilleter_thingy/sysmon.py,sha256=XRZG6EVSzoVYan_N16qVB1l1RaU51uvLWlRA0CDjC54,11348
54
53
  skilleter_thingy/tfm.py,sha256=wG4oNhn1pBcLwPPzZc19x_HyPYsM1pl8E0skvj03IH8,33712
55
54
  skilleter_thingy/tfm_pane.py,sha256=BmyRDKZyr0mS89MbudbOUA4uBh4E6X3tGct6JX87vI8,19829
56
55
  skilleter_thingy/tfparse.py,sha256=y4MSipVPO-P12QXMSXAT92qy9YMqNDl_1Thum9j7S2g,2993
@@ -59,9 +58,9 @@ skilleter_thingy/trimpath.py,sha256=SAfOB75_dTldQHjam4kQy1J42209NYPYi8vVAaNn1e8,
59
58
  skilleter_thingy/window_rename.py,sha256=dCBgZqih_3YKHt35hsOAhARFp3QxOi8w8huC63sqJK8,3128
60
59
  skilleter_thingy/xchmod.py,sha256=F9_lxKuLqVlHHr3oBI3dkMoFOuwRzYDlpQMTmDcjpBI,4590
61
60
  skilleter_thingy/yamlcheck.py,sha256=FXylZ5NtHirDlPVhVEUZUZkTugVR-g51BbjaN06akAc,2868
62
- skilleter_thingy-0.0.23.dist-info/LICENSE,sha256=ljOS4DjXvqEo5VzGfdaRwgRZPbNScGBmfwyC8PChvmQ,32422
63
- skilleter_thingy-0.0.23.dist-info/METADATA,sha256=amrMSKYsBCKb8c3vm4e0e-pDl27Gamk9K6rn7oHy2oU,5231
64
- skilleter_thingy-0.0.23.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
65
- skilleter_thingy-0.0.23.dist-info/entry_points.txt,sha256=jf7hVfH0dojLxk0D4TjIazKoqdToditCGdCfyhIotME,1971
66
- skilleter_thingy-0.0.23.dist-info/top_level.txt,sha256=8-JhgToBBiWURunmvfpSxEvNkDHQQ7r25-aBXtZv61g,17
67
- skilleter_thingy-0.0.23.dist-info/RECORD,,
61
+ skilleter_thingy-0.0.25.dist-info/LICENSE,sha256=ljOS4DjXvqEo5VzGfdaRwgRZPbNScGBmfwyC8PChvmQ,32422
62
+ skilleter_thingy-0.0.25.dist-info/METADATA,sha256=2n3rE7kBsRxBjonvItXcdpwaJf6stvUGB4g2hCmAWCM,5231
63
+ skilleter_thingy-0.0.25.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
64
+ skilleter_thingy-0.0.25.dist-info/entry_points.txt,sha256=jf7hVfH0dojLxk0D4TjIazKoqdToditCGdCfyhIotME,1971
65
+ skilleter_thingy-0.0.25.dist-info/top_level.txt,sha256=8-JhgToBBiWURunmvfpSxEvNkDHQQ7r25-aBXtZv61g,17
66
+ skilleter_thingy-0.0.25.dist-info/RECORD,,