megadetector 5.0.15-py3-none-any.whl → 5.0.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)
  1. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +387 -0
  2. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +28 -16
  3. megadetector/data_management/lila/generate_lila_per_image_labels.py +3 -3
  4. megadetector/data_management/lila/test_lila_metadata_urls.py +2 -2
  5. megadetector/data_management/remove_exif.py +61 -36
  6. megadetector/data_management/yolo_to_coco.py +25 -6
  7. megadetector/detection/process_video.py +270 -127
  8. megadetector/detection/pytorch_detector.py +13 -11
  9. megadetector/detection/run_detector.py +9 -2
  10. megadetector/detection/run_detector_batch.py +8 -1
  11. megadetector/detection/run_inference_with_yolov5_val.py +58 -10
  12. megadetector/detection/tf_detector.py +8 -2
  13. megadetector/detection/video_utils.py +214 -18
  14. megadetector/postprocessing/md_to_coco.py +31 -9
  15. megadetector/postprocessing/postprocess_batch_results.py +23 -7
  16. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +5 -2
  17. megadetector/postprocessing/subset_json_detector_output.py +22 -12
  18. megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -3
  19. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +2 -1
  20. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
  21. megadetector/taxonomy_mapping/simple_image_download.py +5 -0
  22. megadetector/taxonomy_mapping/species_lookup.py +1 -1
  23. megadetector/utils/ct_utils.py +48 -0
  24. megadetector/utils/md_tests.py +231 -56
  25. megadetector/utils/path_utils.py +2 -2
  26. megadetector/utils/torch_test.py +32 -0
  27. megadetector/utils/url_utils.py +101 -4
  28. megadetector/visualization/visualization_utils.py +21 -6
  29. megadetector/visualization/visualize_db.py +16 -0
  30. {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/LICENSE +0 -0
  31. {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/METADATA +5 -7
  32. {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/RECORD +34 -32
  33. {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/WHEEL +1 -1
  34. {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/top_level.txt +0 -0
megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py
@@ -0,0 +1,387 @@
+ """
+
+ import_desert_lion_conservation_camera_traps.py
+
+ Prepare the Desert Lion Conservation Camera Traps dataset for release on LILA.
+
+ """
+
+ #%% Imports and constants
+
+ import os
+ import json
+
+ input_base_folder = r'i:/data/desert-lion'
+ assert os.path.isdir(input_base_folder)
+
+ # md_results_file = r'i:/data/desert-lion/desert-lion-camera-traps-2024-07-14-v5a.0.0_detections-all.json'
+ md_results_file = r'i:/data/desert-lion/desert-lion-camera-traps-2024-07-14-v5a.0.0_detections.json'
+ assert os.path.isfile(md_results_file)
+
+ export_base = os.path.join(input_base_folder,'annotated-imgs')
+ assert os.path.isdir(export_base)
+
+ preview_dir = r'g:\temp\desert-lion-viz'
+ output_file = os.path.join(input_base_folder,'desert_lion_camera_traps.json')
+ output_zipfile = os.path.join(input_base_folder,'desert-lion-camera-traps-images.zip')
+
+ exif_cache_file_post_exif_removal = os.path.join(input_base_folder,'exif_data_post_exif_removal.json')
+ exif_cache_file = os.path.join(input_base_folder,'exif_data.json')
+
+
+ #%% Find images and videos
+
+ from megadetector.detection.video_utils import find_videos
+ from megadetector.utils.path_utils import find_images
+
+ video_files = find_videos(input_base_folder,recursive=True,return_relative_paths=True,convert_slashes=True)
+ image_files = find_images(input_base_folder,recursive=True,return_relative_paths=True,convert_slashes=True)
+
+ n_annotated_imgs = len([fn for fn in image_files if 'annotated-imgs' in fn])
+ print('Found {} images ({} in the annotated-imgs folder), {} videos'.format(
+     len(image_files),n_annotated_imgs,len(video_files)))
+
+
+ #%% Read EXIF data
+
+ from megadetector.data_management.read_exif import read_exif_from_folder, ReadExifOptions
+
+ exif_options = ReadExifOptions()
+ exif_options.n_workers = 10
+
+ if os.path.isfile(exif_cache_file):
+     print('EXIF cache {} exists, skipping EXIF read'.format(exif_cache_file))
+     with open(exif_cache_file,'r') as f:
+         exif_data = json.load(f)
+ else:
+     exif_data = read_exif_from_folder(input_folder=input_base_folder,
+                                       output_file=exif_cache_file,
+                                       options=exif_options,
+                                       filenames=None,
+                                       recursive=True)
+
+ assert len(exif_data) == len(image_files)
+
+
+ #%% Remove EXIF data
+
+ from megadetector.data_management.remove_exif import remove_exif
+ remove_exif(input_base_folder,recursive=True,n_processes=1)
+
+
+ #%% Read EXIF data again
+
+ exif_data_post_exif_removal = read_exif_from_folder(input_folder=input_base_folder,
+                                                     output_file=exif_cache_file_post_exif_removal,
+                                                     options=exif_options,
+                                                     filenames=None,
+                                                     recursive=True)
+
+
+ #%% Make sure no lat/lon data is present
+
+ from tqdm import tqdm
+
+ for i_image,im in enumerate(tqdm(exif_data_post_exif_removal)):
+     tags = im['exif_tags']
+     if tags is None:
+         continue
+     for k in tags:
+         assert 'gps' not in str(k).lower()
+
+
+ #%% Look for images that contain humans
+
+ with open(md_results_file,'r') as f:
+     md_results = json.load(f)
+
+ assert len(md_results['images']) == len(image_files)
+
+ human_threshold = 0.1
+ human_categories = ['2','3']
+
+ candidate_human_images = set()
+ failed_images = set()
+
+ # i_image = 0; im = md_results['images'][0]
+ for i_image,im in tqdm(enumerate(md_results['images']),total=len(md_results['images'])):
+
+     if 'failure' in im:
+         failed_images.add(im['file'])
+         continue
+
+     for det in im['detections']:
+         if det['category'] in human_categories and det['conf'] >= human_threshold:
+             candidate_human_images.add(im['file'])
+             break
+
+     # ...for each detection
+
+ # ...for each image
+
+ print('Found {} failed images and {} candidate human images'.format(
+     len(failed_images),len(candidate_human_images)))
+
+
+ #%% Copy failed images and human images to a temporary folder for review
+
+ review_folder_base = r'g:/temp/review_images'
+ os.makedirs(review_folder_base,exist_ok=True)
+
+ images_to_review = failed_images.union(candidate_human_images)
+ images_to_review = sorted(list(images_to_review))
+
+ source_file_to_target_file = {}
+
+ # fn_relative = images_to_review[0]
+ for fn_relative in images_to_review:
+     assert '\\' not in fn_relative
+     fn_abs_source = input_base_folder + '/' + fn_relative
+     assert os.path.isfile(fn_abs_source)
+     fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+     source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+ from megadetector.utils.path_utils import parallel_copy_files
+
+ parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                     max_workers=16,
+                     use_threads=True,
+                     overwrite=False,verbose=False)
+
+
+ #%% Copy videos to a temporary folder for review
+
+ review_folder_base = r'g:/temp/review_videos'
+ os.makedirs(review_folder_base,exist_ok=True)
+
+ source_file_to_target_file = {}
+
+ # fn_relative = video_files[0]
+ for fn_relative in video_files:
+     assert '\\' not in fn_relative
+     fn_abs_source = input_base_folder + '/' + fn_relative
+     assert os.path.isfile(fn_abs_source)
+     fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+     source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+ from megadetector.utils.path_utils import parallel_copy_files
+
+ parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                     max_workers=16,
+                     use_threads=True,
+                     overwrite=False,verbose=False)
+
+
+ #%% Track removed images
+
+ removed_images = [
+     r"annotated-imgs\panthera leo\Camera Trap\Events\X73Okngwe\2013\02\PvL_seq_41468415-4518-44d6-acac-2113b442f723\PICT0190.JPG",
+     r"annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0039.JPG",
+     r"annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0037.JPG",
+     r"annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0038.JPG",
+     r"annotated-imgs\panthera leo\Camera Trap\2015\09\PvL_seq_da9c9ab1-74a2-485e-b6e7-3827b0c2a2f0\20150924-RCX_0835.JPG",
+     r"annotated-imgs\panthera leo\Camera Trap\2015\09\PvL_seq_b0c1c6c5-474e-4844-a66c-e2bf5513d47a\20150924-RCX_0841.JPG",
+     r"annotated-imgs\oryx gazella\Camera Trap\Video_Clips\Leylands\CDY_0003.AVI"
+ ]
+
+ removed_images = [fn.replace('\\','/') for fn in removed_images]
+
+
+ #%% Map filenames to datetimes
+
+ filename_to_datetime = {}
+ n_valid_datetimes = 0
+
+ # im = exif_data[0]
+ for im in tqdm(exif_data):
+     if im['exif_tags'] is None or len(im['exif_tags']) == 0:
+         filename_to_datetime[im['file_name']] = None
+         continue
+     dt = im['exif_tags']['DateTime']
+     assert len(dt) == 19
+     filename_to_datetime[im['file_name']] = dt
+     n_valid_datetimes += 1
+
+ print('\nFound datetime information for {} of {} images'.format(
+     n_valid_datetimes,len(exif_data)))
+
+
+ #%% Convert "annotated-imgs" folder to COCO Camera Traps
+
+ from megadetector.utils.path_utils import recursive_file_list
+
+ species_name_to_category_id = {}
+
+ filenames_relative = \
+     recursive_file_list(export_base,return_relative_paths=True,recursive=True,convert_slashes=True)
+
+ short_species_names = ['aves','cn-owls','cn-francolins','cn-raptors',
+                        'columbidae','equus zebra hartmannae','numididae',
+                        'pteroclidae']
+
+ images = []
+ annotations = []
+ n_datetimes = 0
+
+ for fn in filenames_relative:
+
+     assert fn.lower().endswith('.jpg') or fn.lower().endswith('.avi') or fn.lower().endswith('.json')
+
+     if fn.lower().endswith('.json'):
+         continue
+
+     tokens = fn.split('/')
+     species_name = tokens[0]
+     assert species_name in short_species_names or len(species_name.split(' ')) == 2
+
+     if species_name not in species_name_to_category_id:
+         category_id = len(species_name_to_category_id)
+         species_name_to_category_id[species_name] = category_id
+     else:
+         category_id = species_name_to_category_id[species_name]
+
+     im = {}
+     im['id'] = fn
+     im['file_name'] = fn
+     im['location'] = 'unknown'
+
+     fn_for_datetime_lookup = 'annotated-imgs/' + fn
+     if (fn_for_datetime_lookup in filename_to_datetime) and \
+        (filename_to_datetime[fn_for_datetime_lookup] is not None):
+         im['datetime'] = filename_to_datetime[fn_for_datetime_lookup]
+         n_datetimes += 1
+
+     ann = {}
+     ann['image_id'] = im['id']
+     ann['id'] = im['id'] + ':ann_00'
+     ann['sequence_level_annotation'] = False
+     ann['category_id'] = category_id
+
+     images.append(im)
+     annotations.append(ann)
+
+ # ...for each filename
+
+ categories = []
+ for species_name in species_name_to_category_id:
+     category = {}
+     category['name'] = species_name
+     category['id'] = species_name_to_category_id[species_name]
+     categories.append(category)
+
+ info = {}
+ info['version'] = '2024.07.15_00'
+ info['description'] = 'Desert Lion Camera Traps'
+
+ d = {}
+ d['info'] = info
+ d['images'] = images
+ d['annotations'] = annotations
+ d['categories'] = categories
+
+ with open(output_file,'w') as f:
+     json.dump(d,f,indent=1)
+
+
+ #%% Integrity check
+
+ from megadetector.data_management.databases.integrity_check_json_db import \
+     IntegrityCheckOptions, integrity_check_json_db
+
+ integrity_check_options = IntegrityCheckOptions()
+
+ integrity_check_options.baseDir = export_base
+ integrity_check_options.bCheckImageExistence = True
+ integrity_check_options.bRequireLocation = True
+ integrity_check_options.nThreads = 10
+ integrity_check_options.verbose = True
+ integrity_check_options.allowIntIDs = False
+
+ integrity_check_results = integrity_check_json_db(output_file,integrity_check_options)
+
+
+ #%% Preview
+
+ from megadetector.visualization.visualize_db \
+     import DbVizOptions, visualize_db
+
+ viz_options = DbVizOptions()
+ viz_options.num_to_visualize = 2500
+
+ html_output_file,_ = visualize_db(output_file, preview_dir, export_base, options=viz_options)
+
+ from megadetector.utils.path_utils import open_file
+ open_file(html_output_file)
+
+
+ #%% Make MD results paths line up with the output
+
+ md_results_remapped_file = md_results_file.replace('-all','')
+ assert md_results_remapped_file != md_results_file
+
+ with open(output_file,'r') as f:
+     d = json.load(f)
+
+ image_filenames = [im['file_name'] for im in d['images']]
+ image_filenames_set = set(image_filenames)
+
+ with open(md_results_file,'r') as f:
+     md_results = json.load(f)
+
+ md_results_images_remapped = []
+
+ # im = md_results['images'][0]
+ for im in md_results['images']:
+     assert im['file'].startswith('annotated-imgs/') or im['file'].startswith('bboxes/')
+     if im['file'].startswith('bboxes/'):
+         continue
+     im['file'] = im['file'].replace('annotated-imgs/','')
+     md_results_images_remapped.append(im)
+
+ print('Keeping {} of {} images in MD results'.format(
+     len(md_results_images_remapped),len(md_results['images'])))
+
+ d['images'] = md_results_images_remapped
+
+ with open(md_results_remapped_file,'w') as f:
+     json.dump(d,f,indent=1)
+
+
+ #%% Zip MD results and COCO file
+
+ from megadetector.utils.path_utils import zip_file
+
+ zip_file(input_fn=md_results_remapped_file, output_fn=None, overwrite=True, verbose=True, compresslevel=9)
+ zip_file(input_fn=output_file, output_fn=None, overwrite=True, verbose=True, compresslevel=9)
+
+
+ #%% Zip images
+
+ from megadetector.utils.path_utils import zip_folder
+
+ zip_folder(input_folder=export_base, output_fn=output_zipfile, overwrite=True, verbose=True, compresslevel=0)
+
+
+ #%% Copy lion images to a folder for thumbnail selection
+
+ review_folder_base = r'g:/temp/thumbnail-candidates'
+ os.makedirs(review_folder_base,exist_ok=True)
+
+ source_file_to_target_file = {}
+
+ # fn_relative = image_files[0]
+ for fn_relative in image_files:
+     assert '\\' not in fn_relative
+     if '/lion/' not in fn_relative and '/panthera leo/' not in fn_relative:
+         continue
+     fn_abs_source = input_base_folder + '/' + fn_relative
+     assert os.path.isfile(fn_abs_source)
+     fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+     source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+ from megadetector.utils.path_utils import parallel_copy_files
+
+ parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                     max_workers=16,
+                     use_threads=True,
+                     overwrite=False,verbose=False)
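
For reference, the COCO Camera Traps file written by this importer has the shape sketched below. This is a minimal illustrative example only; the filename, datetime, and category values are made up, not taken from the real dataset.

# Minimal sketch of the COCO Camera Traps structure the importer writes;
# all values below are illustrative.
import json

example = {
    'info': {'version': '2024.07.15_00', 'description': 'Desert Lion Camera Traps'},
    'images': [{
        'id': 'panthera leo/Camera Trap/example/PICT0001.JPG',
        'file_name': 'panthera leo/Camera Trap/example/PICT0001.JPG',
        'location': 'unknown',
        'datetime': '2015:09:24 06:31:00'
    }],
    'annotations': [{
        'id': 'panthera leo/Camera Trap/example/PICT0001.JPG:ann_00',
        'image_id': 'panthera leo/Camera Trap/example/PICT0001.JPG',
        'category_id': 0,
        'sequence_level_annotation': False
    }],
    'categories': [{'id': 0, 'name': 'panthera leo'}]
}

print(json.dumps(example, indent=1))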

megadetector/data_management/importers/snapshot_safari_importer_reprise.py
@@ -24,7 +24,7 @@ from collections import defaultdict
 
  from megadetector.utils import path_utils
 
- input_base = '/media/user/Elements'
+ input_base = 'e:/'
  output_base = os.path.expanduser('~/data/snapshot-safari-metadata')
  file_list_cache_file = os.path.join(output_base,'file_list.json')
 
@@ -76,23 +76,16 @@ print('Found a total of {} files, {} of which are images'.format(
      len(all_files_relative),len(all_image_files)))
 
 
- #%% Copy all csv files to the annotation cache folder
+ #%% Copy all .csv files to the annotation cache folder
 
  # fn = csv_files[0]
- for fn in csv_files:
+ for fn in tqdm(csv_files):
+
      target_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
      source_file = os.path.join(input_base,fn)
      shutil.copyfile(source_file,target_file)
 
- def read_cached_csv_file(fn):
-     """
-     Later cells will ask to read a .csv file from the original hard drive;
-     read from the annotation cache instead.
-     """
-
-     cached_csv_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
-     df = pd.read_csv(cached_csv_file)
-     return df
+ print('Copied {} .csv files to cache folder'.format(len(csv_files)))
 
 
  #%% List project folders
@@ -123,6 +116,21 @@ project_folder_to_project_code = {v: k for k, v in project_code_to_project_folde
  project_codes = sorted(list(project_code_to_project_folder.keys()))
  project_folders = sorted(list(project_code_to_project_folder.values()))
 
+ print('Enumerated {} project folders'.format(len(project_folders)))
+
+
+ #%% Support functions
+
+ def read_cached_csv_file(fn):
+     """
+     Later cells will ask to read a .csv file from the original hard drive;
+     read from the annotation cache instead.
+     """
+
+     cached_csv_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
+     df = pd.read_csv(cached_csv_file)
+     return df
+
  def file_to_project_folder(fn):
      """
      For a given filename relative to the drive root, return the corresponding
@@ -138,7 +146,6 @@ def file_to_project_folder(fn):
      assert project_folder in project_folders
      return project_folder
 
-
  def file_to_project_code(fn):
      """
      For a given filename relative to the drive root, return the corresponding
@@ -147,6 +154,9 @@ def file_to_project_code(fn):
 
      return project_folder_to_project_code[file_to_project_folder(fn)]
 
+
+ #%% Consistency checking
+
  assert file_to_project_folder(
      'APN/APN_S2/DW/DW_R5/APN_S2_DW_R5_IMAG0003.JPG') == 'APN'
  assert file_to_project_folder(
@@ -163,9 +173,11 @@ assert file_to_project_code(
  #
  # E.g.:
  #
- # 'DHP': ['Snapshot South Africa/DHP/LILA_Reports/DHP_S1_report_lila.csv',
- #         'Snapshot South Africa/DHP/LILA_Reports/DHP_S2_report_lila.csv',
- #         'Snapshot South Africa/DHP/LILA_Reports/DHP_S3_report_lila.csv']
+ # 'DHP': [
+ #     'Snapshot South Africa/DHP/LILA_Reports/DHP_S1_report_lila.csv',
+ #     'Snapshot South Africa/DHP/LILA_Reports/DHP_S2_report_lila.csv',
+ #     'Snapshot South Africa/DHP/LILA_Reports/DHP_S3_report_lila.csv'
+ # ]
  #
  project_code_to_report_files = defaultdict(list)
 
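
The relocated read_cached_csv_file helper only uses the basename of whatever path it is handed, so later cells can keep passing drive-relative report paths unchanged; a hypothetical usage sketch:

# Hypothetical usage of read_cached_csv_file (defined in the "Support
# functions" cell above): only the basename is used for the lookup, so this
# reads <annotation_cache_dir>/DHP_S1_report_lila.csv rather than touching
# the original drive.
df = read_cached_csv_file('Snapshot South Africa/DHP/LILA_Reports/DHP_S1_report_lila.csv')
print('Read {} rows'.format(len(df)))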

megadetector/data_management/lila/generate_lila_per_image_labels.py
@@ -354,7 +354,7 @@ print('Processed {} datasets'.format(len(metadata_table)))
  #%% Read the .csv back
 
  df = pd.read_csv(output_file)
- print('Read {} lines from {}'.format(len(df),output_file))
+ print('Read {} rows from {}'.format(len(df),output_file))
 
 
  #%% Do some post-hoc integrity checking
@@ -403,9 +403,9 @@ else:
          check_row(row)
 
 
- #%% Check for datasets that have only one location string
+ #%% Check for datasets that have only one location string (typically "unknown")
 
- # Expected: ENA24, Missouri Camera Traps
+ # Expected: ENA24, Missouri Camera Traps, Desert Lion Conservation Camera Traps
 
  for ds_name in dataset_name_to_locations.keys():
      if len(dataset_name_to_locations[ds_name]) == 1:
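
The single-location check above amounts to the sketch below; the dataset names and location sets here are illustrative only, assuming dataset_name_to_locations maps dataset names to sets of location strings as in the surrounding code.

# Illustrative sketch of the single-location check; dataset names and
# location strings below are made up.
dataset_name_to_locations = {
    'ENA24': {'unknown'},
    'Desert Lion Conservation Camera Traps': {'unknown'},
    'Some Multi-Site Dataset': {'site-01','site-02'}
}

for ds_name in dataset_name_to_locations.keys():
    if len(dataset_name_to_locations[ds_name]) == 1:
        print('Dataset {} has only one location string'.format(ds_name))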

megadetector/data_management/lila/test_lila_metadata_urls.py
@@ -72,8 +72,8 @@ for ds_name in metadata_table.keys():
  url_to_source = {}
 
  # The first image in a dataset is disproportionately likely to be human (and thus 404),
- # so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
- image_index = 1000
+ # so we pick a semi-arbitrary image that isn't the first. How about the 2000th?
+ image_index = 2000
 
  # ds_name = list(metadata_table.keys())[0]
  for ds_name in metadata_table.keys():

megadetector/data_management/remove_exif.py
@@ -3,64 +3,89 @@
  remove_exif.py
 
  Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
- backup copies, using pyexiv2.
+ backup copies, using pyexiv2. Ignores non-jpeg images.
 
- TODO: This is a one-off script waiting to be cleaned up for more general use.
+ This module is rarely used, and pyexiv2 is not thread-safe, so pyexiv2 is not
+ included in package-level dependency lists. YMMV.
 
  """
 
- input_base = r'f:\images'
-
-
  #%% Imports and constants
 
  import os
  import glob
 
- def main():
-
-     assert os.path.isdir(input_base)
+ from multiprocessing.pool import Pool as Pool
+ from tqdm import tqdm
 
-     ##%% List files
 
-     all_files = [f for f in glob.glob(input_base + "*/**", recursive=True)]
-     image_files = [s for s in all_files if (s.lower().endswith('.jpg'))]
-
+ #%% Support functions
 
-     ##%% Remove EXIF data (support)
+ # pyexiv2 is not thread-safe; do not call this function in parallel within a process.
+ #
+ # Parallelizing across processes is fine.
+ def remove_exif_from_image(fn):
 
      import pyexiv2
+
+     try:
+         img = pyexiv2.Image(fn)
+         img.clear_exif()
+         img.clear_iptc()
+         img.clear_xmp()
+         img.close()
+     except Exception as e:
+         print('EXIF error on {}: {}'.format(fn,str(e)))
+
+     return True
+
+
+ #%% Remove EXIF data
+
+ def remove_exif(image_base_folder,recursive=True,n_processes=1):
+     """
+     Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
+     backup copies, using pyexiv2. Ignores non-jpeg images.
+
+     Args:
+         image_base_folder (str): the folder from which we should remove EXIF data
+         recursive (bool, optional): whether to process [image_base_folder] recursively
+         n_processes (int, optional): number of concurrent workers. Because pyexiv2 is not
+             thread-safe, only process-based parallelism is supported.
+     """
+     try:
+         import pyexiv2 #noqa
+     except:
+         print('pyexiv2 not available; try "pip install pyexiv2"')
+         raise
 
-     # PYEXIV2 IS NOT THREAD SAFE; DO NOT CALL THIS IN PARALLEL FROM A SINGLE PROCESS
-     def remove_exif(fn):
 
-         try:
-             img = pyexiv2.Image(fn)
-             # data = img.read_exif(); print(data)
-             img.clear_exif()
-             img.clear_iptc()
-             img.clear_xmp()
-             img.close()
-         except Exception as e:
-             print('EXIF error on {}: {}'.format(fn,str(e)))
+     ##%% List files
+
+     assert os.path.isdir(image_base_folder), \
+         'Could not find folder {}'.format(image_base_folder)
+     all_files = [f for f in glob.glob(image_base_folder + "*/**", recursive=recursive)]
+     image_files = [s for s in all_files if \
+                    (s.lower().endswith('.jpg') or s.lower().endswith('.jpeg'))]
 
 
      ##%% Remove EXIF data (execution)
 
-     from joblib import Parallel, delayed
-
-     n_exif_threads = 50
-
-     if n_exif_threads == 1:
+     if n_processes == 1:
 
          # fn = image_files[0]
-         for fn in image_files:
-             remove_exif(fn)
+         for fn in tqdm(image_files):
+             remove_exif_from_image(fn)
 
      else:
-         # joblib.Parallel defaults to a process-based backend, but let's be sure
-         # results = Parallel(n_jobs=n_exif_threads,verbose=2,prefer='processes')(delayed(remove_exif)(fn) for fn in image_files[0:10])
-         _ = Parallel(n_jobs=n_exif_threads,verbose=2,prefer='processes')(delayed(remove_exif)(fn) for fn in image_files)
+         # pyexiv2 is not thread-safe, so we need to use processes
+         print('Starting parallel process pool with {} workers'.format(n_processes))
+         pool = Pool(n_processes)
+         _ = list(tqdm(pool.imap(remove_exif_from_image,image_files),total=len(image_files)))
 
- if __name__ == '__main__':
-     main()
+ # ...remove_exif(...)
+
+
+ #%% Command-line driver
+
+ ## TODO
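
The command-line driver is still a TODO in this release; below is a minimal sketch of what one might look like, built on the remove_exif() signature above (the flag names are hypothetical, not part of the package).

# Hypothetical command-line driver for remove_exif(); not part of this
# release (the module marks the driver as TODO), and the flag names below
# are invented for illustration.
import argparse

from megadetector.data_management.remove_exif import remove_exif

def main():

    parser = argparse.ArgumentParser(
        description='Remove EXIF/IPTC/XMP metadata from all jpegs in a folder')
    parser.add_argument('image_base_folder',
                        help='folder from which EXIF data should be removed')
    parser.add_argument('--n_processes', type=int, default=1,
                        help='number of concurrent worker processes')
    parser.add_argument('--nonrecursive', action='store_true',
                        help='only process the top-level folder')
    args = parser.parse_args()

    remove_exif(args.image_base_folder,
                recursive=(not args.nonrecursive),
                n_processes=args.n_processes)

if __name__ == '__main__':
    main()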