megadetector-5.0.15-py3-none-any.whl → megadetector-5.0.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic.
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +387 -0
- megadetector/data_management/importers/snapshot_safari_importer_reprise.py +28 -16
- megadetector/data_management/lila/generate_lila_per_image_labels.py +3 -3
- megadetector/data_management/lila/test_lila_metadata_urls.py +2 -2
- megadetector/data_management/remove_exif.py +61 -36
- megadetector/data_management/yolo_to_coco.py +25 -6
- megadetector/detection/process_video.py +270 -127
- megadetector/detection/pytorch_detector.py +13 -11
- megadetector/detection/run_detector.py +9 -2
- megadetector/detection/run_detector_batch.py +8 -1
- megadetector/detection/run_inference_with_yolov5_val.py +58 -10
- megadetector/detection/tf_detector.py +8 -2
- megadetector/detection/video_utils.py +214 -18
- megadetector/postprocessing/md_to_coco.py +31 -9
- megadetector/postprocessing/postprocess_batch_results.py +23 -7
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +5 -2
- megadetector/postprocessing/subset_json_detector_output.py +22 -12
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -3
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +2 -1
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
- megadetector/taxonomy_mapping/simple_image_download.py +5 -0
- megadetector/taxonomy_mapping/species_lookup.py +1 -1
- megadetector/utils/ct_utils.py +48 -0
- megadetector/utils/md_tests.py +231 -56
- megadetector/utils/path_utils.py +2 -2
- megadetector/utils/torch_test.py +32 -0
- megadetector/utils/url_utils.py +101 -4
- megadetector/visualization/visualization_utils.py +21 -6
- megadetector/visualization/visualize_db.py +16 -0
- {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/LICENSE +0 -0
- {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/METADATA +5 -7
- {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/RECORD +34 -32
- {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/WHEEL +1 -1
- {megadetector-5.0.15.dist-info → megadetector-5.0.17.dist-info}/top_level.txt +0 -0
megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py
@@ -0,0 +1,387 @@
+"""
+
+import_desert_lion_conservation_camera_traps.py
+
+Prepare the Desert Lion Conservation Camera Traps dataset for release on LILA.
+
+"""
+
+#%% Imports and constants
+
+import os
+import json
+
+input_base_folder = r'i:/data/desert-lion'
+assert os.path.isdir(input_base_folder)
+
+# md_results_file = r'i:/data/desert-lion/desert-lion-camera-traps-2024-07-14-v5a.0.0_detections-all.json'
+md_results_file = r'i:/data/desert-lion/desert-lion-camera-traps-2024-07-14-v5a.0.0_detections.json'
+assert os.path.isfile(md_results_file)
+
+export_base = os.path.join(input_base_folder,'annotated-imgs')
+assert os.path.isdir(export_base)
+
+preview_dir = r'g:\temp\desert-lion-viz'
+output_file = os.path.join(input_base_folder,'desert_lion_camera_traps.json')
+output_zipfile = os.path.join(input_base_folder,'desert-lion-camera-traps-images.zip')
+
+exif_cache_file_post_exif_removal = os.path.join(input_base_folder,'exif_data_post_exif_removal.json')
+exif_cache_file = os.path.join(input_base_folder,'exif_data.json')
+
+
+#%% Find images and videos
+
+from megadetector.detection.video_utils import find_videos
+from megadetector.utils.path_utils import find_images
+
+video_files = find_videos(input_base_folder,recursive=True,return_relative_paths=True,convert_slashes=True)
+image_files = find_images(input_base_folder,recursive=True,return_relative_paths=True,convert_slashes=True)
+
+n_annotated_imgs = len([fn for fn in image_files if 'annotated-imgs' in fn])
+print('Found {} images ({} in the annotated-imgs folder), {} videos'.format(
+    len(image_files),n_annotated_imgs,len(video_files)))
+
+
+#%% Read EXIF data
+
+from megadetector.data_management.read_exif import read_exif_from_folder, ReadExifOptions
+
+exif_options = ReadExifOptions()
+exif_options.n_workers = 10
+
+if os.path.isfile(exif_cache_file):
+    print('EXIF cache {} exists, skipping EXIF read'.format(exif_cache_file))
+    with open(exif_cache_file,'r') as f:
+        exif_data = json.load(f)
+else:
+    exif_data = read_exif_from_folder(input_folder=input_base_folder,
+                                      output_file=exif_cache_file,
+                                      options=exif_options,
+                                      filenames=None,
+                                      recursive=True)
+
+assert len(exif_data) == len(image_files)
+
+
+#%% Remove EXIF data
+
+from megadetector.data_management.remove_exif import remove_exif
+remove_exif(input_base_folder,recursive=True,n_processes=1)
+
+
+#%% Read EXIF data again
+
+exif_data_post_exif_removal = read_exif_from_folder(input_folder=input_base_folder,
+                                                    output_file=exif_cache_file_post_exif_removal,
+                                                    options=exif_options,
+                                                    filenames=None,
+                                                    recursive=True)
+
+
+#%% Make sure no lat/lon data is present
+
+from tqdm import tqdm
+
+for i_image,im in enumerate(tqdm(exif_data_post_exif_removal)):
+    tags = im['exif_tags']
+    if tags is None:
+        continue
+    for k in tags:
+        assert 'gps' not in str(k).lower()
+
+
+#%% Look for images that contain humans
+
+with open(md_results_file,'r') as f:
+    md_results = json.load(f)
+
+assert len(md_results['images']) == len(image_files)
+
+human_threshold = 0.1
+human_categories = ['2','3']
+
+candidate_human_images = set()
+failed_images = set()
+
+# i_image = 0; im = md_results['images'][0]
+for i_image,im in tqdm(enumerate(md_results['images']),total=len(md_results['images'])):
+
+    if 'failure' in im:
+        failed_images.add(im['file'])
+        continue
+
+    for det in im['detections']:
+        if det['category'] in human_categories and det['conf'] >= human_threshold:
+            candidate_human_images.add(im['file'])
+            break
+
+    # ...for each detection
+
+# ...for each image
+
+print('Found {} failed images and {} candidate human images'.format(
+    len(failed_images),len(candidate_human_images)))
+
+
+#%% Copy failed images and human images to a temporary folder for review
+
+review_folder_base = r'g:/temp/review_images'
+os.makedirs(review_folder_base,exist_ok=True)
+
+images_to_review = failed_images.union(candidate_human_images)
+images_to_review = sorted(list(images_to_review))
+
+source_file_to_target_file = {}
+
+# fn_relative = images_to_review[0]
+for fn_relative in images_to_review:
+    assert '\\' not in fn_relative
+    fn_abs_source = input_base_folder + '/' + fn_relative
+    assert os.path.isfile(fn_abs_source)
+    fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+    source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+from megadetector.utils.path_utils import parallel_copy_files
+
+parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                    max_workers=16,
+                    use_threads=True,
+                    overwrite=False,verbose=False)
+
+
+#%% Copy videos to a temporary folder for review
+
+review_folder_base = r'g:/temp/review_videos'
+os.makedirs(review_folder_base,exist_ok=True)
+
+source_file_to_target_file = {}
+
+# fn_relative = video_files[0]
+for fn_relative in video_files:
+    assert '\\' not in fn_relative
+    fn_abs_source = input_base_folder + '/' + fn_relative
+    assert os.path.isfile(fn_abs_source)
+    fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+    source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+from megadetector.utils.path_utils import parallel_copy_files
+
+parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                    max_workers=16,
+                    use_threads=True,
+                    overwrite=False,verbose=False)
+
+
+#%% Track removed images
+
+removed_images = [
+    "annotated-imgs\panthera leo\Camera Trap\Events\X73Okngwe\2013\02\PvL_seq_41468415-4518-44d6-acac-2113b442f723\PICT0190.JPG",
+    "annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0039.JPG",
+    "annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0037.JPG",
+    "annotated-imgs\panthera leo\Camera Trap\Hoanib\FldPln_Arch\211011\PvL_seq_5a9c6379-6980-4ab8-903a-b3bcba2ad21b\PICT0038.JPG",
+    "annotated-imgs\panthera leo\Camera Trap\2015\09\PvL_seq_da9c9ab1-74a2-485e-b6e7-3827b0c2a2f0\20150924-RCX_0835.JPG",
+    "annotated-imgs\panthera leo\Camera Trap\2015\09\PvL_seq_b0c1c6c5-474e-4844-a66c-e2bf5513d47a\20150924-RCX_0841.JPG",
+    "annotated-imgs\oryx gazella\Camera Trap\Video_Clips\Leylands\CDY_0003.AVI"
+]
+
+removed_images = [fn.replace('\\','/') for fn in removed_images]
+
+
+#%% Map filenames to datetimes
+
+filename_to_datetime = {}
+n_valid_datetimes = 0
+
+# im = exif_data[0]
+for im in tqdm(exif_data):
+    if im['exif_tags'] is None or len(im['exif_tags']) == 0:
+        filename_to_datetime[im['file_name']] = None
+        continue
+    dt = im['exif_tags']['DateTime']
+    assert len(dt) == 19
+    filename_to_datetime[im['file_name']] = dt
+    n_valid_datetimes += 1
+
+print('\nFound datetime information for {} of {} images'.format(
+    n_valid_datetimes,len(exif_data)))
+
+
+#%% Convert "annotated_imgs" folder to COCO Camera Traps
+
+from megadetector.utils.path_utils import recursive_file_list
+
+species_name_to_category_id = {}
+
+filenames_relative = \
+    recursive_file_list(export_base,return_relative_paths=True,recursive=True,convert_slashes=True)
+
+short_species_names = ['aves','cn-owls','cn-francolins','cn-raptors',
+                       'columbidae','equus zebra hartmannae','numididae',
+                       'pteroclidae']
+
+images = []
+annotations = []
+n_datetimes = 0
+
+for fn in filenames_relative:
+
+    assert fn.lower().endswith('.jpg') or fn.lower().endswith('.avi') or fn.lower().endswith('.json')
+
+    if fn.lower().endswith('.json'):
+        continue
+
+    tokens = fn.split('/')
+    species_name = tokens[0]
+    assert species_name in short_species_names or len(species_name.split(' ')) == 2
+
+    if species_name not in species_name_to_category_id:
+        category_id = len(species_name_to_category_id)
+        species_name_to_category_id[species_name] = category_id
+    else:
+        category_id = species_name_to_category_id[species_name]
+
+    im = {}
+    im['id'] = fn
+    im['file_name'] = fn
+    im['location'] = 'unknown'
+
+    fn_for_datetime_lookup = 'annotated-imgs/' + fn
+    if (fn_for_datetime_lookup in filename_to_datetime) and \
+        (filename_to_datetime[fn_for_datetime_lookup] is not None):
+        im['datetime'] = filename_to_datetime[fn_for_datetime_lookup]
+        n_datetimes += 1
+
+    ann = {}
+    ann['image_id'] = im['id']
+    ann['id'] = im['id'] + ':ann_00'
+    ann['sequence_level_annotation'] = False
+    ann['category_id'] = category_id
+
+    images.append(im)
+    annotations.append(ann)
+
+# ...for each filename
+
+categories = []
+for species_name in species_name_to_category_id:
+    category = {}
+    category['name'] = species_name
+    category['id'] = species_name_to_category_id[species_name]
+    categories.append(category)
+
+info = {}
+info['version'] = '2024.07.15_00'
+info['description'] = 'Desert Lion Camera Traps'
+
+d = {}
+d['info'] = info
+d['images'] = images
+d['annotations'] = annotations
+d['categories'] = categories
+
+with open(output_file,'w') as f:
+    json.dump(d,f,indent=1)
+
+
+#%% Integrity check
+
+from megadetector.data_management.databases.integrity_check_json_db import \
+    IntegrityCheckOptions, integrity_check_json_db
+
+integrity_check_options = IntegrityCheckOptions()
+
+integrity_check_options.baseDir = export_base
+integrity_check_options.bCheckImageExistence = True
+integrity_check_options.bRequireLocation = True
+integrity_check_options.nThreads = 10
+integrity_check_options.verbose = True
+integrity_check_options.allowIntIDs = False
+
+integrity_check_results = integrity_check_json_db(output_file,integrity_check_options)
+
+
+#%% Preview
+
+from megadetector.visualization.visualize_db \
+    import DbVizOptions, visualize_db
+
+viz_options = DbVizOptions()
+viz_options.num_to_visualize = 2500
+
+html_output_file,_ = visualize_db(output_file, preview_dir, export_base, options=viz_options)
+
+from megadetector.utils.path_utils import open_file
+open_file(html_output_file)
+
+
+#%% Make MD results paths line up with the output
+
+md_results_remapped_file = md_results_file.replace('-all','')
+assert md_results_remapped_file != md_results_file
+
+with open(output_file,'r') as f:
+    d = json.load(f)
+
+image_filenames = [im['file_name'] for im in d['images']]
+image_filenames_set = set(image_filenames)
+
+with open(md_results_file,'r') as f:
+    md_results = json.load(f)
+
+md_results_images_remapped = []
+
+# im = md_results['images'][0]
+for im in md_results['images']:
+    assert im['file'].startswith('annotated-imgs/') or im['file'].startswith('bboxes/')
+    if im['file'].startswith('bboxes/'):
+        continue
+    im['file'] = im['file'].replace('annotated-imgs/','')
+    md_results_images_remapped.append(im)
+
+print('Keeping {} of {} images in MD results'.format(
+    len(md_results_images_remapped),len(md_results['images'])))
+
+d['images'] = md_results_images_remapped
+
+with open(md_results_remapped_file,'w') as f:
+    json.dump(d,f,indent=1)
+
+
+#%% Zip MD results and COCO file
+
+from megadetector.utils.path_utils import zip_file
+
+zip_file(input_fn=md_results_remapped_file, output_fn=None, overwrite=True, verbose=True, compresslevel=9)
+zip_file(input_fn=output_file, output_fn=None, overwrite=True, verbose=True, compresslevel=9)
+
+
+#%% Zip images
+
+from megadetector.utils.path_utils import zip_folder
+
+zip_folder(input_folder=export_base, output_fn=output_zipfile, overwrite=True, verbose=True, compresslevel=0)
+
+
+#%% Copy lion images to a folder for thumbnail selection
+
+review_folder_base = r'g:/temp/thumbnail-candidates'
+os.makedirs(review_folder_base,exist_ok=True)
+
+source_file_to_target_file = {}
+
+# fn_relative = image_files[0]
+for fn_relative in image_files:
+    assert '\\' not in fn_relative
+    if '/lion/' not in fn_relative and '/panthera leo/' not in fn_relative:
+        continue
+    fn_abs_source = input_base_folder + '/' + fn_relative
+    assert os.path.isfile(fn_abs_source)
+    fn_abs_dest = review_folder_base + '/' + fn_relative.replace('/','_')
+    source_file_to_target_file[fn_abs_source] = fn_abs_dest
+
+from megadetector.utils.path_utils import parallel_copy_files
+
+parallel_copy_files(input_file_to_output_file=source_file_to_target_file,
+                    max_workers=16,
+                    use_threads=True,
+                    overwrite=False,verbose=False)
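For readers unfamiliar with the COCO Camera Traps format this new importer writes, the resulting desert_lion_camera_traps.json has roughly the following shape. This is a minimal sketch based only on the fields populated above; the file path, datetime, and species label in the example are hypothetical.

# Minimal sketch of the output structure (illustrative values only)
example_coco_camera_traps = {
    'info': {'version': '2024.07.15_00', 'description': 'Desert Lion Camera Traps'},
    'images': [{
        'id': 'panthera leo/Camera Trap/example/PICT0001.JPG',    # hypothetical path
        'file_name': 'panthera leo/Camera Trap/example/PICT0001.JPG',
        'location': 'unknown',
        'datetime': '2015:09:24 06:12:31'    # present only when EXIF contained a DateTime tag
    }],
    'annotations': [{
        'id': 'panthera leo/Camera Trap/example/PICT0001.JPG:ann_00',
        'image_id': 'panthera leo/Camera Trap/example/PICT0001.JPG',
        'sequence_level_annotation': False,
        'category_id': 0
    }],
    'categories': [{'id': 0, 'name': 'panthera leo'}]
}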
megadetector/data_management/importers/snapshot_safari_importer_reprise.py
@@ -24,7 +24,7 @@ from collections import defaultdict
 
 from megadetector.utils import path_utils
 
-input_base = '
+input_base = 'e:/'
 output_base = os.path.expanduser('~/data/snapshot-safari-metadata')
 file_list_cache_file = os.path.join(output_base,'file_list.json')
 
@@ -76,23 +76,16 @@ print('Found a total of {} files, {} of which are images'.format(
     len(all_files_relative),len(all_image_files)))
 
 
-#%% Copy all csv files to the annotation cache folder
+#%% Copy all .csv files to the annotation cache folder
 
 # fn = csv_files[0]
-for fn in csv_files:
+for fn in tqdm(csv_files):
+
     target_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
     source_file = os.path.join(input_base,fn)
     shutil.copyfile(source_file,target_file)
 
-
-"""
-Later cells will ask to read a .csv file from the original hard drive;
-read from the annotation cache instead.
-"""
-
-cached_csv_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
-df = pd.read_csv(cached_csv_file)
-return df
+print('Copied {} .csv files to cache folder'.format(len(csv_files)))
 
 
 #%% List project folders
@@ -123,6 +116,21 @@ project_folder_to_project_code = {v: k for k, v in project_code_to_project_folde
 project_codes = sorted(list(project_code_to_project_folder.keys()))
 project_folders = sorted(list(project_code_to_project_folder.values()))
 
+print('Eumerated {} project folders'.format(len(project_folders)))
+
+
+#%% Support functions
+
+def read_cached_csv_file(fn):
+    """
+    Later cells will ask to read a .csv file from the original hard drive;
+    read from the annotation cache instead.
+    """
+
+    cached_csv_file = os.path.join(annotation_cache_dir,os.path.basename(fn))
+    df = pd.read_csv(cached_csv_file)
+    return df
+
 def file_to_project_folder(fn):
     """
     For a given filename relative to the drive root, return the corresponding
@@ -138,7 +146,6 @@ def file_to_project_folder(fn):
     assert project_folder in project_folders
     return project_folder
 
-
 def file_to_project_code(fn):
     """
     For a given filename relative to the drive root, return the corresponding
@@ -147,6 +154,9 @@ def file_to_project_code(fn):
 
     return project_folder_to_project_code[file_to_project_folder(fn)]
 
+
+#%% Consistency checking
+
 assert file_to_project_folder(
     'APN/APN_S2/DW/DW_R5/APN_S2_DW_R5_IMAG0003.JPG') == 'APN'
 assert file_to_project_folder(
@@ -163,9 +173,11 @@ assert file_to_project_code(
 #
 # E.g.:
 #
-# 'DHP': [
-#
-#
+# 'DHP': [
+# 'Snapshot South Africa/DHP/LILA_Reports/DHP_S1_report_lila.csv',
+# 'Snapshot South Africa/DHP/LILA_Reports/DHP_S2_report_lila.csv',
+# 'Snapshot South Africa/DHP/LILA_Reports/DHP_S3_report_lila.csv'
+# ]
 #
 project_code_to_report_files = defaultdict(list)
 
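The net effect of the changes above is that the ad-hoc cached-CSV read is now wrapped in the read_cached_csv_file helper; later cells that consume LILA report .csv files can presumably call it along these lines (the report path is just the illustrative one from the comment above):

df = read_cached_csv_file('Snapshot South Africa/DHP/LILA_Reports/DHP_S1_report_lila.csv')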
megadetector/data_management/lila/generate_lila_per_image_labels.py
@@ -354,7 +354,7 @@ print('Processed {} datasets'.format(len(metadata_table)))
 #%% Read the .csv back
 
 df = pd.read_csv(output_file)
-print('Read {}
+print('Read {} rows from {}'.format(len(df),output_file))
 
 
 #%% Do some post-hoc integrity checking
@@ -403,9 +403,9 @@ else:
     check_row(row)
 
 
-#%% Check for datasets that have only one location string
+#%% Check for datasets that have only one location string (typically "unknown")
 
-# Expected: ENA24, Missouri Camera Traps
+# Expected: ENA24, Missouri Camera Traps, Desert Lion Conservation Camera Traps
 
 for ds_name in dataset_name_to_locations.keys():
     if len(dataset_name_to_locations[ds_name]) == 1:
megadetector/data_management/lila/test_lila_metadata_urls.py
@@ -72,8 +72,8 @@ for ds_name in metadata_table.keys():
 url_to_source = {}
 
 # The first image in a dataset is disproportionately likely to be human (and thus 404),
-# so we pick a semi-arbitrary image that isn't the first. How about the
-image_index =
+# so we pick a semi-arbitrary image that isn't the first. How about the 2000th?
+image_index = 2000
 
 # ds_name = list(metadata_table.keys())[0]
 for ds_name in metadata_table.keys():
megadetector/data_management/remove_exif.py
@@ -3,64 +3,89 @@
 remove_exif.py
 
 Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
-backup copies, using pyexiv2.
+backup copies, using pyexiv2. Ignores non-jpeg images.
 
-
+This module is rarely used, and pyexiv2 is not thread-safe, so pyexiv2 is not
+included in package-level dependency lists. YMMV.
 
 """
 
-input_base = r'f:\images'
-
-
 #%% Imports and constants
 
 import os
 import glob
 
-
-
-assert os.path.isdir(input_base)
+from multiprocessing.pool import Pool as Pool
+from tqdm import tqdm
 
-##%% List files
 
-
-image_files = [s for s in all_files if (s.lower().endswith('.jpg'))]
-
+#%% Support functions
 
-
+# Pyexif2 is not thread safe, do not call this function in parallel within a process
+#
+# Parallelizing across processes is fine.
+def remove_exif_from_image(fn):
 
     import pyexiv2
+
+    try:
+        img = pyexiv2.Image(fn)
+        img.clear_exif()
+        img.clear_iptc()
+        img.clear_xmp()
+        img.close()
+    except Exception as e:
+        print('EXIF error on {}: {}'.format(fn,str(e)))
+
+    return True
+
+
+#%% Remove EXIF data
+
+def remove_exif(image_base_folder,recursive=True,n_processes=1):
+    """
+    Removes all EXIF/IPTC/XMP metadata from a folder of images, without making
+    backup copies, using pyexiv2. Ignores non-jpeg images.
+
+    Args:
+        image_base_folder (str): the folder from which we should remove EXIF data
+        recursive (bool, optional): whether to process [image_base_folder] recursively
+        n_processes (int, optional): number of concurrent workers. Because pyexiv2 is not
+            thread-safe, only process-based parallelism is supported.
+    """
+    try:
+        import pyexiv2 #noqa
+    except:
+        print('pyexiv2 not available; try "pip install pyexiv2"')
+        raise
 
-# PYEXIV2 IS NOT THREAD SAFE; DO NOT CALL THIS IN PARALLEL FROM A SINGLE PROCESS
-def remove_exif(fn):
 
-
-
-
-
-
-
-
-    except Exception as e:
-        print('EXIF error on {}: {}'.format(fn,str(e)))
+    ##%% List files
+
+    assert os.path.isdir(image_base_folder), \
+        'Could not find folder {}'.format(image_base_folder)
+    all_files = [f for f in glob.glob(image_base_folder+ "*/**", recursive=recursive)]
+    image_files = [s for s in all_files if \
+                   (s.lower().endswith('.jpg') or s.lower().endswith('.jpeg'))]
 
 
     ##%% Remove EXIF data (execution)
 
-
-
-n_exif_threads = 50
-
-if n_exif_threads == 1:
+    if n_processes == 1:
 
         # fn = image_files[0]
-    for fn in image_files:
-
+        for fn in tqdm(image_files):
+            remove_exif_from_image(fn)
 
     else:
-    #
-
-
+        # pyexiv2 is not thread-safe, so we need to use processes
+        print('Starting parallel process pool with {} workers'.format(n_processes))
+        pool = Pool(n_processes)
+        _ = list(tqdm(pool.imap(remove_exif_from_image,image_files),total=len(image_files)))
 
-
-
+# ...remove_exif(...)
+
+
+#%% Command-line driver
+
+## TODO