megadetector 5.0.6__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,474 @@
+ ########
+ #
+ # create_lila_blank_set.py
+ #
+ # Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
+ # locations will be oversampled relative to more common locations. We'll also run MegaDetector
+ # to minimize the chance that incorrectly-labeled non-empty images sneak into our blank set.
+ #
+ ########
+
+ #%% Constants and imports
+
+ import os
+ import random
+ import math
+ import json
+ import shutil
+
+ import numpy as np
+ from tqdm import tqdm
+ from multiprocessing.pool import ThreadPool
+ from urllib.parse import urlparse
+ from collections import defaultdict
+
+ from data_management.lila.lila_common import \
+     read_lila_all_images_file, azure_url_to_gcp_http_url
+ from md_utils.url_utils import download_url
+ from md_visualization import visualization_utils as vis_utils
+ from md_utils.path_utils import recursive_file_list
+
+ # We'll write images, metadata downloads, and temporary files here
+ lila_local_base = os.path.expanduser('~/lila')
+
+ metadata_dir = os.path.join(lila_local_base,'metadata')
+ os.makedirs(metadata_dir,exist_ok=True)
+
+ project_base = os.path.join(lila_local_base,'lila_blanks')
+
+ candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
+ os.makedirs(candidate_blanks_base,exist_ok=True)
+
+ confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
+ os.makedirs(confirmed_blanks_base,exist_ok=True)
+
+ md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
+ os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
+
+ preferred_image_download_source = 'gcp'
+
+ # Number of concurrent download threads
+ n_download_threads = 20
+
+ n_blanks = 100000
+
+ random.seed(0)
+
+
+ #%% Download and open the giant table of image URLs and labels
+
+ # ~60 seconds to download, unzip, and open
+ df = read_lila_all_images_file(metadata_dir)
+
+
+ #%% Explore blank labels
+
+ # Original labels we're treating as blank:
+ blank_original_labels = (
+     'empty','misfire'
+     )
+
+ # Some notable original labels we're *not* treating as blank:
+ nonblank_original_labels = (
+     'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
+     'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
+     'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
+     'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
+     )
+
+ other_labels_without_common_names = (
+     'car', 'motorcycle', 'vehicle'
+     )
+
+ common_names = sorted(list(df['common_name'].unique()),
+                       key=lambda x:str(x) if isinstance(x,float) else x)
+ original_labels = sorted(list(df['original_label'].unique()),
+                          key=lambda x:str(x) if isinstance(x,float) else x)
+
+ # Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
+ assert '' not in common_names
+ assert all([s not in common_names for s in blank_original_labels])
+ assert all([s not in common_names for s in nonblank_original_labels])
+ assert np.nan in common_names
+
+ # Blanks are represented as "empty" or "misfire" in the "original_label" column
+ assert all([s in original_labels for s in blank_original_labels])
+ assert all([s in original_labels for s in nonblank_original_labels])
+ assert all([s in original_labels for s in other_labels_without_common_names])
+ assert all([s not in original_labels for s in ('','blank','none',np.nan)])
+
+
+ #%% Count empty labels and common names
+
+ common_names_with_empty_original_labels = set()
+ original_labels_with_nan_common_names = set()
+
+ common_name_to_count = defaultdict(int)
+ original_label_to_count = defaultdict(int)
+
+ # This loop takes ~10 mins
+ for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+     common_name = row['common_name']
+     original_label = row['original_label']
+
+     if isinstance(common_name,float):
+         assert np.isnan(common_name)
+         original_labels_with_nan_common_names.add(original_label)
+
+     common_name = str(common_name)
+
+     assert isinstance(original_label,str)
+     if original_label in blank_original_labels:
+         common_names_with_empty_original_labels.add(common_name)
+     common_name_to_count[common_name] += 1
+     original_label_to_count[original_label] += 1
+
+
+ #%% Look at the most common labels and common names
+
+ from md_utils.ct_utils import sort_dictionary_by_value
+ common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+ original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)
+
+ k = 10
+
+ print('\nMost frequent common names:\n')
+
+ i_label = 0
+ for i_label,s in enumerate(common_name_to_count):
+     if i_label >= k:
+         break
+     print('{}: {}'.format(s,common_name_to_count[s]))
+
+ print('\nMost frequent original labels:\n')
+
+ i_label = 0
+ for i_label,s in enumerate(original_label_to_count):
+     if i_label >= k:
+         break
+     print('{}: {}'.format(s,original_label_to_count[s]))
+
+
+ #%% Do some consistency checks over the empty labels and stats
+
+ # All images called 'empty' should have NaN as their common name
+ assert (len(common_names_with_empty_original_labels) == 1)
+ assert next(iter(common_names_with_empty_original_labels)) == 'nan'
+
+ # 'empty' should be the most frequent original label overall
+ assert next(iter(original_label_to_count)) == 'empty'
+
+ # NaN should be the most frequent common name overall
+ assert next(iter(common_name_to_count)) == 'nan'
+
+ for s in original_labels_with_nan_common_names:
+     assert \
+         (s in blank_original_labels) or \
+         (s in nonblank_original_labels) or \
+         (s in other_labels_without_common_names)
+
+
+ #%% Map locations to blank images
+
+ location_to_blank_image_urls_cache_file = os.path.join(project_base,
+                                                        'location_to_blank_image_urls.json')
+
+ force_map_locations = False
+
+ # Load from .json if available
+ if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):
+
+     with open(location_to_blank_image_urls_cache_file,'r') as f:
+         location_to_blank_image_urls = json.load(f)
+
+ else:
+
+     location_to_blank_image_urls = defaultdict(list)
+
+     # i_row = 0; row = df.iloc[i_row]
+     for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+         location_id = row['location_id']
+         url = row['url']
+
+         original_label = row['original_label']
+         if original_label in blank_original_labels:
+             assert np.isnan(row['common_name'])
+             location_to_blank_image_urls[location_id].append(url)
+
+     with open(location_to_blank_image_urls_cache_file,'w') as f:
+         json.dump(location_to_blank_image_urls,f,indent=1)
+
+ n_locations_with_blanks = len(location_to_blank_image_urls)
+ print('Found {} locations with blank images'.format(n_locations_with_blanks))
+
+
+ #%% Sample blanks
+
+ random.seed(0)
+
+ # Make a fresh copy of the lists
+ location_to_unsampled_blank_image_urls = {}
+
+ # location = next(iter(location_to_blank_image_urls.keys()))
+ for location in location_to_blank_image_urls:
+     blank_image_urls_this_location = location_to_blank_image_urls[location]
+     unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
+     location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location
+
+ # Put locations in a random order
+ location_ids = list(location_to_unsampled_blank_image_urls.keys())
+ random.shuffle(location_ids)
+
+ blank_urls = []
+ location_to_sampled_blanks = defaultdict(list)
+ fully_sampled_locations = set()
+
+ # Pick from each location until we hit our limit or have no blanks left
+ while(True):
+
+     found_sample = False
+
+     # location = location_ids[0]
+     for location in location_ids:
+
+         unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
+         if len(unsampled_images_this_location) == 0:
+             fully_sampled_locations.add(location)
+             continue
+
+         url = random.choice(unsampled_images_this_location)
+         blank_urls.append(url)
+         location_to_unsampled_blank_image_urls[location].remove(url)
+         location_to_sampled_blanks[location].append(url)
+         found_sample = True
+
+         if len(blank_urls) == n_blanks:
+             break
+
+     # ...for each location
+
+     if not found_sample:
+         print('Terminating after {} blanks, we ran out before hitting {}'.format(
+             len(blank_urls),n_blanks))
+
+     if len(blank_urls) == n_blanks:
+         break
+
+ # ...while(True)
+
+ assert len(blank_urls) <= n_blanks
+ min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
+ max_blanks_per_location = -1
+ for location in location_to_sampled_blanks:
+     n_blanks_this_location = len(location_to_sampled_blanks[location])
+     if n_blanks_this_location >= max_blanks_per_location:
+         max_blanks_per_location = n_blanks_this_location
+     assert (location in fully_sampled_locations) or \
+         n_blanks_this_location >= min_blanks_per_location
+
+ print('Choose {} blanks from {} locations'.format(n_blanks,len(location_ids)))
+ print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
+ print('Max samples per location: {}'.format(max_blanks_per_location))
+
+
+ #%% Download those image files (prep)
+
+ container_to_url_base = {
+     'lilablobssc.blob.core.windows.net':'/',
+     'storage.googleapis.com':'/public-datasets-lila/'
+ }
+
+ def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
+     """
+     Download a URL to output_base, preserving relative path
+     """
+
+     result = {'status':'unknown','url':url,'destination_filename':None}
+
+     if url_base is None:
+         assert url.startswith('https://')
+         container = url.split('/')[2]
+         assert container in container_to_url_base
+         url_base = container_to_url_base[container]
+
+     assert url_base.startswith('/') and url_base.endswith('/')
+
+     p = urlparse(url)
+     relative_filename = str(p.path)
+     # remove the leading '/'
+     assert relative_filename.startswith(url_base)
+     relative_filename = relative_filename.replace(url_base,'',1)
+
+     destination_filename = os.path.join(output_base,relative_filename)
+     result['destination_filename'] = destination_filename
+
+     if ((os.path.isfile(destination_filename)) and (not overwrite)):
+         result['status'] = 'skipped'
+         return result
+     try:
+         download_url(url, destination_filename, verbose=verbose)
+     except Exception as e:
+         print('Warning: error downloading URL {}: {}'.format(
+             url,str(e)))
+         result['status'] = 'error: {}'.format(str(e))
+         return result
+
+     result['status'] = 'success'
+     return result
+
+ # Convert Azure URLs to GCP URLs if necessary
+ if preferred_image_download_source != 'azure':
+     assert preferred_image_download_source == 'gcp'
+     blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]
+
+
+ #%% Download those image files (execution)
+
+ print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))
+
+ if n_download_threads <= 1:
+
+     results = []
+
+     # url = all_urls[0]
+     for url in tqdm(blank_urls):
+         results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))
+
+ else:
+
+     pool = ThreadPool(n_download_threads)
+     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
+         s,candidate_blanks_base,url_base=None),
+         blank_urls), total=len(blank_urls)))
+
+     # pool.terminate()
+
+
+ #%% Review results
+
+ error_urls = []
+ for r in results:
+     if r['status'] != 'success':
+         error_urls.append(r['url'])
+
+ print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
+
+
+ #%% Run MegaDetector on the folder
+
+ md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+ cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
+     candidate_blanks_base,md_results_file)
+ cmd += ' --recursive --output_relative_filenames'
+
+ import clipboard; clipboard.copy(cmd); print(cmd)
+
+
+ #%% Review MD results that suggests images are non-empty
+
+ assert os.path.isfile(md_results_file)
+
+ category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
+ min_threshold = min(category_name_to_threshold.values())
+ with open(md_results_file,'r') as f:
+     md_results = json.load(f)
+
+ images_to_review_to_detections = {}
+
+ category_id_to_threshold = {}
+ for category_id in md_results['detection_categories']:
+     category_name = md_results['detection_categories'][category_id]
+     category_id_to_threshold[category_id] = category_name_to_threshold[category_name]
+
+ # im = md_results['images'][0]
+ for im in md_results['images']:
+
+     if 'detections' not in im:
+         continue
+
+     found_object = False
+     for det in im['detections']:
+         threshold = category_id_to_threshold[det['category']]
+         if det['conf'] >= threshold:
+             found_object = True
+             break
+     if found_object:
+         images_to_review_to_detections[im['file']] = im['detections']
+
+ print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),len(md_results['images'])))
+
+ output_file_to_source_file = {}
+
+ # i_fn = 0; source_file_relative = images_to_review[i_fn]
+ for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
+                                       total=len(images_to_review_to_detections)):
+
+     source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
+     assert os.path.isfile(source_file_abs)
+     ext = os.path.splitext(source_file_abs)[1]
+     target_file_relative = str(i_fn).zfill(8) + ext
+     target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
+     output_file_to_source_file[target_file_relative] = source_file_relative
+     # shutil.copyfile(source_file_abs,target_file_abs)
+     vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
+                                           output_file=target_file_abs,
+                                           detections=images_to_review_to_detections[source_file_relative],
+                                           confidence_threshold=min_threshold,
+                                           target_size=(1280,-1))
+
+ with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
+     json.dump(output_file_to_source_file,f,indent=1)
+
+
+ #%% Manual review
+
+ # Delete images that are *not* empty
+
+
+ #%% Figure out which images are still there; these are the actually-blank ones
+
+ remaining_images = set(os.listdir(md_possible_non_blanks_folder))
+ print('Kept {} of {} candidate blank images'.format(len(remaining_images),
+                                                     len(images_to_review_to_detections)))
+
+ removed_blank_images_relative = []
+
+ # output_file = next(iter(output_file_to_source_file.keys()))
+ for output_file in tqdm(output_file_to_source_file.keys()):
+     if output_file not in remaining_images:
+         source_file_relative = output_file_to_source_file[output_file]
+         removed_blank_images_relative.append(source_file_relative)
+
+ assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
+
+
+ #%% Copy all the confirmed blanks to the confirmed folder
+
+ all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
+ print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
+
+ for source_fn_relative in tqdm(all_candidate_blanks):
+     source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
+     assert os.path.isfile(source_fn_abs)
+     target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
+     os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
+     shutil.copyfile(source_fn_abs,target_fn_abs)
+
+
+ #%% Record location information for each file
+
+ fn_relative_to_location = {}
+ for location in location_to_blank_image_urls:
+     urls_this_location = location_to_blank_image_urls[location]
+     for url in urls_this_location:
+         fn_relative = url.split('//')[1]
+         fn_relative_to_location[fn_relative] = location
+
+ all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+ print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
+
+ for fn_relative in all_confirmed_blanks:
+     assert fn_relative in fn_relative_to_location
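
As an aside (not part of the diff): the new create_lila_blank_set.py balances its sample across locations by visiting locations round-robin in a random order, so rarely-photographed locations are oversampled relative to their share of the data. Below is a minimal, self-contained sketch of that strategy; the names (sample_blanks_evenly, location_to_urls) are illustrative and do not appear in the package.

# Illustrative sketch of location-balanced sampling (names are hypothetical)
import random
from collections import defaultdict

def sample_blanks_evenly(location_to_urls, n_samples, seed=0):
    """Draw up to n_samples URLs, cycling through locations in random order."""
    rng = random.Random(seed)
    unsampled = {loc: list(urls) for loc, urls in location_to_urls.items()}
    location_ids = list(unsampled.keys())
    rng.shuffle(location_ids)
    per_location = defaultdict(list)
    selected = []
    while len(selected) < n_samples:
        found_sample = False
        # One pass over all locations, taking at most one image from each
        for loc in location_ids:
            if len(selected) >= n_samples:
                break
            if not unsampled[loc]:
                continue
            url = rng.choice(unsampled[loc])
            unsampled[loc].remove(url)
            per_location[loc].append(url)
            selected.append(url)
            found_sample = True
        if not found_sample:
            # Every location is exhausted; stop short of n_samples
            break
    return selected, per_location

# Example: locations of very different sizes end up with comparable counts
locations = {'A': ['a%d' % i for i in range(1000)],
             'B': ['b%d' % i for i in range(10)],
             'C': ['c%d' % i for i in range(3)]}
urls, per_location = sample_blanks_evenly(locations, 20)
print({loc: len(v) for loc, v in per_location.items()})  # roughly {'A': 9, 'B': 8, 'C': 3}
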
@@ -124,6 +124,8 @@ for ds_name in metadata_table.keys():

  #%% Download those image files

+ # TODO: trivially parallelizable
+ #
  # ds_name = (list(metadata_table.keys()))[0]
  for ds_name in metadata_table.keys():

@@ -147,4 +149,3 @@ for ds_name in metadata_table.keys():
      # ...for each url

  # ...for each dataset
-
@@ -57,7 +57,7 @@ for i_row,row in df.iterrows():
          df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
          df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
      else:
-         # Exclude single-season files for snpashot-serengeti
+         # Exclude single-season files for snapshot-serengeti
          if dataset_shortname == 'snapshot-serengeti':
              matching_files = [fn for fn in matching_files if '_S' not in fn]
          assert len(matching_files) == 2
@@ -24,11 +24,11 @@ from urllib.parse import urlparse
  from collections import defaultdict

  from data_management.lila.lila_common import \
-     read_lila_all_images_file, read_lila_metadata, is_empty, azure_url_to_gcp_http_url
+     read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
  from md_utils.url_utils import download_url

  # If any of these strings appear in the common name of a species, we'll download that image
- species_of_interest = ['grey fox','red fox','leopard cat']
+ species_of_interest = ['grey fox','red fox','leopard cat','kiwi']

  # We'll write images, metadata downloads, and temporary files here
  lila_local_base = os.path.expanduser('~/lila')
@@ -40,30 +40,28 @@ output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
  os.makedirs(output_dir,exist_ok=True)

  # Number of concurrent download threads
- n_download_threads = 50
+ n_download_threads = 20

  max_images_per_dataset = 10 # None

  # This impacts the data download, but not the metadata download
+ #
+ # "Azure" really means "Azure if available"; recent datasets are only available
+ # on GCP.
  image_download_source = 'azure' # 'azure' or 'gcp'

  random.seed(0)


- #%% Download and open the giant table of image metadata
+ #%% Download and open the giant table of image URLs and labels

- # Opening this huge .csv file make take ~30 seconds
+ # ~60 seconds to download, unzip, and open
  df = read_lila_all_images_file(metadata_dir)


- #%% Download and parse the metadata file
-
- metadata_table = read_lila_metadata(metadata_dir)
-
-
  #%% Find all the images we want to download

- # Searching over the giant table can take a couple of minutes
+ # ~2 minutes

  ds_name_to_urls = defaultdict(list)

@@ -106,13 +104,24 @@ else:

  #%% Download those image files

- def download_relative_filename(url, output_base, verbose=False, url_base=None):
+ container_to_url_base = {
+     'lilablobssc.blob.core.windows.net':'/',
+     'storage.googleapis.com':'/public-datasets-lila/'
+ }
+
+ def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
      """
      Download a URL to output_base, preserving relative path
      """

+     result = {'status':'unknown','url':url,'destination_filename':None}
+
      if url_base is None:
-         url_base = '/'
+         assert url.startswith('https://')
+         container = url.split('/')[2]
+         assert container in container_to_url_base
+         url_base = container_to_url_base[container]
+
      assert url_base.startswith('/') and url_base.endswith('/')

      p = urlparse(url)
@@ -122,29 +131,45 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None):
      relative_filename = relative_filename.replace(url_base,'',1)

      destination_filename = os.path.join(output_base,relative_filename)
-     download_url(url, destination_filename, verbose=verbose)
+     result['destination_filename'] = destination_filename
+
+     if ((os.path.isfile(destination_filename)) and (not overwrite)):
+         result['status'] = 'skipped'
+         return result
+     try:
+         download_url(url, destination_filename, verbose=verbose)
+     except Exception as e:
+         print('Warning: error downloading URL {}: {}'.format(
+             url,str(e)))
+         result['status'] = 'error: {}'.format(str(e))
+         return result

+     result['status'] = 'success'
+     return result
+
+
+ # ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
  all_urls = list(ds_name_to_urls.values())
  all_urls = [item for sublist in all_urls for item in sublist]

- url_base = '/'
-
  # Convert Azure URLs to GCP URLs if necessary
  if image_download_source != 'azure':
      assert image_download_source == 'gcp'
-     url_base = '/public-datasets-lila/'
      all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]

- print('Downloading {} images with Python requests'.format(len(all_urls)))
+ print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))

  if n_download_threads <= 1:

+     results = []
+
      # url = all_urls[0]
      for url in tqdm(all_urls):
-         download_relative_filename(url,output_dir,verbose=True,url_base=url_base)
+         results.append(download_relative_filename(url,output_dir,url_base=None))

  else:

      pool = ThreadPool(n_download_threads)
-     tqdm(pool.imap(lambda s: download_relative_filename(s,output_dir,verbose=False,url_base=url_base),
-          all_urls), total=len(all_urls))
+     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
+         s,output_dir,url_base=None),
+         all_urls), total=len(all_urls)))
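
As a closing aside (not part of the diff): these changes give download_lila_subset.py, and the new create_lila_blank_set.py, the same download pattern: each worker returns a small status dict, files that already exist are skipped, errors are caught rather than raised, and failures are tallied at the end. Below is a minimal standalone sketch of that pattern; the helper names (download_one, download_all) are illustrative, and plain urllib stands in for md_utils.url_utils.download_url.

# Illustrative sketch of the error-tolerant, skip-existing, threaded download pattern
import os
import urllib.request
from multiprocessing.pool import ThreadPool
from urllib.parse import urlparse
from tqdm import tqdm

def download_one(url, output_base, overwrite=False):
    """Download url under output_base, preserving its relative path; never raises."""
    result = {'status': 'unknown', 'url': url, 'destination_filename': None}
    relative_filename = urlparse(url).path.lstrip('/')
    destination_filename = os.path.join(output_base, relative_filename)
    result['destination_filename'] = destination_filename
    if os.path.isfile(destination_filename) and not overwrite:
        result['status'] = 'skipped'
        return result
    try:
        os.makedirs(os.path.dirname(destination_filename), exist_ok=True)
        urllib.request.urlretrieve(url, destination_filename)
        result['status'] = 'success'
    except Exception as e:
        result['status'] = 'error: {}'.format(e)
    return result

def download_all(urls, output_base, n_workers=20):
    """Run download_one over every URL, serially or on a thread pool."""
    if n_workers <= 1:
        results = [download_one(u, output_base) for u in tqdm(urls)]
    else:
        pool = ThreadPool(n_workers)
        results = list(tqdm(pool.imap(lambda u: download_one(u, output_base), urls),
                            total=len(urls)))
    errors = [r for r in results if r['status'].startswith('error')]
    print('Errors on {} of {} downloads'.format(len(errors), len(results)))
    return results
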