megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,7 @@
 # what you want to query for, etc., is very application-specific; this is just meant as a
 # demo.
 #
-# Can download from either Azure or GCP.
+# Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
 #
 ########

@@ -20,15 +20,16 @@ import random

 from tqdm import tqdm
 from multiprocessing.pool import ThreadPool
-from urllib.parse import urlparse
 from collections import defaultdict

-from data_management.lila.lila_common import \
-    read_lila_all_images_file, read_lila_metadata, is_empty, azure_url_to_gcp_http_url
+from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
 from md_utils.url_utils import download_url

+for s in lila_base_urls.values():
+    assert s.endswith('/')
+
 # If any of these strings appear in the common name of a species, we'll download that image
-species_of_interest = ['grey fox','red fox','leopard cat']
+species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']

 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -40,30 +41,74 @@ output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
 os.makedirs(output_dir,exist_ok=True)

 # Number of concurrent download threads
-n_download_threads = 50
+n_download_threads = 20
+
+verbose = False

 max_images_per_dataset = 10 # None

 # This impacts the data download, but not the metadata download
-image_download_source = 'azure' # 'azure' or 'gcp'
+#
+# Setting this to "Azure" really means "Azure if available"; some datasets are
+# not available on Azure.
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'

 random.seed(0)


-#%% Download and open the giant table of image metadata
+#%% Support functions

-# Opening this huge .csv file make take ~30 seconds
-df = read_lila_all_images_file(metadata_dir)
+def download_relative_url(relative_url, output_base, provider='gcp',
+                          verbose=False, overwrite=False):
+    """
+    Download a URL to output_base, preserving the path relative to the common LILA root.
+    """
+
+    assert not relative_url.startswith('/')
+
+    # Not all datasets are available on Azure, fall back in these cases.  The decision
+    # to fall back to GCP rather than AWS is arbitrary.
+    if provider == 'azure':
+        nominal_provider = relative_url_to_nominal_provider[relative_url]
+        if nominal_provider != 'azure':
+            if verbose:
+                print('URL {} not available on Azure, falling back to GCP'.format(
+                    relative_url))
+            provider = 'gcp'
+
+    url = lila_base_urls[provider] + relative_url
+
+    result = {'status':'unknown','url':url,'destination_filename':None}
+
+    destination_filename = os.path.join(output_base,relative_url)
+    result['destination_filename'] = destination_filename
+
+    if ((os.path.isfile(destination_filename)) and (not overwrite)):
+        result['status'] = 'skipped'
+        return result
+    try:
+        download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
+    except Exception as e:
+        print('Warning: error downloading URL {}: {}'.format(
+            url,str(e)))
+        result['status'] = 'error: {}'.format(str(e))
+        return result
+
+    result['status'] = 'success'
+    return result


-#%% Download and parse the metadata file
+#%% Download and open the giant table of image URLs and labels

-metadata_table = read_lila_metadata(metadata_dir)
+# ~60 seconds to download, unzip, and open
+df = read_lila_all_images_file(metadata_dir)


 #%% Find all the images we want to download

-# Searching over the giant table can take a couple of minutes
+# ~2 minutes
+
+common_name_to_count = defaultdict(int)

 ds_name_to_urls = defaultdict(list)

@@ -77,6 +122,7 @@ def find_items(row):
     for species_name in species_of_interest:
         if species_name in row['common_name']:
             match = True
+            common_name_to_count[species_name] += 1
             break

     if match:
@@ -85,15 +131,19 @@ def find_items(row):
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)

+# We have a list of URLs for each dataset, flatten them all into a list of URLs
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]
 print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))

+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
 from copy import deepcopy
 ds_name_to_urls_raw = deepcopy(ds_name_to_urls)


-#%% Trim to a fixed number of URLs per dataset
+#%% Optionally trim to a fixed number of URLs per dataset

 if max_images_per_dataset is None:
     pass
@@ -104,47 +154,90 @@ else:
         ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)


-#%% Download those image files
+#%% Convert URLs to be relative to the common LILA base

-def download_relative_filename(url, output_base, verbose=False, url_base=None):
-    """
-    Download a URL to output_base, preserving relative path
-    """
-
-    if url_base is None:
-        url_base = '/'
-    assert url_base.startswith('/') and url_base.endswith('/')
-
-    p = urlparse(url)
-    relative_filename = str(p.path)
-    # remove the leading '/'
-    assert relative_filename.startswith(url_base)
-    relative_filename = relative_filename.replace(url_base,'',1)
-
-    destination_filename = os.path.join(output_base,relative_filename)
-    download_url(url, destination_filename, verbose=verbose)
-
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]

-url_base = '/'
+all_urls_relative = []

-# Convert Azure URLs to GCP URLs if necessary
-if image_download_source != 'azure':
-    assert image_download_source == 'gcp'
-    url_base = '/public-datasets-lila/'
-    all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
+# Each file has a nominal URL in the .csv file.  For now, the only thing this tells us
+# is that if the nominal URL isn't an Azure URL, the file isn't on Azure.  All files are on
+# GCP and AWS.
+#
+# Keep track of the nominal provider for each URL.
+relative_url_to_nominal_provider = {}
+
+for url in all_urls:
+    found_base = False
+    for provider in lila_base_urls.keys():
+        base = lila_base_urls[provider]
+        if url.startswith(base):
+            relative_url = url.replace(base,'')
+            all_urls_relative.append(relative_url)
+            relative_url_to_nominal_provider[relative_url] = provider
+            found_base = True
+            break
+    assert found_base
+
+assert len(all_urls) == len(all_urls_relative)
+
+
+#%% Download image files

-print('Downloading {} images with Python requests'.format(len(all_urls)))
+print('Downloading {} images on {} workers, preferred provider is {}'.format(
+    len(all_urls),n_download_threads,preferred_provider))

 if n_download_threads <= 1:

-    # url = all_urls[0]
-    for url in tqdm(all_urls):
-        download_relative_filename(url,output_dir,verbose=True,url_base=url_base)
+    results = []
+
+    # url_relative = all_urls_relative[0]
+    for url_relative in tqdm(all_urls_relative):
+        result = download_relative_url(url_relative,
+                                       output_base=output_dir,
+                                       provider=preferred_provider,
+                                       verbose=verbose)
+        results.append(result)

 else:

     pool = ThreadPool(n_download_threads)
-    tqdm(pool.imap(lambda s: download_relative_filename(s,output_dir,verbose=False,url_base=url_base),
-         all_urls), total=len(all_urls))
+    results = list(tqdm(pool.imap(lambda s: download_relative_url(
+        s,output_base=output_dir,provider=preferred_provider,verbose=verbose),
+        all_urls_relative), total=len(all_urls_relative)))
+
+
+#%% Scrap
+
+if False:
+
+    pass
+
+    #%% Find all the reptiles on LILA
+
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from md_utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))
+
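Note: the hunks above appear to come from data_management/lila/download_lila_subset.py. The new download_relative_url() helper returns a small status dictionary per file ('success', 'skipped', or 'error: ...') instead of raising, and the download loops now collect these into a results list that the script itself never inspects. A minimal sketch (not part of the diff) of how that list could be summarized afterwards, assuming only the script's own results variable:

    from collections import defaultdict

    # Tally the per-file statuses returned by download_relative_url;
    # collapse the per-URL error messages into a single 'error' bucket.
    status_to_count = defaultdict(int)
    for result in results:
        status = result['status']
        if status.startswith('error'):
            status = 'error'
        status_to_count[status] += 1

    for status in status_to_count:
        print('{}: {}'.format(status,status_to_count[status]))
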
@@ -22,6 +22,9 @@ import json
 import pandas as pd
 import numpy as np
 import dateparser
+import csv
+import urllib
+import urllib.request

 from collections import defaultdict
 from tqdm import tqdm
@@ -30,6 +33,9 @@ from data_management.lila.lila_common import read_lila_metadata, \
     read_metadata_file_for_dataset, \
     read_lila_taxonomy_mapping

+from md_utils import write_html_image_list
+from md_utils.path_utils import zip_file
+from md_utils.path_utils import open_file
 from md_utils.url_utils import download_url

 # We'll write images, metadata downloads, and temporary files here
@@ -56,7 +62,7 @@ ds_name_to_annotation_level['NACTI'] = 'unknown'

 known_unmapped_labels = set(['WCS Camera Traps:#ref!'])

-debug_max_images_per_dataset = 0
+debug_max_images_per_dataset = -1
 if debug_max_images_per_dataset > 0:
     print('Running in debug mode')
     output_file = output_file.replace('.csv','_debug.csv')
@@ -72,7 +78,7 @@ if False:
     metadata_table = {k:metadata_table[k]}


-#%% Download and extract metadata for the datasets we're interested in
+#%% Download and extract metadata for each dataset

 for ds_name in metadata_table.keys():
     metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
@@ -101,8 +107,6 @@ for i_row,row in taxonomy_df.iterrows():

 # Takes several hours

-import csv
-
 header = ['dataset_name','url','image_id','sequence_id','location_id','frame_num','original_label',\
           'scientific_name','common_name','datetime','annotation_level']

@@ -122,7 +126,7 @@ def clearnan(v):
     assert isinstance(v,str)
     return v

-with open(output_file,'w') as f:
+with open(output_file,'w',encoding='utf-8',newline='') as f:

     csv_writer = csv.writer(f)
     csv_writer.writerow(header)
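Note: the open() call above (apparently in data_management/lila/generate_lila_per_image_labels.py) now passes encoding='utf-8' and newline=''. The newline='' argument matters because csv.writer emits its own '\r\n' row terminators; without it, Python's newline translation can turn those into '\r\r\n' on Windows, which shows up as blank rows. A minimal, self-contained illustration ('example.csv' is a placeholder filename, not from the diff):

    import csv

    # newline='' lets csv.writer control row terminators itself
    with open('example.csv','w',encoding='utf-8',newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['dataset_name','url','image_id'])
        writer.writerow(['demo-dataset','https://example.com/image.jpg','0'])
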
@@ -334,6 +338,8 @@ with open(output_file,'w') as f:

 # ...with open()

+print('Processed {} datasets'.format(len(metadata_table)))
+


 #%% Read the .csv back
@@ -352,6 +358,8 @@ def isint(v):

 valid_annotation_levels = set(['sequence','image','unknown'])

+# Collect a list of locations within each dataset; we'll use this
+# in the next cell to look for datasets that only have a single location
 dataset_name_to_locations = defaultdict(set)

 def check_row(row):
@@ -386,6 +394,8 @@ else:

 #%% Check for datasets that have only one location string

+# Expected: ENA24, Missouri Camera Traps
+
 for ds_name in dataset_name_to_locations.keys():
     if len(dataset_name_to_locations[ds_name]) == 1:
         print('No location information for {}'.format(ds_name))
@@ -440,8 +450,8 @@ print('Selected {} total images'.format(len(images_to_download)))

 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)

-import urllib.request
-
+# TODO: trivially parallelizable
+#
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):

@@ -450,17 +460,17 @@ for i_image,image in tqdm(enumerate(images_to_down
     image_file = os.path.join(preview_folder,'image_{}'.format(str(i_image).zfill(4)) + ext)
     relative_file = os.path.relpath(image_file,preview_folder)
     try:
-        download_url(url,output_file,verbose=False)
+        download_url(url,image_file,verbose=False)
         image['relative_file'] = relative_file
     except urllib.error.HTTPError:
         print('Image {} does not exist ({}:{})'.format(
             i_image,image['dataset_name'],image['original_label']))
         image['relative_file'] = None

+# ...for each image we need to download

-#%% Write preview HTML

-from md_utils import write_html_image_list
+#%% Write preview HTML

 html_filename = os.path.join(preview_folder,'index.html')

@@ -475,19 +485,18 @@ for im in images_to_download:
     output_im = {}
     output_im['filename'] = im['relative_file']
     output_im['linkTarget'] = im['url']
-    output_im['title'] = str(im)
+    output_im['title'] = '<b>{}: {}</b><br/><br/>'.format(im['dataset_name'],im['original_label']) + str(im)
     output_im['imageStyle'] = 'width:600px;'
     output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
     html_images.append(output_im)

 write_html_image_list.write_html_image_list(html_filename,html_images)

-from md_utils.path_utils import open_file
 open_file(html_filename)


 #%% Zip output file

-from md_utils.path_utils import zip_file
+zipped_output_file = zip_file(output_file,verbose=True)

-zip_file(output_file,verbose=True)
+print('Zipped {} to {}'.format(output_file,zipped_output_file))
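Note: the '# TODO: trivially parallelizable' comment added above refers to the serial preview-download loop in this script. One way to address it would be the same ThreadPool pattern download_lila_subset.py now uses; the sketch below is illustrative only, assumes the script's existing preview_folder, images_to_download, and download_url names, and the derivation of the file extension from image['url'] is an assumption rather than something shown in the diff:

    import os
    import urllib.error
    from multiprocessing.pool import ThreadPool

    def download_preview_image(indexed_image):
        # Mirrors the serial loop: build a zero-padded local filename from the
        # image index, download the URL, and tolerate missing (404) images.
        i_image, image = indexed_image
        ext = os.path.splitext(image['url'])[1]
        image_file = os.path.join(preview_folder,'image_{}'.format(str(i_image).zfill(4)) + ext)
        try:
            download_url(image['url'],image_file,verbose=False)
            image['relative_file'] = os.path.relpath(image_file,preview_folder)
        except urllib.error.HTTPError:
            image['relative_file'] = None

    pool = ThreadPool(10)
    _ = list(pool.imap(download_preview_image,enumerate(images_to_download)))
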
@@ -34,18 +34,9 @@ os.makedirs(metadata_dir,exist_ok=True)

 output_file = os.path.join(output_dir,'lila_dataset_to_categories.json')

-# Created by get_lila_category_list.py... contains counts for each category
-category_list_dir = os.path.join(lila_local_base,'lila_categories_list')
-lila_dataset_to_categories_file = os.path.join(category_list_dir,'lila_dataset_to_categories.json')
-
-assert os.path.isfile(lila_dataset_to_categories_file)
-

 #%% Load category and taxonomy files

-with open(lila_dataset_to_categories_file,'r') as f:
-    lila_dataset_to_categories = json.load(f)
-

 taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)

@@ -55,9 +46,13 @@ ds_query_to_scientific_name = {}

 unmapped_queries = set()

+datasets_with_taxonomy_mapping = set()
+
 # i_row = 1; row = taxonomy_df.iloc[i_row]; row
 for i_row,row in taxonomy_df.iterrows():

+    datasets_with_taxonomy_mapping.add(row['dataset_name'])
+
     ds_query = row['dataset_name'] + ':' + row['query']
     ds_query = ds_query.lower()

68
63
 
69
64
  ds_query_to_scientific_name[ds_query] = row['scientific_name']
70
65
 
66
+ print('Loaded taxonomy mappings for {} datasets'.format(len(datasets_with_taxonomy_mapping)))
71
67
 
68
+
72
69
  #%% Download and parse the metadata file
73
70
 
74
71
  metadata_table = read_lila_metadata(metadata_dir)
75
72
 
73
+ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
74
+
76
75
 
77
- #%% Download and extract metadata for the datasets we're interested in
76
+ #%% Download and extract metadata for each dataset
78
77
 
79
78
  for ds_name in metadata_table.keys():
80
79
  metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
@@ -91,6 +90,11 @@ dataset_to_categories = {}
 # ds_name = 'NACTI'
 for ds_name in metadata_table.keys():

+    taxonomy_mapping_available = (ds_name in datasets_with_taxonomy_mapping)
+
+    if not taxonomy_mapping_available:
+        print('Warning: taxonomy mapping not available for {}'.format(ds_name))
+
     print('Finding categories in {}'.format(ds_name))

     json_filename = metadata_table[ds_name]['json_filename']
@@ -122,6 +126,8 @@ for ds_name in metadata_table.keys():
         # always redundant with the class-level data sets.
         if 'bbox' in ds_name:
             c['scientific_name_from_taxonomy_mapping'] = None
+        elif not taxonomy_mapping_available:
+            c['scientific_name_from_taxonomy_mapping'] = None
         else:
             taxonomy_query_string = ds_name.lower().strip() + ':' + c['name'].lower()
             if taxonomy_query_string not in ds_query_to_scientific_name:
@@ -21,7 +21,7 @@ from md_utils.path_utils import unzip_file

 # LILA camera trap primary metadata file
 lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
-lila_taxonomy_mapping_url = 'https://lila.science/wp-content/uploads/2022/07/lila-taxonomy-mapping_release.csv'
+lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
 lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'

 wildlife_insights_page_size = 30000
@@ -31,9 +31,13 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
 wildlife_insights_taxonomy_local_csv_filename = \
     wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

-lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
-gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
-gcp_bucket_gs_url = 'gs://public-datasets-lila'
+# Filenames are consistent across clouds relative to these URLs
+lila_base_urls = {
+    'azure':'https://lilablobssc.blob.core.windows.net/',
+    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+}
+


 #%% Common functions
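Note: this hunk (apparently from data_management/lila/lila_common.py) replaces the separate Azure/GCP constants with the lila_base_urls dictionary; because file paths are consistent across clouds, the same relative path can be appended to any of the three bases. A minimal sketch of that idea (the relative path below is a made-up placeholder, not a real LILA file):

    from data_management.lila.lila_common import lila_base_urls

    # Hypothetical relative path, for illustration only
    relative_url = 'some-dataset/images/0001.jpg'

    for provider in lila_base_urls:
        base = lila_base_urls[provider]
        assert base.endswith('/')
        print('{}: {}'.format(provider,base + relative_url))
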
@@ -165,16 +169,18 @@ def read_lila_all_images_file(metadata_dir):
     return df


-def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None):
+def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json_url=None):
     """
     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
     Returns the .json filename on the local disk.
     """

-    if metadata_table is None:
-        metadata_table = read_lila_metadata(metadata_dir)
+    if json_url is None:

-    json_url = metadata_table[ds_name]['metadata_url']
+        if metadata_table is None:
+            metadata_table = read_lila_metadata(metadata_dir)
+
+        json_url = metadata_table[ds_name]['metadata_url']

     p = urlparse(json_url)
     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
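Note: the new json_url parameter lets callers bypass the metadata table and download an arbitrary per-dataset .json file, which is how the new test_lila_metadata_urls.py (below) fetches MegaDetector results files. A minimal sketch of the two call patterns; the dataset name 'Caltech Camera Traps' is illustrative, while the 'mdv5a_results_raw' key appears in the test script below:

    import os
    from data_management.lila.lila_common import read_lila_metadata, read_metadata_file_for_dataset

    metadata_dir = os.path.expanduser('~/lila/metadata')
    metadata_table = read_lila_metadata(metadata_dir)

    # Original behavior: resolve the metadata URL from the table
    json_filename = read_metadata_file_for_dataset(ds_name='Caltech Camera Traps',
                                                   metadata_dir=metadata_dir,
                                                   metadata_table=metadata_table)

    # New behavior: pass an explicit URL, e.g. a MegaDetector results file
    md_results_filename = read_metadata_file_for_dataset(
        ds_name='Caltech Camera Traps',
        metadata_dir=metadata_dir,
        json_url=metadata_table['Caltech Camera Traps']['mdv5a_results_raw'])
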
@@ -196,27 +202,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None):
     return json_filename


-def azure_url_to_gcp_http_url(url):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP http:// url.
-    """
-
-    assert url.startswith(lila_azure_storage_account)
-    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-    return gcp_url
-
-
-def azure_url_to_gcp_gs_url(url):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP gs:// url.
-    """
-
-    return azure_url_to_gcp_http_url(url).replace(gcp_bucket_api_url,
-                                                  gcp_bucket_gs_url,1)
-
-
 #%% Interactive test driver

 if False:
@@ -249,16 +234,4 @@ if False:
             urls_to_test.append(ds_info['bbox_url'])

     status_codes = url_utils.test_urls(urls_to_test)
-
-
-    #%% Verify that the GCP versions of all metadata files exist
-
-    gcp_urls = []
-
-    # url = urls_to_test[0]
-    for url in urls_to_test:
-        assert url.startswith(lila_azure_storage_account)
-        gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-        gcp_urls.append(gcp_url)
-
-    status_codes = url_utils.test_urls(gcp_urls)
+
@@ -0,0 +1,116 @@
+########
+#
+# test_lila_metadata_urls.py
+#
+# Test that all the metadata URLs for LILA camera trap datasets are valid, and
+# test that at least one image within each URL is valid, including MegaDetector results
+# files.
+#
+########
+
+#%% Constants and imports
+
+import json
+import os
+
+from data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404)
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url']
+    assert not image_base_url.endswith('/')
+    # Download a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from md_utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,error_on_failure=False)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))