megadetector 5.0.5__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only.

Potentially problematic release.


This version of megadetector might be problematic.

Files changed (132)
  1. api/batch_processing/data_preparation/manage_local_batch.py +302 -263
  2. api/batch_processing/data_preparation/manage_video_batch.py +81 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
  5. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  6. api/batch_processing/postprocessing/load_api_results.py +56 -70
  7. api/batch_processing/postprocessing/md_to_coco.py +1 -1
  8. api/batch_processing/postprocessing/md_to_labelme.py +2 -1
  9. api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  11. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  12. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  13. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
  14. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  15. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  16. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
  17. classification/prepare_classification_script.py +191 -191
  18. data_management/coco_to_yolo.py +68 -45
  19. data_management/databases/integrity_check_json_db.py +7 -5
  20. data_management/generate_crops_from_cct.py +3 -3
  21. data_management/get_image_sizes.py +8 -6
  22. data_management/importers/add_timestamps_to_icct.py +79 -0
  23. data_management/importers/animl_results_to_md_results.py +160 -0
  24. data_management/importers/auckland_doc_test_to_json.py +4 -4
  25. data_management/importers/auckland_doc_to_json.py +1 -1
  26. data_management/importers/awc_to_json.py +5 -5
  27. data_management/importers/bellevue_to_json.py +5 -5
  28. data_management/importers/carrizo_shrubfree_2018.py +5 -5
  29. data_management/importers/carrizo_trail_cam_2017.py +5 -5
  30. data_management/importers/cct_field_adjustments.py +2 -3
  31. data_management/importers/channel_islands_to_cct.py +4 -4
  32. data_management/importers/ena24_to_json.py +5 -5
  33. data_management/importers/helena_to_cct.py +10 -10
  34. data_management/importers/idaho-camera-traps.py +12 -12
  35. data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
  36. data_management/importers/jb_csv_to_json.py +4 -4
  37. data_management/importers/missouri_to_json.py +1 -1
  38. data_management/importers/noaa_seals_2019.py +1 -1
  39. data_management/importers/pc_to_json.py +5 -5
  40. data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
  41. data_management/importers/prepare_zsl_imerit.py +5 -5
  42. data_management/importers/rspb_to_json.py +4 -4
  43. data_management/importers/save_the_elephants_survey_A.py +5 -5
  44. data_management/importers/save_the_elephants_survey_B.py +6 -6
  45. data_management/importers/snapshot_safari_importer.py +9 -9
  46. data_management/importers/snapshot_serengeti_lila.py +9 -9
  47. data_management/importers/timelapse_csv_set_to_json.py +5 -7
  48. data_management/importers/ubc_to_json.py +4 -4
  49. data_management/importers/umn_to_json.py +4 -4
  50. data_management/importers/wellington_to_json.py +1 -1
  51. data_management/importers/wi_to_json.py +2 -2
  52. data_management/importers/zamba_results_to_md_results.py +181 -0
  53. data_management/labelme_to_coco.py +35 -7
  54. data_management/labelme_to_yolo.py +229 -0
  55. data_management/lila/add_locations_to_island_camera_traps.py +1 -1
  56. data_management/lila/add_locations_to_nacti.py +147 -0
  57. data_management/lila/create_lila_blank_set.py +474 -0
  58. data_management/lila/create_lila_test_set.py +2 -1
  59. data_management/lila/create_links_to_md_results_files.py +106 -0
  60. data_management/lila/download_lila_subset.py +46 -21
  61. data_management/lila/generate_lila_per_image_labels.py +23 -14
  62. data_management/lila/get_lila_annotation_counts.py +17 -11
  63. data_management/lila/lila_common.py +14 -11
  64. data_management/lila/test_lila_metadata_urls.py +116 -0
  65. data_management/ocr_tools.py +829 -0
  66. data_management/resize_coco_dataset.py +13 -11
  67. data_management/yolo_output_to_md_output.py +84 -12
  68. data_management/yolo_to_coco.py +38 -20
  69. detection/process_video.py +36 -14
  70. detection/pytorch_detector.py +23 -8
  71. detection/run_detector.py +76 -19
  72. detection/run_detector_batch.py +178 -63
  73. detection/run_inference_with_yolov5_val.py +326 -57
  74. detection/run_tiled_inference.py +153 -43
  75. detection/video_utils.py +34 -8
  76. md_utils/ct_utils.py +172 -1
  77. md_utils/md_tests.py +372 -51
  78. md_utils/path_utils.py +167 -39
  79. md_utils/process_utils.py +26 -7
  80. md_utils/split_locations_into_train_val.py +215 -0
  81. md_utils/string_utils.py +10 -0
  82. md_utils/url_utils.py +0 -2
  83. md_utils/write_html_image_list.py +9 -26
  84. md_visualization/plot_utils.py +12 -8
  85. md_visualization/visualization_utils.py +106 -7
  86. md_visualization/visualize_db.py +16 -8
  87. md_visualization/visualize_detector_output.py +208 -97
  88. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
  89. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
  90. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  91. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  92. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  93. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  94. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  95. taxonomy_mapping/species_lookup.py +33 -13
  96. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  97. api/synchronous/api_core/yolov5/detect.py +0 -252
  98. api/synchronous/api_core/yolov5/export.py +0 -607
  99. api/synchronous/api_core/yolov5/hubconf.py +0 -146
  100. api/synchronous/api_core/yolov5/models/__init__.py +0 -0
  101. api/synchronous/api_core/yolov5/models/common.py +0 -738
  102. api/synchronous/api_core/yolov5/models/experimental.py +0 -104
  103. api/synchronous/api_core/yolov5/models/tf.py +0 -574
  104. api/synchronous/api_core/yolov5/models/yolo.py +0 -338
  105. api/synchronous/api_core/yolov5/train.py +0 -670
  106. api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
  107. api/synchronous/api_core/yolov5/utils/activations.py +0 -103
  108. api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
  109. api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
  110. api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
  111. api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
  112. api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
  113. api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
  114. api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
  115. api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
  116. api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
  117. api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
  118. api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
  119. api/synchronous/api_core/yolov5/utils/general.py +0 -1018
  120. api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
  121. api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
  122. api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
  123. api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
  124. api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
  125. api/synchronous/api_core/yolov5/utils/loss.py +0 -234
  126. api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
  127. api/synchronous/api_core/yolov5/utils/plots.py +0 -489
  128. api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
  129. api/synchronous/api_core/yolov5/val.py +0 -394
  130. md_utils/matlab_porting_tools.py +0 -97
  131. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  132. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,474 @@
+ ########
+ #
+ # create_lila_blank_set.py
+ #
+ # Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
+ # locations will be oversampled relative to more common locations. We'll also run MegaDetector
+ # to minimize the chance that incorrectly-labeled non-empty images sneak into our blank set.
+ #
+ ########
+
+ #%% Constants and imports
+
+ import os
+ import random
+ import math
+ import json
+ import shutil
+
+ import numpy as np
+ from tqdm import tqdm
+ from multiprocessing.pool import ThreadPool
+ from urllib.parse import urlparse
+ from collections import defaultdict
+
+ from data_management.lila.lila_common import \
+     read_lila_all_images_file, azure_url_to_gcp_http_url
+ from md_utils.url_utils import download_url
+ from md_visualization import visualization_utils as vis_utils
+ from md_utils.path_utils import recursive_file_list
+
+ # We'll write images, metadata downloads, and temporary files here
+ lila_local_base = os.path.expanduser('~/lila')
+
+ metadata_dir = os.path.join(lila_local_base,'metadata')
+ os.makedirs(metadata_dir,exist_ok=True)
+
+ project_base = os.path.join(lila_local_base,'lila_blanks')
+
+ candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
+ os.makedirs(candidate_blanks_base,exist_ok=True)
+
+ confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
+ os.makedirs(confirmed_blanks_base,exist_ok=True)
+
+ md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
+ os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
+
+ preferred_image_download_source = 'gcp'
+
+ # Number of concurrent download threads
+ n_download_threads = 20
+
+ n_blanks = 100000
+
+ random.seed(0)
+
+
+ #%% Download and open the giant table of image URLs and labels
+
+ # ~60 seconds to download, unzip, and open
+ df = read_lila_all_images_file(metadata_dir)
+
+
+ #%% Explore blank labels
+
+ # Original labels we're treating as blank:
+ blank_original_labels = (
+     'empty','misfire'
+ )
+
+ # Some notable original labels we're *not* treating as blank:
+ nonblank_original_labels = (
+     'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
+     'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
+     'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
+     'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
+ )
+
+ other_labels_without_common_names = (
+     'car', 'motorcycle', 'vehicle'
+ )
+
+ common_names = sorted(list(df['common_name'].unique()),
+     key=lambda x:str(x) if isinstance(x,float) else x)
+ original_labels = sorted(list(df['original_label'].unique()),
+     key=lambda x:str(x) if isinstance(x,float) else x)
+
+ # Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
+ assert '' not in common_names
+ assert all([s not in common_names for s in blank_original_labels])
+ assert all([s not in common_names for s in nonblank_original_labels])
+ assert np.nan in common_names
+
+ # Blanks are represented as "empty" or "misfire" in the "original_label" column
+ assert all([s in original_labels for s in blank_original_labels])
+ assert all([s in original_labels for s in nonblank_original_labels])
+ assert all([s in original_labels for s in other_labels_without_common_names])
+ assert all([s not in original_labels for s in ('','blank','none',np.nan)])
+
+
+ #%% Count empty labels and common names
+
+ common_names_with_empty_original_labels = set()
+ original_labels_with_nan_common_names = set()
+
+ common_name_to_count = defaultdict(int)
+ original_label_to_count = defaultdict(int)
+
+ # This loop takes ~10 mins
+ for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+     common_name = row['common_name']
+     original_label = row['original_label']
+
+     if isinstance(common_name,float):
+         assert np.isnan(common_name)
+         original_labels_with_nan_common_names.add(original_label)
+
+     common_name = str(common_name)
+
+     assert isinstance(original_label,str)
+     if original_label in blank_original_labels:
+         common_names_with_empty_original_labels.add(common_name)
+     common_name_to_count[common_name] += 1
+     original_label_to_count[original_label] += 1
+
+
+ #%% Look at the most common labels and common names
+
+ from md_utils.ct_utils import sort_dictionary_by_value
+ common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+ original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)
+
+ k = 10
+
+ print('\nMost frequent common names:\n')
+
+ i_label = 0
+ for i_label,s in enumerate(common_name_to_count):
+     if i_label >= k:
+         break
+     print('{}: {}'.format(s,common_name_to_count[s]))
+
+ print('\nMost frequent original labels:\n')
+
+ i_label = 0
+ for i_label,s in enumerate(original_label_to_count):
+     if i_label >= k:
+         break
+     print('{}: {}'.format(s,original_label_to_count[s]))
+
+
+ #%% Do some consistency checks over the empty labels and stats
+
+ # All images called 'empty' should have NaN as their common name
+ assert (len(common_names_with_empty_original_labels) == 1)
+ assert next(iter(common_names_with_empty_original_labels)) == 'nan'
+
+ # 'empty' should be the most frequent original label overall
+ assert next(iter(original_label_to_count)) == 'empty'
+
+ # NaN should be the most frequent common name overall
+ assert next(iter(common_name_to_count)) == 'nan'
+
+ for s in original_labels_with_nan_common_names:
+     assert \
+         (s in blank_original_labels) or \
+         (s in nonblank_original_labels) or \
+         (s in other_labels_without_common_names)
+
+
+ #%% Map locations to blank images
+
+ location_to_blank_image_urls_cache_file = os.path.join(project_base,
+     'location_to_blank_image_urls.json')
+
+ force_map_locations = False
+
+ # Load from .json if available
+ if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):
+
+     with open(location_to_blank_image_urls_cache_file,'r') as f:
+         location_to_blank_image_urls = json.load(f)
+
+ else:
+
+     location_to_blank_image_urls = defaultdict(list)
+
+     # i_row = 0; row = df.iloc[i_row]
+     for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+         location_id = row['location_id']
+         url = row['url']
+
+         original_label = row['original_label']
+         if original_label in blank_original_labels:
+             assert np.isnan(row['common_name'])
+             location_to_blank_image_urls[location_id].append(url)
+
+     with open(location_to_blank_image_urls_cache_file,'w') as f:
+         json.dump(location_to_blank_image_urls,f,indent=1)
+
+ n_locations_with_blanks = len(location_to_blank_image_urls)
+ print('Found {} locations with blank images'.format(n_locations_with_blanks))
+
+
+ #%% Sample blanks
+
+ random.seed(0)
+
+ # Make a fresh copy of the lists
+ location_to_unsampled_blank_image_urls = {}
+
+ # location = next(iter(location_to_blank_image_urls.keys()))
+ for location in location_to_blank_image_urls:
+     blank_image_urls_this_location = location_to_blank_image_urls[location]
+     unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
+     location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location
+
+ # Put locations in a random order
+ location_ids = list(location_to_unsampled_blank_image_urls.keys())
+ random.shuffle(location_ids)
+
+ blank_urls = []
+ location_to_sampled_blanks = defaultdict(list)
+ fully_sampled_locations = set()
+
+ # Pick from each location until we hit our limit or have no blanks left
+ while(True):
+
+     found_sample = False
+
+     # location = location_ids[0]
+     for location in location_ids:
+
+         unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
+         if len(unsampled_images_this_location) == 0:
+             fully_sampled_locations.add(location)
+             continue
+
+         url = random.choice(unsampled_images_this_location)
+         blank_urls.append(url)
+         location_to_unsampled_blank_image_urls[location].remove(url)
+         location_to_sampled_blanks[location].append(url)
+         found_sample = True
+
+         if len(blank_urls) == n_blanks:
+             break
+
+     # ...for each location
+
+     if not found_sample:
+         print('Terminating after {} blanks, we ran out before hitting {}'.format(
+             len(blank_urls),n_blanks))
+
+     if len(blank_urls) == n_blanks:
+         break
+
+ # ...while(True)
+
+ assert len(blank_urls) <= n_blanks
+ min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
+ max_blanks_per_location = -1
+ for location in location_to_sampled_blanks:
+     n_blanks_this_location = len(location_to_sampled_blanks[location])
+     if n_blanks_this_location >= max_blanks_per_location:
+         max_blanks_per_location = n_blanks_this_location
+     assert (location in fully_sampled_locations) or \
+         n_blanks_this_location >= min_blanks_per_location
+
+ print('Choose {} blanks from {} locations'.format(n_blanks,len(location_ids)))
+ print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
+ print('Max samples per location: {}'.format(max_blanks_per_location))
+
+
+ #%% Download those image files (prep)
+
+ container_to_url_base = {
+     'lilablobssc.blob.core.windows.net':'/',
+     'storage.googleapis.com':'/public-datasets-lila/'
+ }
+
+ def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
+     """
+     Download a URL to output_base, preserving relative path
+     """
+
+     result = {'status':'unknown','url':url,'destination_filename':None}
+
+     if url_base is None:
+         assert url.startswith('https://')
+         container = url.split('/')[2]
+         assert container in container_to_url_base
+         url_base = container_to_url_base[container]
+
+     assert url_base.startswith('/') and url_base.endswith('/')
+
+     p = urlparse(url)
+     relative_filename = str(p.path)
+     # remove the leading '/'
+     assert relative_filename.startswith(url_base)
+     relative_filename = relative_filename.replace(url_base,'',1)
+
+     destination_filename = os.path.join(output_base,relative_filename)
+     result['destination_filename'] = destination_filename
+
+     if ((os.path.isfile(destination_filename)) and (not overwrite)):
+         result['status'] = 'skipped'
+         return result
+     try:
+         download_url(url, destination_filename, verbose=verbose)
+     except Exception as e:
+         print('Warning: error downloading URL {}: {}'.format(
+             url,str(e)))
+         result['status'] = 'error: {}'.format(str(e))
+         return result
+
+     result['status'] = 'success'
+     return result
+
+ # Convert Azure URLs to GCP URLs if necessary
+ if preferred_image_download_source != 'azure':
+     assert preferred_image_download_source == 'gcp'
+     blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]
+
+
+ #%% Download those image files (execution)
+
+ print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))
+
+ if n_download_threads <= 1:
+
+     results = []
+
+     # url = all_urls[0]
+     for url in tqdm(blank_urls):
+         results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))
+
+ else:
+
+     pool = ThreadPool(n_download_threads)
+     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
+         s,candidate_blanks_base,url_base=None),
+         blank_urls), total=len(blank_urls)))
+
+     # pool.terminate()
+
+
+ #%% Review results
+
+ error_urls = []
+ for r in results:
+     if r['status'] != 'success':
+         error_urls.append(r['url'])
+
+ print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
+
+
+ #%% Run MegaDetector on the folder
+
+ md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+ cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
+     candidate_blanks_base,md_results_file)
+ cmd += ' --recursive --output_relative_filenames'
+
+ import clipboard; clipboard.copy(cmd); print(cmd)
+
+
+ #%% Review MD results that suggests images are non-empty
+
+ assert os.path.isfile(md_results_file)
+
+ category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
+ min_threshold = min(category_name_to_threshold.values())
+ with open(md_results_file,'r') as f:
+     md_results = json.load(f)
+
+ images_to_review_to_detections = {}
+
+ category_id_to_threshold = {}
+ for category_id in md_results['detection_categories']:
+     category_name = md_results['detection_categories'][category_id]
+     category_id_to_threshold[category_id] = category_name_to_threshold[category_name]
+
+ # im = md_results['images'][0]
+ for im in md_results['images']:
+
+     if 'detections' not in im:
+         continue
+
+     found_object = False
+     for det in im['detections']:
+         threshold = category_id_to_threshold[det['category']]
+         if det['conf'] >= threshold:
+             found_object = True
+             break
+     if found_object:
+         images_to_review_to_detections[im['file']] = im['detections']
+
+ print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),len(md_results['images'])))
+
+ output_file_to_source_file = {}
+
+ # i_fn = 0; source_file_relative = images_to_review[i_fn]
+ for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
+         total=len(images_to_review_to_detections)):
+
+     source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
+     assert os.path.isfile(source_file_abs)
+     ext = os.path.splitext(source_file_abs)[1]
+     target_file_relative = str(i_fn).zfill(8) + ext
+     target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
+     output_file_to_source_file[target_file_relative] = source_file_relative
+     # shutil.copyfile(source_file_abs,target_file_abs)
+     vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
+         output_file=target_file_abs,
+         detections=images_to_review_to_detections[source_file_relative],
+         confidence_threshold=min_threshold,
+         target_size=(1280,-1))
+
+ with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
+     json.dump(output_file_to_source_file,f,indent=1)
+
+
+ #%% Manual review
+
+ # Delete images that are *not* empty
+
+
+ #%% Figure out which images are still there; these are the actually-blank ones
+
+ remaining_images = set(os.listdir(md_possible_non_blanks_folder))
+ print('Kept {} of {} candidate blank images'.format(len(remaining_images),
+     len(images_to_review_to_detections)))
+
+ removed_blank_images_relative = []
+
+ # output_file = next(iter(output_file_to_source_file.keys()))
+ for output_file in tqdm(output_file_to_source_file.keys()):
+     if output_file not in remaining_images:
+         source_file_relative = output_file_to_source_file[output_file]
+         removed_blank_images_relative.append(source_file_relative)
+
+ assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
+
+
+ #%% Copy all the confirmed blanks to the confirmed folder
+
+ all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
+ print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
+
+ for source_fn_relative in tqdm(all_candidate_blanks):
+     source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
+     assert os.path.isfile(source_fn_abs)
+     target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
+     os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
+     shutil.copyfile(source_fn_abs,target_fn_abs)
+
+
+ #%% Record location information for each file
+
+ fn_relative_to_location = {}
+ for location in location_to_blank_image_urls:
+     urls_this_location = location_to_blank_image_urls[location]
+     for url in urls_this_location:
+         fn_relative = url.split('//')[1]
+         fn_relative_to_location[fn_relative] = location
+
+ all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+ print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
+
+ for fn_relative in all_confirmed_blanks:
+     assert fn_relative in fn_relative_to_location
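The sampling cell above balances locations by repeatedly sweeping the shuffled location list and drawing at most one blank per location per pass, so rare locations are fully represented before common locations dominate. A minimal standalone sketch of that round-robin strategy, using hypothetical toy data in place of the LILA table:

import random
from collections import defaultdict

# Hypothetical toy data: location ID -> blank image URLs at that location
location_to_urls = {
    'loc_common': ['a.jpg','b.jpg','c.jpg','d.jpg','e.jpg'],
    'loc_rare': ['f.jpg'],
    'loc_medium': ['g.jpg','h.jpg']
}
n_target = 4

random.seed(0)
unsampled = {loc:urls.copy() for loc,urls in location_to_urls.items()}
location_ids = list(unsampled.keys())
random.shuffle(location_ids)

sampled = defaultdict(list)
selected_urls = []

# Each pass takes at most one image from every location that still has
# unsampled blanks, so rare locations are picked before common ones take over
while len(selected_urls) < n_target:
    found_sample = False
    for loc in location_ids:
        if len(selected_urls) >= n_target:
            break
        if len(unsampled[loc]) == 0:
            continue
        url = random.choice(unsampled[loc])
        unsampled[loc].remove(url)
        sampled[loc].append(url)
        selected_urls.append(url)
        found_sample = True
    if not found_sample:
        # All locations are exhausted; stop even if we fell short of the target
        break

print({loc:len(urls) for loc,urls in sampled.items()})

With these toy numbers, every location contributes at least one image and none contributes more than two, mirroring the balance the script checks via min_blanks_per_location and max_blanks_per_location.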
@@ -124,6 +124,8 @@ for ds_name in metadata_table.keys():

  #%% Download those image files

+ # TODO: trivially parallelizable
+ #
  # ds_name = (list(metadata_table.keys()))[0]
  for ds_name in metadata_table.keys():

@@ -147,4 +149,3 @@ for ds_name in metadata_table.keys():
      # ...for each url

  # ...for each dataset
-
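The TODO added above notes that this download loop is trivially parallelizable. A hedged sketch of one way to do that with a thread pool, mirroring the ThreadPool pattern used in create_lila_blank_set.py; download_image and the URL list are hypothetical stand-ins for the real loop body and metadata:

from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def download_image(url):
    # Hypothetical stand-in for the per-URL download logic in the loop above;
    # in real use this would fetch [url] to a local file and return a status record
    return {'url':url,'status':'success'}

urls_to_download = ['https://example.com/image_{}.jpg'.format(i) for i in range(10)]
n_download_threads = 20

if n_download_threads <= 1:
    results = [download_image(url) for url in tqdm(urls_to_download)]
else:
    pool = ThreadPool(n_download_threads)
    results = list(tqdm(pool.imap(download_image,urls_to_download), total=len(urls_to_download)))

A thread pool (rather than a process pool) is a reasonable default here because the work is I/O-bound.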
@@ -0,0 +1,106 @@
+ ########
+ #
+ # create_links_to_md_results_files.py
+ #
+ # One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+ #
+ ########
+
+ #%% Imports and constants
+
+ import os
+
+ import pandas as pd
+
+ input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+ output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+ md_results_local_folder = r'g:\temp\lila-md-results'
+ md_base_url = 'https://lila.science/public/lila-md-results/'
+ assert md_base_url.endswith('/')
+
+ # No RDE files for datasets with no location information
+ datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+ md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+ validate_urls = False
+
+
+ #%% Read input data
+
+ df = pd.read_csv(input_csv_file)
+ for s in md_results_column_names:
+     df[s] = ''
+
+
+ #%% Find matching files locally, and create URLs
+
+ local_files = os.listdir(md_results_local_folder)
+ local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     if not isinstance(row['name'],str):
+         continue
+
+     dataset_shortname = row['short_name']
+     matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+     # No RDE files for datasets with no location information
+     if dataset_shortname in datasets_without_location_info:
+         assert len(matching_files) == 2
+         mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+         mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+         assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+         df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+         df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+     else:
+         # Exclude single-season files for snapshot-serengeti
+         if dataset_shortname == 'snapshot-serengeti':
+             matching_files = [fn for fn in matching_files if '_S' not in fn]
+             assert len(matching_files) == 2
+             assert all(['mdv4' in fn for fn in matching_files])
+             rde_files = [fn for fn in matching_files if 'rde' in fn]
+             raw_files = [fn for fn in matching_files if 'rde' not in fn]
+             assert len(rde_files) == 1 and len(raw_files) == 1
+             df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+         else:
+             assert len(matching_files) == 3
+             mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+             mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+             rde_files = [fn for fn in matching_files if 'rde' in fn]
+             assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+             df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+             df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+     print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+ # ...for each row
+
+
+ #%% Validate URLs
+
+ if validate_urls:
+
+     from md_utils.url_utils import test_urls
+
+     urls = set()
+
+     for i_row,row in df.iterrows():
+         for column_name in md_results_column_names:
+             if len(row[column_name]) > 0:
+                 assert row[column_name] not in urls
+                 urls.add(row[column_name])
+
+     test_urls(urls,error_on_failure=True)
+
+     print('Validated {} URLs'.format(len(urls)))
+
+
+ #%% Write new .csv file
+
+ df.to_csv(output_csv_file,header=True,index=False)
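For reference, the per-dataset matching above is a substring partition over local zip filenames. A toy sketch of that partition for one hypothetical dataset shortname and invented filenames, outside the pandas loop:

# Hypothetical filenames in md_results_local_folder for one dataset
local_files = [
    'caltech-camera-traps_mdv5a.0.0_results.zip',
    'caltech-camera-traps_mdv5b.0.0_results.zip',
    'caltech-camera-traps_mdv5a.0.0_results_rde.zip'
]
dataset_shortname = 'caltech-camera-traps'
md_base_url = 'https://lila.science/public/lila-md-results/'

matching_files = [fn for fn in local_files if dataset_shortname in fn]
mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
rde_files = [fn for fn in matching_files if 'rde' in fn]
assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1

mdv5a_results_raw = md_base_url + mdv5a_files[0]
print(mdv5a_results_raw)

Because the match itself is just a substring test, the assertions are what catch missing or unexpected result files for a dataset.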