megadetector 5.0.6__py3-none-any.whl → 5.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- api/batch_processing/data_preparation/manage_local_batch.py +278 -197
- api/batch_processing/data_preparation/manage_video_batch.py +7 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/compare_batch_results.py +110 -60
- api/batch_processing/postprocessing/load_api_results.py +55 -69
- api/batch_processing/postprocessing/md_to_labelme.py +1 -0
- api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- classification/prepare_classification_script.py +191 -191
- data_management/coco_to_yolo.py +65 -44
- data_management/databases/integrity_check_json_db.py +7 -5
- data_management/generate_crops_from_cct.py +1 -1
- data_management/importers/animl_results_to_md_results.py +2 -2
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/importers/zamba_results_to_md_results.py +2 -2
- data_management/labelme_to_coco.py +34 -6
- data_management/labelme_to_yolo.py +1 -1
- data_management/lila/create_lila_blank_set.py +474 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +1 -1
- data_management/lila/download_lila_subset.py +46 -21
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +16 -10
- data_management/lila/lila_common.py +14 -11
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/resize_coco_dataset.py +12 -10
- data_management/yolo_output_to_md_output.py +40 -13
- data_management/yolo_to_coco.py +34 -21
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +1 -1
- detection/run_detector.py +73 -18
- detection/run_detector_batch.py +104 -24
- detection/run_inference_with_yolov5_val.py +127 -26
- detection/run_tiled_inference.py +153 -43
- detection/video_utils.py +3 -1
- md_utils/ct_utils.py +79 -3
- md_utils/md_tests.py +253 -15
- md_utils/path_utils.py +129 -24
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +0 -2
- md_utils/write_html_image_list.py +1 -0
- md_visualization/visualization_utils.py +17 -2
- md_visualization/visualize_db.py +8 -0
- md_visualization/visualize_detector_output.py +185 -104
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0

data_management/lila/create_lila_blank_set.py (new file, +474 -0)

@@ -0,0 +1,474 @@
+########
+#
+# create_lila_blank_set.py
+#
+# Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
+# locations will be oversampled relative to more common locations. We'll also run MegaDetector
+# to minimize the chance that incorrectly-labeled non-empty images sneak into our blank set.
+#
+########
+
+#%% Constants and imports
+
+import os
+import random
+import math
+import json
+import shutil
+
+import numpy as np
+from tqdm import tqdm
+from multiprocessing.pool import ThreadPool
+from urllib.parse import urlparse
+from collections import defaultdict
+
+from data_management.lila.lila_common import \
+    read_lila_all_images_file, azure_url_to_gcp_http_url
+from md_utils.url_utils import download_url
+from md_visualization import visualization_utils as vis_utils
+from md_utils.path_utils import recursive_file_list
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+project_base = os.path.join(lila_local_base,'lila_blanks')
+
+candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
+os.makedirs(candidate_blanks_base,exist_ok=True)
+
+confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
+os.makedirs(confirmed_blanks_base,exist_ok=True)
+
+md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
+os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
+
+preferred_image_download_source = 'gcp'
+
+# Number of concurrent download threads
+n_download_threads = 20
+
+n_blanks = 100000
+
+random.seed(0)
+
+
+#%% Download and open the giant table of image URLs and labels
+
+# ~60 seconds to download, unzip, and open
+df = read_lila_all_images_file(metadata_dir)
+
+
+#%% Explore blank labels
+
+# Original labels we're treating as blank:
+blank_original_labels = (
+    'empty','misfire'
+    )
+
+# Some notable original labels we're *not* treating as blank:
+nonblank_original_labels = (
+    'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
+    'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
+    'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
+    'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
+    )
+
+other_labels_without_common_names = (
+    'car', 'motorcycle', 'vehicle'
+    )
+
+common_names = sorted(list(df['common_name'].unique()),
+                      key=lambda x:str(x) if isinstance(x,float) else x)
+original_labels = sorted(list(df['original_label'].unique()),
+                         key=lambda x:str(x) if isinstance(x,float) else x)
+
+# Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
+assert '' not in common_names
+assert all([s not in common_names for s in blank_original_labels])
+assert all([s not in common_names for s in nonblank_original_labels])
+assert np.nan in common_names
+
+# Blanks are represented as "empty" or "misfire" in the "original_label" column
+assert all([s in original_labels for s in blank_original_labels])
+assert all([s in original_labels for s in nonblank_original_labels])
+assert all([s in original_labels for s in other_labels_without_common_names])
+assert all([s not in original_labels for s in ('','blank','none',np.nan)])
+
+
+#%% Count empty labels and common names
+
+common_names_with_empty_original_labels = set()
+original_labels_with_nan_common_names = set()
+
+common_name_to_count = defaultdict(int)
+original_label_to_count = defaultdict(int)
+
+# This loop takes ~10 mins
+for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+    common_name = row['common_name']
+    original_label = row['original_label']
+
+    if isinstance(common_name,float):
+        assert np.isnan(common_name)
+        original_labels_with_nan_common_names.add(original_label)
+
+    common_name = str(common_name)
+
+    assert isinstance(original_label,str)
+    if original_label in blank_original_labels:
+        common_names_with_empty_original_labels.add(common_name)
+    common_name_to_count[common_name] += 1
+    original_label_to_count[original_label] += 1
+
+
+#%% Look at the most common labels and common names
+
+from md_utils.ct_utils import sort_dictionary_by_value
+common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)
+
+k = 10
+
+print('\nMost frequent common names:\n')
+
+i_label = 0
+for i_label,s in enumerate(common_name_to_count):
+    if i_label >= k:
+        break
+    print('{}: {}'.format(s,common_name_to_count[s]))
+
+print('\nMost frequent original labels:\n')
+
+i_label = 0
+for i_label,s in enumerate(original_label_to_count):
+    if i_label >= k:
+        break
+    print('{}: {}'.format(s,original_label_to_count[s]))
+
+
+#%% Do some consistency checks over the empty labels and stats
+
+# All images called 'empty' should have NaN as their common name
+assert (len(common_names_with_empty_original_labels) == 1)
+assert next(iter(common_names_with_empty_original_labels)) == 'nan'
+
+# 'empty' should be the most frequent original label overall
+assert next(iter(original_label_to_count)) == 'empty'
+
+# NaN should be the most frequent common name overall
+assert next(iter(common_name_to_count)) == 'nan'
+
+for s in original_labels_with_nan_common_names:
+    assert \
+        (s in blank_original_labels) or \
+        (s in nonblank_original_labels) or \
+        (s in other_labels_without_common_names)
+
+
+#%% Map locations to blank images
+
+location_to_blank_image_urls_cache_file = os.path.join(project_base,
+                                                       'location_to_blank_image_urls.json')
+
+force_map_locations = False
+
+# Load from .json if available
+if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):
+
+    with open(location_to_blank_image_urls_cache_file,'r') as f:
+        location_to_blank_image_urls = json.load(f)
+
+else:
+
+    location_to_blank_image_urls = defaultdict(list)
+
+    # i_row = 0; row = df.iloc[i_row]
+    for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+        location_id = row['location_id']
+        url = row['url']
+
+        original_label = row['original_label']
+        if original_label in blank_original_labels:
+            assert np.isnan(row['common_name'])
+            location_to_blank_image_urls[location_id].append(url)
+
+    with open(location_to_blank_image_urls_cache_file,'w') as f:
+        json.dump(location_to_blank_image_urls,f,indent=1)
+
+n_locations_with_blanks = len(location_to_blank_image_urls)
+print('Found {} locations with blank images'.format(n_locations_with_blanks))
+
+
+#%% Sample blanks
+
+random.seed(0)
+
+# Make a fresh copy of the lists
+location_to_unsampled_blank_image_urls = {}
+
+# location = next(iter(location_to_blank_image_urls.keys()))
+for location in location_to_blank_image_urls:
+    blank_image_urls_this_location = location_to_blank_image_urls[location]
+    unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
+    location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location
+
+# Put locations in a random order
+location_ids = list(location_to_unsampled_blank_image_urls.keys())
+random.shuffle(location_ids)
+
+blank_urls = []
+location_to_sampled_blanks = defaultdict(list)
+fully_sampled_locations = set()
+
+# Pick from each location until we hit our limit or have no blanks left
+while(True):
+
+    found_sample = False
+
+    # location = location_ids[0]
+    for location in location_ids:
+
+        unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
+        if len(unsampled_images_this_location) == 0:
+            fully_sampled_locations.add(location)
+            continue
+
+        url = random.choice(unsampled_images_this_location)
+        blank_urls.append(url)
+        location_to_unsampled_blank_image_urls[location].remove(url)
+        location_to_sampled_blanks[location].append(url)
+        found_sample = True
+
+        if len(blank_urls) == n_blanks:
+            break
+
+    # ...for each location
+
+    if not found_sample:
+        print('Terminating after {} blanks, we ran out before hitting {}'.format(
+            len(blank_urls),n_blanks))
+
+    if len(blank_urls) == n_blanks:
+        break
+
+# ...while(True)
+
+assert len(blank_urls) <= n_blanks
+min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
+max_blanks_per_location = -1
+for location in location_to_sampled_blanks:
+    n_blanks_this_location = len(location_to_sampled_blanks[location])
+    if n_blanks_this_location >= max_blanks_per_location:
+        max_blanks_per_location = n_blanks_this_location
+    assert (location in fully_sampled_locations) or \
+        n_blanks_this_location >= min_blanks_per_location
+
+print('Choose {} blanks from {} locations'.format(n_blanks,len(location_ids)))
+print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
+print('Max samples per location: {}'.format(max_blanks_per_location))
+
+
+#%% Download those image files (prep)
+
+container_to_url_base = {
+    'lilablobssc.blob.core.windows.net':'/',
+    'storage.googleapis.com':'/public-datasets-lila/'
+    }
+
+def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
+    """
+    Download a URL to output_base, preserving relative path
+    """
+
+    result = {'status':'unknown','url':url,'destination_filename':None}
+
+    if url_base is None:
+        assert url.startswith('https://')
+        container = url.split('/')[2]
+        assert container in container_to_url_base
+        url_base = container_to_url_base[container]
+
+    assert url_base.startswith('/') and url_base.endswith('/')
+
+    p = urlparse(url)
+    relative_filename = str(p.path)
+    # remove the leading '/'
+    assert relative_filename.startswith(url_base)
+    relative_filename = relative_filename.replace(url_base,'',1)
+
+    destination_filename = os.path.join(output_base,relative_filename)
+    result['destination_filename'] = destination_filename
+
+    if ((os.path.isfile(destination_filename)) and (not overwrite)):
+        result['status'] = 'skipped'
+        return result
+    try:
+        download_url(url, destination_filename, verbose=verbose)
+    except Exception as e:
+        print('Warning: error downloading URL {}: {}'.format(
+            url,str(e)))
+        result['status'] = 'error: {}'.format(str(e))
+        return result
+
+    result['status'] = 'success'
+    return result
+
+# Convert Azure URLs to GCP URLs if necessary
+if preferred_image_download_source != 'azure':
+    assert preferred_image_download_source == 'gcp'
+    blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]
+
+
+#%% Download those image files (execution)
+
+print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))
+
+if n_download_threads <= 1:
+
+    results = []
+
+    # url = all_urls[0]
+    for url in tqdm(blank_urls):
+        results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))
+
+else:
+
+    pool = ThreadPool(n_download_threads)
+    results = list(tqdm(pool.imap(lambda s: download_relative_filename(
+        s,candidate_blanks_base,url_base=None),
+        blank_urls), total=len(blank_urls)))
+
+# pool.terminate()
+
+
+#%% Review results
+
+error_urls = []
+for r in results:
+    if r['status'] != 'success':
+        error_urls.append(r['url'])
+
+print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
+
+
+#%% Run MegaDetector on the folder
+
+md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
+    candidate_blanks_base,md_results_file)
+cmd += ' --recursive --output_relative_filenames'
+
+import clipboard; clipboard.copy(cmd); print(cmd)
+
+
+#%% Review MD results that suggests images are non-empty
+
+assert os.path.isfile(md_results_file)
+
+category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
+min_threshold = min(category_name_to_threshold.values())
+with open(md_results_file,'r') as f:
+    md_results = json.load(f)
+
+images_to_review_to_detections = {}
+
+category_id_to_threshold = {}
+for category_id in md_results['detection_categories']:
+    category_name = md_results['detection_categories'][category_id]
+    category_id_to_threshold[category_id] = category_name_to_threshold[category_name]
+
+# im = md_results['images'][0]
+for im in md_results['images']:
+
+    if 'detections' not in im:
+        continue
+
+    found_object = False
+    for det in im['detections']:
+        threshold = category_id_to_threshold[det['category']]
+        if det['conf'] >= threshold:
+            found_object = True
+            break
+    if found_object:
+        images_to_review_to_detections[im['file']] = im['detections']
+
+print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),len(md_results['images'])))
+
+output_file_to_source_file = {}
+
+# i_fn = 0; source_file_relative = images_to_review[i_fn]
+for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
+                                      total=len(images_to_review_to_detections)):
+
+    source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
+    assert os.path.isfile(source_file_abs)
+    ext = os.path.splitext(source_file_abs)[1]
+    target_file_relative = str(i_fn).zfill(8) + ext
+    target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
+    output_file_to_source_file[target_file_relative] = source_file_relative
+    # shutil.copyfile(source_file_abs,target_file_abs)
+    vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
+                                          output_file=target_file_abs,
+                                          detections=images_to_review_to_detections[source_file_relative],
+                                          confidence_threshold=min_threshold,
+                                          target_size=(1280,-1))
+
+with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
+    json.dump(output_file_to_source_file,f,indent=1)
+
+
+#%% Manual review
+
+# Delete images that are *not* empty
+
+
+#%% Figure out which images are still there; these are the actually-blank ones
+
+remaining_images = set(os.listdir(md_possible_non_blanks_folder))
+print('Kept {} of {} candidate blank images'.format(len(remaining_images),
+                                                    len(images_to_review_to_detections)))
+
+removed_blank_images_relative = []
+
+# output_file = next(iter(output_file_to_source_file.keys()))
+for output_file in tqdm(output_file_to_source_file.keys()):
+    if output_file not in remaining_images:
+        source_file_relative = output_file_to_source_file[output_file]
+        removed_blank_images_relative.append(source_file_relative)
+
+assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
+
+
+#%% Copy all the confirmed blanks to the confirmed folder
+
+all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
+print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
+
+for source_fn_relative in tqdm(all_candidate_blanks):
+    source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
+    assert os.path.isfile(source_fn_abs)
+    target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
+    os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
+    shutil.copyfile(source_fn_abs,target_fn_abs)
+
+
+#%% Record location information for each file
+
+fn_relative_to_location = {}
+for location in location_to_blank_image_urls:
+    urls_this_location = location_to_blank_image_urls[location]
+    for url in urls_this_location:
+        fn_relative = url.split('//')[1]
+        fn_relative_to_location[fn_relative] = location
+
+all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
+
+for fn_relative in all_confirmed_blanks:
+    assert fn_relative in fn_relative_to_location
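
For context on the sampling strategy above: the script draws blanks round-robin across locations, taking one random unsampled blank URL from each location per pass until the target count is reached or every location is exhausted, which is what oversamples rare locations relative to common ones. A condensed, standalone sketch of that logic (the toy data and the sample_balanced name below are hypothetical, not part of the package):

import random
from collections import defaultdict

def sample_balanced(location_to_urls, n_target, seed=0):
    # Round-robin draw: one random URL per location per pass
    rng = random.Random(seed)
    unsampled = {loc: list(urls) for loc, urls in location_to_urls.items()}
    locations = list(unsampled.keys())
    rng.shuffle(locations)
    sampled = []
    while len(sampled) < n_target:
        found = False
        for loc in locations:
            if not unsampled[loc]:
                continue
            url = rng.choice(unsampled[loc])
            unsampled[loc].remove(url)
            sampled.append(url)
            found = True
            if len(sampled) == n_target:
                break
        if not found:
            # Every location exhausted before reaching n_target
            break
    return sampled

# Toy example: a rare location still contributes one image per pass
location_to_urls = {'loc_a': ['a_{}.jpg'.format(i) for i in range(1000)],
                    'loc_b': ['b_0.jpg', 'b_1.jpg']}
print(len(sample_balanced(location_to_urls, 10)))  # 10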

data_management/lila/create_lila_test_set.py (+2 -1)

@@ -124,6 +124,8 @@ for ds_name in metadata_table.keys():
 
 #%% Download those image files
 
+# TODO: trivially parallelizable
+#
 # ds_name = (list(metadata_table.keys()))[0]
 for ds_name in metadata_table.keys():
 
@@ -147,4 +149,3 @@ for ds_name in metadata_table.keys():
     # ...for each url
 
 # ...for each dataset
-
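
The new TODO above notes that this per-dataset download loop is trivially parallelizable. The pattern used elsewhere in this release (download_lila_subset.py and the new create_lila_blank_set.py) is a thread pool mapped over the flat URL list; a minimal sketch, assuming a download_one(url) helper (hypothetical name standing in for download_relative_filename) that fetches one URL and returns a status dict:

from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def download_one(url):
    # Hypothetical single-URL helper; in the real scripts this role is played by
    # download_relative_filename(url, output_base, url_base=None).
    return {'status': 'success', 'url': url}

urls = ['https://example.com/a.jpg', 'https://example.com/b.jpg']  # placeholder URLs
n_download_threads = 20

pool = ThreadPool(n_download_threads)
results = list(tqdm(pool.imap(download_one, urls), total=len(urls)))
errors = [r for r in results if r['status'] != 'success']
print('Errors on {} of {} downloads'.format(len(errors), len(results)))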

data_management/lila/create_links_to_md_results_files.py (+1 -1)

@@ -57,7 +57,7 @@ for i_row,row in df.iterrows():
         df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
         df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
     else:
-        # Exclude single-season files for
+        # Exclude single-season files for snapshot-serengeti
         if dataset_shortname == 'snapshot-serengeti':
             matching_files = [fn for fn in matching_files if '_S' not in fn]
         assert len(matching_files) == 2

data_management/lila/download_lila_subset.py (+46 -21)

@@ -24,11 +24,11 @@ from urllib.parse import urlparse
 from collections import defaultdict
 
 from data_management.lila.lila_common import \
-    read_lila_all_images_file,
+    read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
 from md_utils.url_utils import download_url
 
 # If any of these strings appear in the common name of a species, we'll download that image
-species_of_interest = ['grey fox','red fox','leopard cat']
+species_of_interest = ['grey fox','red fox','leopard cat','kiwi']
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -40,30 +40,28 @@ output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
 os.makedirs(output_dir,exist_ok=True)
 
 # Number of concurrent download threads
-n_download_threads =
+n_download_threads = 20
 
 max_images_per_dataset = 10 # None
 
 # This impacts the data download, but not the metadata download
+#
+# "Azure" really means "Azure if available"; recent datasets are only available
+# on GCP.
 image_download_source = 'azure' # 'azure' or 'gcp'
 
 random.seed(0)
 
 
-#%% Download and open the giant table of image
+#%% Download and open the giant table of image URLs and labels
 
-#
+# ~60 seconds to download, unzip, and open
 df = read_lila_all_images_file(metadata_dir)
 
 
-#%% Download and parse the metadata file
-
-metadata_table = read_lila_metadata(metadata_dir)
-
-
 #%% Find all the images we want to download
 
-#
+# ~2 minutes
 
 ds_name_to_urls = defaultdict(list)
 
@@ -106,13 +104,24 @@ else:
 
 #%% Download those image files
 
-
+container_to_url_base = {
+    'lilablobssc.blob.core.windows.net':'/',
+    'storage.googleapis.com':'/public-datasets-lila/'
+    }
+
+def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
     """
     Download a URL to output_base, preserving relative path
     """
 
+    result = {'status':'unknown','url':url,'destination_filename':None}
+
     if url_base is None:
-
+        assert url.startswith('https://')
+        container = url.split('/')[2]
+        assert container in container_to_url_base
+        url_base = container_to_url_base[container]
+
     assert url_base.startswith('/') and url_base.endswith('/')
 
     p = urlparse(url)
@@ -122,29 +131,45 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None):
     relative_filename = relative_filename.replace(url_base,'',1)
 
     destination_filename = os.path.join(output_base,relative_filename)
-
+    result['destination_filename'] = destination_filename
+
+    if ((os.path.isfile(destination_filename)) and (not overwrite)):
+        result['status'] = 'skipped'
+        return result
+    try:
+        download_url(url, destination_filename, verbose=verbose)
+    except Exception as e:
+        print('Warning: error downloading URL {}: {}'.format(
+            url,str(e)))
+        result['status'] = 'error: {}'.format(str(e))
+        return result
 
+    result['status'] = 'success'
+    return result
+
+
+# ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]
 
-url_base = '/'
-
 # Convert Azure URLs to GCP URLs if necessary
 if image_download_source != 'azure':
     assert image_download_source == 'gcp'
-    url_base = '/public-datasets-lila/'
     all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
 
-print('Downloading {} images
+print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
 
 if n_download_threads <= 1:
 
+    results = []
+
     # url = all_urls[0]
     for url in tqdm(all_urls):
-        download_relative_filename(url,output_dir,
+        results.append(download_relative_filename(url,output_dir,url_base=None))
 
 else:
 
     pool = ThreadPool(n_download_threads)
-    tqdm(pool.imap(lambda s: download_relative_filename(
-
+    results = list(tqdm(pool.imap(lambda s: download_relative_filename(
+        s,output_dir,url_base=None),
+        all_urls), total=len(all_urls)))
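
With this change, download_relative_filename returns a status dict ({'status', 'url', 'destination_filename'}) rather than relying on exceptions, so callers can tally skipped and failed downloads after the pool finishes. A minimal sketch of how the returned results might be summarized (the placeholder results list below mimics what the pool.imap() loop produces and is not part of the package):

from collections import Counter

# Placeholder results, mimicking the dicts returned by download_relative_filename
results = [{'status': 'success', 'url': 'u1', 'destination_filename': 'f1'},
           {'status': 'skipped', 'url': 'u2', 'destination_filename': 'f2'},
           {'status': 'error: HTTP 404', 'url': 'u3', 'destination_filename': 'f3'}]

# Collapse 'error: ...' strings into a single bucket before counting
status_counts = Counter(
    'error' if r['status'].startswith('error') else r['status'] for r in results)
print(status_counts)  # Counter({'success': 1, 'skipped': 1, 'error': 1})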