megadetector-5.0.5-py3-none-any.whl → megadetector-5.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/batch_processing/data_preparation/manage_local_batch.py +302 -263
- api/batch_processing/data_preparation/manage_video_batch.py +81 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
- api/batch_processing/postprocessing/compare_batch_results.py +110 -60
- api/batch_processing/postprocessing/load_api_results.py +56 -70
- api/batch_processing/postprocessing/md_to_coco.py +1 -1
- api/batch_processing/postprocessing/md_to_labelme.py +2 -1
- api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
- classification/prepare_classification_script.py +191 -191
- data_management/coco_to_yolo.py +68 -45
- data_management/databases/integrity_check_json_db.py +7 -5
- data_management/generate_crops_from_cct.py +3 -3
- data_management/get_image_sizes.py +8 -6
- data_management/importers/add_timestamps_to_icct.py +79 -0
- data_management/importers/animl_results_to_md_results.py +160 -0
- data_management/importers/auckland_doc_test_to_json.py +4 -4
- data_management/importers/auckland_doc_to_json.py +1 -1
- data_management/importers/awc_to_json.py +5 -5
- data_management/importers/bellevue_to_json.py +5 -5
- data_management/importers/carrizo_shrubfree_2018.py +5 -5
- data_management/importers/carrizo_trail_cam_2017.py +5 -5
- data_management/importers/cct_field_adjustments.py +2 -3
- data_management/importers/channel_islands_to_cct.py +4 -4
- data_management/importers/ena24_to_json.py +5 -5
- data_management/importers/helena_to_cct.py +10 -10
- data_management/importers/idaho-camera-traps.py +12 -12
- data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
- data_management/importers/jb_csv_to_json.py +4 -4
- data_management/importers/missouri_to_json.py +1 -1
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/importers/pc_to_json.py +5 -5
- data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
- data_management/importers/prepare_zsl_imerit.py +5 -5
- data_management/importers/rspb_to_json.py +4 -4
- data_management/importers/save_the_elephants_survey_A.py +5 -5
- data_management/importers/save_the_elephants_survey_B.py +6 -6
- data_management/importers/snapshot_safari_importer.py +9 -9
- data_management/importers/snapshot_serengeti_lila.py +9 -9
- data_management/importers/timelapse_csv_set_to_json.py +5 -7
- data_management/importers/ubc_to_json.py +4 -4
- data_management/importers/umn_to_json.py +4 -4
- data_management/importers/wellington_to_json.py +1 -1
- data_management/importers/wi_to_json.py +2 -2
- data_management/importers/zamba_results_to_md_results.py +181 -0
- data_management/labelme_to_coco.py +35 -7
- data_management/labelme_to_yolo.py +229 -0
- data_management/lila/add_locations_to_island_camera_traps.py +1 -1
- data_management/lila/add_locations_to_nacti.py +147 -0
- data_management/lila/create_lila_blank_set.py +474 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +106 -0
- data_management/lila/download_lila_subset.py +46 -21
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +17 -11
- data_management/lila/lila_common.py +14 -11
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/ocr_tools.py +829 -0
- data_management/resize_coco_dataset.py +13 -11
- data_management/yolo_output_to_md_output.py +84 -12
- data_management/yolo_to_coco.py +38 -20
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +23 -8
- detection/run_detector.py +76 -19
- detection/run_detector_batch.py +178 -63
- detection/run_inference_with_yolov5_val.py +326 -57
- detection/run_tiled_inference.py +153 -43
- detection/video_utils.py +34 -8
- md_utils/ct_utils.py +172 -1
- md_utils/md_tests.py +372 -51
- md_utils/path_utils.py +167 -39
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +0 -2
- md_utils/write_html_image_list.py +9 -26
- md_visualization/plot_utils.py +12 -8
- md_visualization/visualization_utils.py +106 -7
- md_visualization/visualize_db.py +16 -8
- md_visualization/visualize_detector_output.py +208 -97
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- api/synchronous/api_core/yolov5/detect.py +0 -252
- api/synchronous/api_core/yolov5/export.py +0 -607
- api/synchronous/api_core/yolov5/hubconf.py +0 -146
- api/synchronous/api_core/yolov5/models/__init__.py +0 -0
- api/synchronous/api_core/yolov5/models/common.py +0 -738
- api/synchronous/api_core/yolov5/models/experimental.py +0 -104
- api/synchronous/api_core/yolov5/models/tf.py +0 -574
- api/synchronous/api_core/yolov5/models/yolo.py +0 -338
- api/synchronous/api_core/yolov5/train.py +0 -670
- api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
- api/synchronous/api_core/yolov5/utils/activations.py +0 -103
- api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
- api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
- api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
- api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
- api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
- api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
- api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
- api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
- api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
- api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
- api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
- api/synchronous/api_core/yolov5/utils/general.py +0 -1018
- api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
- api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
- api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
- api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
- api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
- api/synchronous/api_core/yolov5/utils/loss.py +0 -234
- api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
- api/synchronous/api_core/yolov5/utils/plots.py +0 -489
- api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
- api/synchronous/api_core/yolov5/val.py +0 -394
- md_utils/matlab_porting_tools.py +0 -97
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
data_management/lila/create_lila_blank_set.py (new file)

@@ -0,0 +1,474 @@

########
#
# create_lila_blank_set.py
#
# Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
# locations will be oversampled relative to more common locations. We'll also run MegaDetector
# to minimize the chance that incorrectly-labeled non-empty images sneak into our blank set.
#
########

#%% Constants and imports

import os
import random
import math
import json
import shutil

import numpy as np
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from urllib.parse import urlparse
from collections import defaultdict

from data_management.lila.lila_common import \
    read_lila_all_images_file, azure_url_to_gcp_http_url
from md_utils.url_utils import download_url
from md_visualization import visualization_utils as vis_utils
from md_utils.path_utils import recursive_file_list

# We'll write images, metadata downloads, and temporary files here
lila_local_base = os.path.expanduser('~/lila')

metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)

project_base = os.path.join(lila_local_base,'lila_blanks')

candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
os.makedirs(candidate_blanks_base,exist_ok=True)

confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
os.makedirs(confirmed_blanks_base,exist_ok=True)

md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
os.makedirs(md_possible_non_blanks_folder,exist_ok=True)

preferred_image_download_source = 'gcp'

# Number of concurrent download threads
n_download_threads = 20

n_blanks = 100000

random.seed(0)


#%% Download and open the giant table of image URLs and labels

# ~60 seconds to download, unzip, and open
df = read_lila_all_images_file(metadata_dir)


#%% Explore blank labels

# Original labels we're treating as blank:
blank_original_labels = (
    'empty','misfire'
)

# Some notable original labels we're *not* treating as blank:
nonblank_original_labels = (
    'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
    'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
    'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
    'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
)

other_labels_without_common_names = (
    'car', 'motorcycle', 'vehicle'
)

common_names = sorted(list(df['common_name'].unique()),
                      key=lambda x:str(x) if isinstance(x,float) else x)
original_labels = sorted(list(df['original_label'].unique()),
                         key=lambda x:str(x) if isinstance(x,float) else x)

# Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
assert '' not in common_names
assert all([s not in common_names for s in blank_original_labels])
assert all([s not in common_names for s in nonblank_original_labels])
assert np.nan in common_names

# Blanks are represented as "empty" or "misfire" in the "original_label" column
assert all([s in original_labels for s in blank_original_labels])
assert all([s in original_labels for s in nonblank_original_labels])
assert all([s in original_labels for s in other_labels_without_common_names])
assert all([s not in original_labels for s in ('','blank','none',np.nan)])


#%% Count empty labels and common names

common_names_with_empty_original_labels = set()
original_labels_with_nan_common_names = set()

common_name_to_count = defaultdict(int)
original_label_to_count = defaultdict(int)

# This loop takes ~10 mins
for i_row,row in tqdm(df.iterrows(),total=len(df)):

    common_name = row['common_name']
    original_label = row['original_label']

    if isinstance(common_name,float):
        assert np.isnan(common_name)
        original_labels_with_nan_common_names.add(original_label)

    common_name = str(common_name)

    assert isinstance(original_label,str)
    if original_label in blank_original_labels:
        common_names_with_empty_original_labels.add(common_name)
    common_name_to_count[common_name] += 1
    original_label_to_count[original_label] += 1


#%% Look at the most common labels and common names

from md_utils.ct_utils import sort_dictionary_by_value
common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)

k = 10

print('\nMost frequent common names:\n')

i_label = 0
for i_label,s in enumerate(common_name_to_count):
    if i_label >= k:
        break
    print('{}: {}'.format(s,common_name_to_count[s]))

print('\nMost frequent original labels:\n')

i_label = 0
for i_label,s in enumerate(original_label_to_count):
    if i_label >= k:
        break
    print('{}: {}'.format(s,original_label_to_count[s]))


#%% Do some consistency checks over the empty labels and stats

# All images called 'empty' should have NaN as their common name
assert (len(common_names_with_empty_original_labels) == 1)
assert next(iter(common_names_with_empty_original_labels)) == 'nan'

# 'empty' should be the most frequent original label overall
assert next(iter(original_label_to_count)) == 'empty'

# NaN should be the most frequent common name overall
assert next(iter(common_name_to_count)) == 'nan'

for s in original_labels_with_nan_common_names:
    assert \
        (s in blank_original_labels) or \
        (s in nonblank_original_labels) or \
        (s in other_labels_without_common_names)


#%% Map locations to blank images

location_to_blank_image_urls_cache_file = os.path.join(project_base,
                                                       'location_to_blank_image_urls.json')

force_map_locations = False

# Load from .json if available
if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):

    with open(location_to_blank_image_urls_cache_file,'r') as f:
        location_to_blank_image_urls = json.load(f)

else:

    location_to_blank_image_urls = defaultdict(list)

    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in tqdm(df.iterrows(),total=len(df)):

        location_id = row['location_id']
        url = row['url']

        original_label = row['original_label']
        if original_label in blank_original_labels:
            assert np.isnan(row['common_name'])
            location_to_blank_image_urls[location_id].append(url)

    with open(location_to_blank_image_urls_cache_file,'w') as f:
        json.dump(location_to_blank_image_urls,f,indent=1)

n_locations_with_blanks = len(location_to_blank_image_urls)
print('Found {} locations with blank images'.format(n_locations_with_blanks))


#%% Sample blanks

random.seed(0)

# Make a fresh copy of the lists
location_to_unsampled_blank_image_urls = {}

# location = next(iter(location_to_blank_image_urls.keys()))
for location in location_to_blank_image_urls:
    blank_image_urls_this_location = location_to_blank_image_urls[location]
    unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
    location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location

# Put locations in a random order
location_ids = list(location_to_unsampled_blank_image_urls.keys())
random.shuffle(location_ids)

blank_urls = []
location_to_sampled_blanks = defaultdict(list)
fully_sampled_locations = set()

# Pick from each location until we hit our limit or have no blanks left
while(True):

    found_sample = False

    # location = location_ids[0]
    for location in location_ids:

        unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
        if len(unsampled_images_this_location) == 0:
            fully_sampled_locations.add(location)
            continue

        url = random.choice(unsampled_images_this_location)
        blank_urls.append(url)
        location_to_unsampled_blank_image_urls[location].remove(url)
        location_to_sampled_blanks[location].append(url)
        found_sample = True

        if len(blank_urls) == n_blanks:
            break

    # ...for each location

    if not found_sample:
        print('Terminating after {} blanks, we ran out before hitting {}'.format(
            len(blank_urls),n_blanks))

    if len(blank_urls) == n_blanks:
        break

# ...while(True)

assert len(blank_urls) <= n_blanks
min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
max_blanks_per_location = -1
for location in location_to_sampled_blanks:
    n_blanks_this_location = len(location_to_sampled_blanks[location])
    if n_blanks_this_location >= max_blanks_per_location:
        max_blanks_per_location = n_blanks_this_location
    assert (location in fully_sampled_locations) or \
        n_blanks_this_location >= min_blanks_per_location

print('Choose {} blanks from {} locations'.format(n_blanks,len(location_ids)))
print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
print('Max samples per location: {}'.format(max_blanks_per_location))


#%% Download those image files (prep)

container_to_url_base = {
    'lilablobssc.blob.core.windows.net':'/',
    'storage.googleapis.com':'/public-datasets-lila/'
}

def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
    """
    Download a URL to output_base, preserving relative path
    """

    result = {'status':'unknown','url':url,'destination_filename':None}

    if url_base is None:
        assert url.startswith('https://')
        container = url.split('/')[2]
        assert container in container_to_url_base
        url_base = container_to_url_base[container]

    assert url_base.startswith('/') and url_base.endswith('/')

    p = urlparse(url)
    relative_filename = str(p.path)
    # remove the leading '/'
    assert relative_filename.startswith(url_base)
    relative_filename = relative_filename.replace(url_base,'',1)

    destination_filename = os.path.join(output_base,relative_filename)
    result['destination_filename'] = destination_filename

    if ((os.path.isfile(destination_filename)) and (not overwrite)):
        result['status'] = 'skipped'
        return result
    try:
        download_url(url, destination_filename, verbose=verbose)
    except Exception as e:
        print('Warning: error downloading URL {}: {}'.format(
            url,str(e)))
        result['status'] = 'error: {}'.format(str(e))
        return result

    result['status'] = 'success'
    return result

# Convert Azure URLs to GCP URLs if necessary
if preferred_image_download_source != 'azure':
    assert preferred_image_download_source == 'gcp'
    blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]


#%% Download those image files (execution)

print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))

if n_download_threads <= 1:

    results = []

    # url = all_urls[0]
    for url in tqdm(blank_urls):
        results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))

else:

    pool = ThreadPool(n_download_threads)
    results = list(tqdm(pool.imap(lambda s: download_relative_filename(
        s,candidate_blanks_base,url_base=None),
        blank_urls), total=len(blank_urls)))

    # pool.terminate()


#%% Review results

error_urls = []
for r in results:
    if r['status'] != 'success':
        error_urls.append(r['url'])

print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))


#%% Run MegaDetector on the folder

md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')

cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
    candidate_blanks_base,md_results_file)
cmd += ' --recursive --output_relative_filenames'

import clipboard; clipboard.copy(cmd); print(cmd)


#%% Review MD results that suggests images are non-empty

assert os.path.isfile(md_results_file)

category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
min_threshold = min(category_name_to_threshold.values())
with open(md_results_file,'r') as f:
    md_results = json.load(f)

images_to_review_to_detections = {}

category_id_to_threshold = {}
for category_id in md_results['detection_categories']:
    category_name = md_results['detection_categories'][category_id]
    category_id_to_threshold[category_id] = category_name_to_threshold[category_name]

# im = md_results['images'][0]
for im in md_results['images']:

    if 'detections' not in im:
        continue

    found_object = False
    for det in im['detections']:
        threshold = category_id_to_threshold[det['category']]
        if det['conf'] >= threshold:
            found_object = True
            break
    if found_object:
        images_to_review_to_detections[im['file']] = im['detections']

print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),len(md_results['images'])))

output_file_to_source_file = {}

# i_fn = 0; source_file_relative = images_to_review[i_fn]
for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
                                      total=len(images_to_review_to_detections)):

    source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
    assert os.path.isfile(source_file_abs)
    ext = os.path.splitext(source_file_abs)[1]
    target_file_relative = str(i_fn).zfill(8) + ext
    target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
    output_file_to_source_file[target_file_relative] = source_file_relative
    # shutil.copyfile(source_file_abs,target_file_abs)
    vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
                                          output_file=target_file_abs,
                                          detections=images_to_review_to_detections[source_file_relative],
                                          confidence_threshold=min_threshold,
                                          target_size=(1280,-1))

with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
    json.dump(output_file_to_source_file,f,indent=1)


#%% Manual review

# Delete images that are *not* empty


#%% Figure out which images are still there; these are the actually-blank ones

remaining_images = set(os.listdir(md_possible_non_blanks_folder))
print('Kept {} of {} candidate blank images'.format(len(remaining_images),
                                                    len(images_to_review_to_detections)))

removed_blank_images_relative = []

# output_file = next(iter(output_file_to_source_file.keys()))
for output_file in tqdm(output_file_to_source_file.keys()):
    if output_file not in remaining_images:
        source_file_relative = output_file_to_source_file[output_file]
        removed_blank_images_relative.append(source_file_relative)

assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)


#%% Copy all the confirmed blanks to the confirmed folder

all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
print('Found {} candidate blanks'.format(len(all_candidate_blanks)))

for source_fn_relative in tqdm(all_candidate_blanks):
    source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
    assert os.path.isfile(source_fn_abs)
    target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
    os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
    shutil.copyfile(source_fn_abs,target_fn_abs)


#%% Record location information for each file

fn_relative_to_location = {}
for location in location_to_blank_image_urls:
    urls_this_location = location_to_blank_image_urls[location]
    for url in urls_this_location:
        fn_relative = url.split('//')[1]
        fn_relative_to_location[fn_relative] = location

all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))

for fn_relative in all_confirmed_blanks:
    assert fn_relative in fn_relative_to_location
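The "Run MegaDetector on the folder" cell above builds the batch-inference command and copies it to the clipboard for manual execution. As a minimal sketch (not part of the diff), the same command string can be launched directly with the standard library's subprocess module, assuming it is run from a directory where run_detector_batch.py is available (e.g. the detection folder of this package); the paths below simply mirror the ones defined in the script.

# Minimal sketch only: run the MegaDetector command built above with subprocess
# instead of copying it to the clipboard. Paths mirror create_lila_blank_set.py.
import os
import subprocess

project_base = os.path.expanduser('~/lila/lila_blanks')
candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')

cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
    candidate_blanks_base,md_results_file)
cmd += ' --recursive --output_relative_filenames'

# shell=True mirrors pasting the command into a terminal; check=True raises
# CalledProcessError if MegaDetector exits with a non-zero status.
subprocess.run(cmd, shell=True, check=True)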
@@ -124,6 +124,8 @@ for ds_name in metadata_table.keys():
 
 #%% Download those image files
 
+# TODO: trivially parallelizable
+#
 # ds_name = (list(metadata_table.keys()))[0]
 for ds_name in metadata_table.keys():
 
@@ -147,4 +149,3 @@ for ds_name in metadata_table.keys():
 # ...for each url
 
 # ...for each dataset
-
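The new "TODO: trivially parallelizable" comment refers to the serial per-URL download loop that follows it. A hedged sketch of one way to act on that TODO, reusing the ThreadPool + tqdm pattern that create_lila_blank_set.py (earlier in this diff) already uses for its downloads; download_one_url and urls_to_download are hypothetical stand-ins for the body and inputs of the existing loop, not names from the package.

# Hypothetical sketch only: parallelize a per-URL download loop with a thread
# pool, mirroring the pattern used in create_lila_blank_set.py above.
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def download_one_url(url):
    # Stand-in for the body of the existing serial loop: download one image
    # and return something reviewable.
    return {'url':url,'status':'success'}

def parallel_download(urls_to_download, n_download_threads=20):
    pool = ThreadPool(n_download_threads)
    results = list(tqdm(pool.imap(download_one_url, urls_to_download),
                        total=len(urls_to_download)))
    return results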
data_management/lila/create_links_to_md_results_files.py (new file)

@@ -0,0 +1,106 @@

########
#
# create_links_to_md_results_files.py
#
# One-off script to populate the columns in the camera trap data .csv file that point to MD results.
#
########

#%% Imports and constants

import os

import pandas as pd

input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'

md_results_local_folder = r'g:\temp\lila-md-results'
md_base_url = 'https://lila.science/public/lila-md-results/'
assert md_base_url.endswith('/')

# No RDE files for datasets with no location information
datasets_without_location_info = ('ena24','missouri-camera-traps')

md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']

validate_urls = False


#%% Read input data

df = pd.read_csv(input_csv_file)
for s in md_results_column_names:
    df[s] = ''


#%% Find matching files locally, and create URLs

local_files = os.listdir(md_results_local_folder)
local_files = [fn for fn in local_files if fn.endswith('.zip')]

# i_row = 0; row = df.iloc[i_row]
for i_row,row in df.iterrows():

    if not isinstance(row['name'],str):
        continue

    dataset_shortname = row['short_name']
    matching_files = [fn for fn in local_files if dataset_shortname in fn]

    # No RDE files for datasets with no location information
    if dataset_shortname in datasets_without_location_info:
        assert len(matching_files) == 2
        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
    else:
        # Exclude single-season files for snapshot-serengeti
        if dataset_shortname == 'snapshot-serengeti':
            matching_files = [fn for fn in matching_files if '_S' not in fn]
            assert len(matching_files) == 2
            assert all(['mdv4' in fn for fn in matching_files])
            rde_files = [fn for fn in matching_files if 'rde' in fn]
            raw_files = [fn for fn in matching_files if 'rde' not in fn]
            assert len(rde_files) == 1 and len(raw_files) == 1
            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
        else:
            assert len(matching_files) == 3
            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
            rde_files = [fn for fn in matching_files if 'rde' in fn]
            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]

    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))

# ...for each row


#%% Validate URLs

if validate_urls:

    from md_utils.url_utils import test_urls

    urls = set()

    for i_row,row in df.iterrows():
        for column_name in md_results_column_names:
            if len(row[column_name]) > 0:
                assert row[column_name] not in urls
                urls.add(row[column_name])

    test_urls(urls,error_on_failure=True)

    print('Validated {} URLs'.format(len(urls)))


#%% Write new .csv file

df.to_csv(output_csv_file,header=True,index=False)
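Once the script writes output_csv_file, the four MD-results columns can be read back with pandas, much like the script's own validation cell does. A minimal sketch under those assumptions; collect_md_results_urls is a hypothetical helper, not part of the package, and the column names and path simply repeat the ones above.

# Hypothetical helper: read the populated .csv back and collect the MD-results
# URLs it now points to. Column names and path match the script above.
import pandas as pd

md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw',
                           'mdv5b_results_raw','md_results_with_rde']

def collect_md_results_urls(csv_file):
    df = pd.read_csv(csv_file)
    urls = []
    for column_name in md_results_column_names:
        # Empty cells were written as '', but may read back as NaN
        for url in df[column_name].fillna(''):
            if len(url) > 0:
                urls.append(url)
    return urls

urls = collect_md_results_urls(r'g:\temp\lila_camera_trap_datasets.csv')
print('Found {} MD results URLs'.format(len(urls)))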