megadetector-10.0.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- megadetector/__init__.py +0 -0
- megadetector/api/__init__.py +0 -0
- megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
- megadetector/classification/__init__.py +0 -0
- megadetector/classification/aggregate_classifier_probs.py +108 -0
- megadetector/classification/analyze_failed_images.py +227 -0
- megadetector/classification/cache_batchapi_outputs.py +198 -0
- megadetector/classification/create_classification_dataset.py +626 -0
- megadetector/classification/crop_detections.py +516 -0
- megadetector/classification/csv_to_json.py +226 -0
- megadetector/classification/detect_and_crop.py +853 -0
- megadetector/classification/efficientnet/__init__.py +9 -0
- megadetector/classification/efficientnet/model.py +415 -0
- megadetector/classification/efficientnet/utils.py +608 -0
- megadetector/classification/evaluate_model.py +520 -0
- megadetector/classification/identify_mislabeled_candidates.py +152 -0
- megadetector/classification/json_to_azcopy_list.py +63 -0
- megadetector/classification/json_validator.py +696 -0
- megadetector/classification/map_classification_categories.py +276 -0
- megadetector/classification/merge_classification_detection_output.py +509 -0
- megadetector/classification/prepare_classification_script.py +194 -0
- megadetector/classification/prepare_classification_script_mc.py +228 -0
- megadetector/classification/run_classifier.py +287 -0
- megadetector/classification/save_mislabeled.py +110 -0
- megadetector/classification/train_classifier.py +827 -0
- megadetector/classification/train_classifier_tf.py +725 -0
- megadetector/classification/train_utils.py +323 -0
- megadetector/data_management/__init__.py +0 -0
- megadetector/data_management/animl_to_md.py +161 -0
- megadetector/data_management/annotations/__init__.py +0 -0
- megadetector/data_management/annotations/annotation_constants.py +33 -0
- megadetector/data_management/camtrap_dp_to_coco.py +270 -0
- megadetector/data_management/cct_json_utils.py +566 -0
- megadetector/data_management/cct_to_md.py +184 -0
- megadetector/data_management/cct_to_wi.py +293 -0
- megadetector/data_management/coco_to_labelme.py +284 -0
- megadetector/data_management/coco_to_yolo.py +702 -0
- megadetector/data_management/databases/__init__.py +0 -0
- megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
- megadetector/data_management/databases/integrity_check_json_db.py +528 -0
- megadetector/data_management/databases/subset_json_db.py +195 -0
- megadetector/data_management/generate_crops_from_cct.py +200 -0
- megadetector/data_management/get_image_sizes.py +164 -0
- megadetector/data_management/labelme_to_coco.py +559 -0
- megadetector/data_management/labelme_to_yolo.py +349 -0
- megadetector/data_management/lila/__init__.py +0 -0
- megadetector/data_management/lila/create_lila_blank_set.py +556 -0
- megadetector/data_management/lila/create_lila_test_set.py +187 -0
- megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
- megadetector/data_management/lila/download_lila_subset.py +182 -0
- megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
- megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
- megadetector/data_management/lila/get_lila_image_counts.py +112 -0
- megadetector/data_management/lila/lila_common.py +319 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
- megadetector/data_management/mewc_to_md.py +344 -0
- megadetector/data_management/ocr_tools.py +873 -0
- megadetector/data_management/read_exif.py +964 -0
- megadetector/data_management/remap_coco_categories.py +195 -0
- megadetector/data_management/remove_exif.py +156 -0
- megadetector/data_management/rename_images.py +194 -0
- megadetector/data_management/resize_coco_dataset.py +663 -0
- megadetector/data_management/speciesnet_to_md.py +41 -0
- megadetector/data_management/wi_download_csv_to_coco.py +247 -0
- megadetector/data_management/yolo_output_to_md_output.py +594 -0
- megadetector/data_management/yolo_to_coco.py +876 -0
- megadetector/data_management/zamba_to_md.py +188 -0
- megadetector/detection/__init__.py +0 -0
- megadetector/detection/change_detection.py +840 -0
- megadetector/detection/process_video.py +479 -0
- megadetector/detection/pytorch_detector.py +1451 -0
- megadetector/detection/run_detector.py +1267 -0
- megadetector/detection/run_detector_batch.py +2159 -0
- megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
- megadetector/detection/run_md_and_speciesnet.py +1494 -0
- megadetector/detection/run_tiled_inference.py +1038 -0
- megadetector/detection/tf_detector.py +209 -0
- megadetector/detection/video_utils.py +1379 -0
- megadetector/postprocessing/__init__.py +0 -0
- megadetector/postprocessing/add_max_conf.py +72 -0
- megadetector/postprocessing/categorize_detections_by_size.py +166 -0
- megadetector/postprocessing/classification_postprocessing.py +1752 -0
- megadetector/postprocessing/combine_batch_outputs.py +249 -0
- megadetector/postprocessing/compare_batch_results.py +2110 -0
- megadetector/postprocessing/convert_output_format.py +403 -0
- megadetector/postprocessing/create_crop_folder.py +629 -0
- megadetector/postprocessing/detector_calibration.py +570 -0
- megadetector/postprocessing/generate_csv_report.py +522 -0
- megadetector/postprocessing/load_api_results.py +223 -0
- megadetector/postprocessing/md_to_coco.py +428 -0
- megadetector/postprocessing/md_to_labelme.py +351 -0
- megadetector/postprocessing/md_to_wi.py +41 -0
- megadetector/postprocessing/merge_detections.py +392 -0
- megadetector/postprocessing/postprocess_batch_results.py +2077 -0
- megadetector/postprocessing/remap_detection_categories.py +226 -0
- megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
- megadetector/postprocessing/separate_detections_into_folders.py +795 -0
- megadetector/postprocessing/subset_json_detector_output.py +964 -0
- megadetector/postprocessing/top_folders_to_bottom.py +238 -0
- megadetector/postprocessing/validate_batch_results.py +332 -0
- megadetector/taxonomy_mapping/__init__.py +0 -0
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
- megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
- megadetector/taxonomy_mapping/simple_image_download.py +224 -0
- megadetector/taxonomy_mapping/species_lookup.py +1008 -0
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
- megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
- megadetector/tests/__init__.py +0 -0
- megadetector/tests/test_nms_synthetic.py +335 -0
- megadetector/utils/__init__.py +0 -0
- megadetector/utils/ct_utils.py +1857 -0
- megadetector/utils/directory_listing.py +199 -0
- megadetector/utils/extract_frames_from_video.py +307 -0
- megadetector/utils/gpu_test.py +125 -0
- megadetector/utils/md_tests.py +2072 -0
- megadetector/utils/path_utils.py +2832 -0
- megadetector/utils/process_utils.py +172 -0
- megadetector/utils/split_locations_into_train_val.py +237 -0
- megadetector/utils/string_utils.py +234 -0
- megadetector/utils/url_utils.py +825 -0
- megadetector/utils/wi_platform_utils.py +968 -0
- megadetector/utils/wi_taxonomy_utils.py +1759 -0
- megadetector/utils/write_html_image_list.py +239 -0
- megadetector/visualization/__init__.py +0 -0
- megadetector/visualization/plot_utils.py +309 -0
- megadetector/visualization/render_images_with_thumbnails.py +243 -0
- megadetector/visualization/visualization_utils.py +1940 -0
- megadetector/visualization/visualize_db.py +630 -0
- megadetector/visualization/visualize_detector_output.py +479 -0
- megadetector/visualization/visualize_video_output.py +705 -0
- megadetector-10.0.13.dist-info/METADATA +134 -0
- megadetector-10.0.13.dist-info/RECORD +147 -0
- megadetector-10.0.13.dist-info/WHEEL +5 -0
- megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
- megadetector-10.0.13.dist-info/top_level.txt +1 -0
megadetector/data_management/lila/create_lila_test_set.py
@@ -0,0 +1,187 @@
+"""
+
+create_lila_test_set.py
+
+Create a test set of camera trap images, containing N empty and N non-empty
+images from each LILA data set.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+import random
+
+from megadetector.data_management.lila.lila_common import \
+    read_lila_metadata, read_metadata_file_for_dataset
+from megadetector.utils.url_utils import parallel_download_urls
+from megadetector.utils.path_utils import open_file
+
+n_empty_images_per_dataset = 1
+n_non_empty_images_per_dataset = 1
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_test_set')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+random.seed(0)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+
+#%% Download and extract metadata for every dataset
+
+for ds_name in metadata_table.keys():
+    metadata_table[ds_name]['metadata_filename'] = \
+        read_metadata_file_for_dataset(ds_name=ds_name,
+                                       metadata_dir=metadata_dir,
+                                       metadata_table=metadata_table)
+
+
+#%% Choose images from each dataset
+
+# Takes ~60 seconds
+
+empty_category_names = ['empty','blank']
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    print('Choosing images for {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['metadata_filename']
+
+    with open(json_filename,'r') as f:
+        d = json.load(f)
+
+    category_id_to_name = {c['id']:c['name'] for c in d['categories']}
+    category_name_to_id = {c['name']:c['id'] for c in d['categories']}
+
+    ## Find empty images
+
+    empty_category_present = False
+    for category_name in category_name_to_id:
+        if category_name in empty_category_names:
+            empty_category_present = True
+            break
+    if not empty_category_present:
+        empty_annotations_to_download = []
+    else:
+        empty_category_id = None
+        for category_name in empty_category_names:
+            if category_name in category_name_to_id:
+                if empty_category_id is not None:
+                    print('Warning: multiple empty categories in dataset {}'.format(ds_name))
+                else:
+                    empty_category_id = category_name_to_id[category_name]
+        assert empty_category_id is not None
+        empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] == empty_category_id]
+        try:
+            empty_annotations_to_download = random.sample(empty_annotations,n_empty_images_per_dataset)
+        except ValueError:
+            print('No empty images available for dataset {}'.format(ds_name))
+            empty_annotations_to_download = []
+
+    ## Find non-empty images
+
+    non_empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] != empty_category_id]
+    try:
+        non_empty_annotations_to_download = random.sample(non_empty_annotations,n_non_empty_images_per_dataset)
+    except ValueError:
+        print('No non-empty images available for dataset {}'.format(ds_name))
+        non_empty_annotations_to_download = []
+
+
+    annotations_to_download = empty_annotations_to_download + non_empty_annotations_to_download
+
+    image_ids_to_download = set([ann['image_id'] for ann in annotations_to_download])
+    assert len(image_ids_to_download) == len(set(image_ids_to_download))
+
+    images_to_download = []
+    for im in d['images']:
+        if im['id'] in image_ids_to_download:
+            images_to_download.append(im)
+    assert len(images_to_download) == len(image_ids_to_download)
+
+    metadata_table[ds_name]['images_to_download'] = images_to_download
+
+# ...for each dataset
+
+
+#%% Convert to URLs
+
+preferred_cloud = 'gcp'
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not base_url.endswith('/')
+
+    # Retrieve image file names
+    filenames = [im['file_name'] for im in metadata_table[ds_name]['images_to_download']]
+
+    urls_to_download = []
+
+    # Convert to URLs
+    for fn in filenames:
+        url = base_url + '/' + fn
+        urls_to_download.append(url)
+
+    metadata_table[ds_name]['urls_to_download'] = urls_to_download
+
+# ...for each dataset
+
+
+#%% Download image files (prep)
+
+url_to_target_file = {}
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not base_url.endswith('/')
+    base_url += '/'
+
+    urls_to_download = metadata_table[ds_name]['urls_to_download']
+
+    # url = urls_to_download[0]
+    for url in urls_to_download:
+
+        assert base_url in url
+        output_file_relative = ds_name.lower().replace(' ','_') + \
+            '_' + url.replace(base_url,'').replace('/','_').replace('\\','_')
+        output_file_absolute = os.path.join(output_dir,output_file_relative)
+        url_to_target_file[url] = output_file_absolute
+
+    # ...for each url
+
+# ...for each dataset
+
+
+#%% Download image files (execution)
+
+download_results = parallel_download_urls(url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=20,
+                                          pool_type='thread')
+
+# r = download_results[0]
+for r in download_results:
+    assert r['status'] in ('skipped','success')
+
+
+#%% Open the test set
+
+open_file(output_dir)
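The target-filename convention in the "Download image files (prep)" cell above flattens each image's path under its dataset name, which can be easy to misread. A minimal standalone sketch of that transformation follows, using a hypothetical dataset name and URL rather than real LILA metadata:

import os

# Hypothetical values, for illustration only; in the script above these come from
# read_lila_metadata() and the per-dataset 'images_to_download' list
ds_name = 'Example Camera Traps'
base_url = 'https://storage.googleapis.com/example-bucket'   # asserted not to end with '/'
url = base_url + '/images/0001/0002.jpg'
output_dir = os.path.expanduser('~/lila/lila_test_set')

# Same flattening as the prep cell: strip the base URL, turn path separators
# into underscores, and prefix the lower-cased dataset name
base_url_with_slash = base_url + '/'
output_file_relative = ds_name.lower().replace(' ', '_') + \
    '_' + url.replace(base_url_with_slash, '').replace('/', '_').replace('\\', '_')

print(output_file_relative)
# -> example_camera_traps_images_0001_0002.jpg
print(os.path.join(output_dir, output_file_relative))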
megadetector/data_management/lila/create_links_to_md_results_files.py
@@ -0,0 +1,106 @@
+"""
+
+create_links_to_md_results_files.py
+
+One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+"""
+
+#%% Imports and constants
+
+import os
+
+import pandas as pd
+
+input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+md_results_local_folder = r'g:\temp\lila-md-results'
+md_base_url = 'https://lila.science/public/lila-md-results/'
+assert md_base_url.endswith('/')
+
+# No RDE files for datasets with no location information
+datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+validate_urls = False
+
+
+#%% Read input data
+
+df = pd.read_csv(input_csv_file)
+for s in md_results_column_names:
+    df[s] = ''
+
+
+#%% Find matching files locally, and create URLs
+
+local_files = os.listdir(md_results_local_folder)
+local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+# i_row = 0; row = df.iloc[i_row]
+for i_row,row in df.iterrows():
+
+    if not isinstance(row['name'],str):
+        continue
+
+    dataset_shortname = row['short_name']
+    matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+    # No RDE files for datasets with no location information
+    if dataset_shortname in datasets_without_location_info:
+        assert len(matching_files) == 2
+        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+    else:
+        # Exclude single-season files for snapshot-serengeti
+        if dataset_shortname == 'snapshot-serengeti':
+            matching_files = [fn for fn in matching_files if '_S' not in fn]
+            assert len(matching_files) == 2
+            assert all(['mdv4' in fn for fn in matching_files])
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            raw_files = [fn for fn in matching_files if 'rde' not in fn]
+            assert len(rde_files) == 1 and len(raw_files) == 1
+            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+        else:
+            assert len(matching_files) == 3
+            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+# ...for each row
+
+
+#%% Validate URLs
+
+if validate_urls:
+
+    from megadetector.utils.url_utils import test_urls
+
+    urls = set()
+
+    for i_row,row in df.iterrows():
+        for column_name in md_results_column_names:
+            if len(row[column_name]) > 0:
+                assert row[column_name] not in urls
+                urls.add(row[column_name])
+
+    test_urls(urls,error_on_failure=True)
+
+    print('Validated {} URLs'.format(len(urls)))
+
+
+#%% Write new .csv file
+
+df.to_csv(output_csv_file,header=True,index=False)
megadetector/data_management/lila/download_lila_subset.py
@@ -0,0 +1,182 @@
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
+
+#%% Constants and imports
+
+import os
+import random
+
+from tqdm import tqdm
+from collections import defaultdict
+from copy import deepcopy
+
+from megadetector.data_management.lila.lila_common import \
+    read_lila_all_images_file, is_empty, lila_base_urls
+from megadetector.utils.url_utils import parallel_download_urls
+from megadetector.utils.path_utils import open_file
+
+for s in lila_base_urls.values():
+    assert s.endswith('/')
+
+# If any of these strings appear in the common name of a species, we'll download that image
+# species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']
+species_of_interest = ['bear']
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
+os.makedirs(output_dir,exist_ok=True)
+
+# Number of concurrent download threads
+n_download_threads = 20
+
+max_images_per_dataset = 10 # None
+
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
+
+random.seed(0)
+
+
+#%% Download and open the giant table of image URLs and labels
+
+# Takes ~2 minutes to download, unzip, and open
+df = read_lila_all_images_file(metadata_dir)
+
+
+#%% Find all the images we want to download
+
+# Takes ~2 minutes
+
+common_name_to_count = defaultdict(int)
+
+ds_name_to_urls = defaultdict(list)
+
+def find_items(row): # noqa
+
+    if is_empty(row['common_name']):
+        return
+
+    match = False
+
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
+    for species_name in species_of_interest:
+        if species_name in row['common_name']:
+            match = True
+            common_name_to_count[species_name] += 1
+            break
+
+    if match:
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
+
+tqdm.pandas()
+_ = df.progress_apply(find_items,axis=1)
+
+# We have a list of URLs for each dataset, flatten them all into a list of URLs
+all_urls = list(ds_name_to_urls.values())
+all_urls = [item for sublist in all_urls for item in sublist]
+print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))
+
+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
+ds_name_to_urls_raw = deepcopy(ds_name_to_urls)
+
+
+#%% Optionally trim to a fixed number of URLs per dataset
+
+if max_images_per_dataset is None:
+    pass
+else:
+    # ds_name = next(iter(ds_name_to_urls.keys()))
+    for ds_name in ds_name_to_urls:
+        if len(ds_name_to_urls[ds_name]) > max_images_per_dataset:
+            ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
+
+
+#%% Choose target files for each URL
+
+# We have a list of URLs per dataset, flatten that into a single list of URLs
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
+
+# A URL might look like this:
+#
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
+
+url_to_target_file = {}
+
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
+
+
+#%% Download image files
+
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
+
+
+#%% Open output folder
+
+open_file(output_dir)
+
+
+#%% Scrap
+
+if False:
+
+    pass
+
+    #%% Find all the reptiles on LILA
+
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from megadetector.utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))