megadetector-5.0.7-py3-none-any.whl → megadetector-5.0.9-py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +93 -79
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
- api/batch_processing/postprocessing/compare_batch_results.py +114 -44
- api/batch_processing/postprocessing/convert_output_format.py +62 -19
- api/batch_processing/postprocessing/load_api_results.py +17 -20
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +165 -68
- api/batch_processing/postprocessing/merge_detections.py +40 -15
- api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
- api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +107 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -0
- data_management/coco_to_yolo.py +86 -62
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +130 -83
- data_management/databases/subset_json_db.py +25 -16
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -144
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -160
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +8 -8
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +309 -159
- data_management/labelme_to_yolo.py +103 -60
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +114 -31
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +92 -90
- data_management/lila/generate_lila_per_image_labels.py +56 -43
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +103 -70
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +161 -99
- data_management/remap_coco_categories.py +84 -0
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +32 -44
- data_management/wi_download_csv_to_coco.py +246 -0
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +535 -95
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +189 -114
- detection/run_inference_with_yolov5_val.py +118 -51
- detection/run_tiled_inference.py +113 -42
- detection/tf_detector.py +51 -28
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +249 -70
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -862
- md_utils/path_utils.py +655 -155
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +208 -27
- md_utils/write_html_image_list.py +51 -35
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +908 -311
- md_visualization/visualize_db.py +109 -58
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
- megadetector-5.0.9.dist-info/RECORD +224 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- md_visualization/visualize_megadb.py +0 -183
- megadetector-5.0.7.dist-info/RECORD +0 -202
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
data_management/lila/download_lila_subset.py

@@ -1,17 +1,11 @@
-[old header comment, 8 lines not captured in this view]
-# what you want to query for, etc., is very application-specific; this is just meant as a
-# demo.
-#
-# Can download from either Azure or GCP.
-#
-########
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
 
 #%% Constants and imports
 
@@ -19,16 +13,15 @@ import os
 import random
 
 from tqdm import tqdm
-from multiprocessing.pool import ThreadPool
-from urllib.parse import urlparse
 from collections import defaultdict
 
-from data_management.lila.lila_common import [... truncated in this view]
-[2 lines not captured in this view]
+from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
+
+for s in lila_base_urls.values():
+    assert s.endswith('/')
 
 # If any of these strings appear in the common name of a species, we'll download that image
-species_of_interest = ['grey fox','[... truncated in this view]
+species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -44,24 +37,22 @@ n_download_threads = 20
 
 max_images_per_dataset = 10 # None
 
-
-#
-# "Azure" really means "Azure if available"; recent datasets are only available
-# on GCP.
-image_download_source = 'azure' # 'azure' or 'gcp'
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
 
 random.seed(0)
 
 
 #%% Download and open the giant table of image URLs and labels
 
-# ~60 seconds to download, unzip, and open
+# Takes ~60 seconds to download, unzip, and open
 df = read_lila_all_images_file(metadata_dir)
 
 
 #%% Find all the images we want to download
 
-# ~2 minutes
+# Takes ~2 minutes
+
+common_name_to_count = defaultdict(int)
 
 ds_name_to_urls = defaultdict(list)
 
@@ -72,26 +63,33 @@ def find_items(row):
 
     match = False
 
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
    for species_name in species_of_interest:
        if species_name in row['common_name']:
            match = True
+            common_name_to_count[species_name] += 1
            break
 
    if match:
-        ds_name_to_urls[row['dataset_name']].append(row['[... truncated in this view]
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
 
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)
 
+# We have a list of URLs for each dataset, flatten them all into a list of URLs
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]
 print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))
 
+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
 from copy import deepcopy
 ds_name_to_urls_raw = deepcopy(ds_name_to_urls)
 
 
-#%%
+#%% Optionally trim to a fixed number of URLs per dataset
 
 if max_images_per_dataset is None:
     pass
@@ -102,74 +100,78 @@ else:
         ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
 
 
-#%%
+#%% Choose target files for each URL
 
-[1 line not captured in this view]
-    'lilablobssc.blob.core.windows.net':'/',
-    'storage.googleapis.com':'/public-datasets-lila/'
-    }
+from data_management.lila.lila_common import lila_base_urls
 
-[7 lines not captured in this view]
-    if url_base is None:
-        assert url.startswith('https://')
-        container = url.split('/')[2]
-        assert container in container_to_url_base
-        url_base = container_to_url_base[container]
-
-    assert url_base.startswith('/') and url_base.endswith('/')
-
-    p = urlparse(url)
-    relative_filename = str(p.path)
-    # remove the leading '/'
-    assert relative_filename.startswith(url_base)
-    relative_filename = relative_filename.replace(url_base,'',1)
-
-    destination_filename = os.path.join(output_base,relative_filename)
-    result['destination_filename'] = destination_filename
-
-    if ((os.path.isfile(destination_filename)) and (not overwrite)):
-        result['status'] = 'skipped'
-        return result
-    try:
-        download_url(url, destination_filename, verbose=verbose)
-    except Exception as e:
-        print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
-        result['status'] = 'error: {}'.format(str(e))
-        return result
-
-    result['status'] = 'success'
-    return result
+# We have a list of URLs per dataset, flatten that into a single list of URLs
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
 
+# A URL might look like this:
+#
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
 
-
-all_urls = list(ds_name_to_urls.values())
-all_urls = [item for sublist in all_urls for item in sublist]
+url_to_target_file = {}
 
-[4 lines not captured in this view]
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
 
-print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
 
-
+#%% Download image files
 
-[5 lines not captured in this view]
+from md_utils.url_utils import parallel_download_urls
+
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
+
+
+#%% Scrap
+
+if False:
 
-[1 line not captured in this view]
+    pass
+
+    #%% Find all the reptiles on LILA
 
-[4 lines not captured in this view]
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from md_utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))
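
The net effect of these download_lila_subset.py changes: the cloud provider is now configurable ('gcp', 'azure', or 'aws'), and the hand-rolled ThreadPool/urlparse download function is replaced by the shared md_utils.url_utils.parallel_download_urls helper. Below is a minimal, self-contained sketch of the new URL-to-target-file mapping; the lila_base_urls table and sample URL are stand-ins copied from this diff rather than live library calls, and urlretrieve stands in for the real parallel download helper:

import os
from urllib.request import urlretrieve

# Hypothetical stand-ins for lila_common.lila_base_urls and a real query result
lila_base_urls = {'gcp': 'https://storage.googleapis.com/public-datasets-lila/'}
preferred_provider = 'gcp'
output_dir = os.path.expanduser('~/lila/downloads')

urls_to_download = [
    'https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg',
]

# Map each URL to a local path by stripping the provider's base URL,
# mirroring the "Choose target files for each URL" cell above
base_url = lila_base_urls[preferred_provider]
url_to_target_file = {}
for url in urls_to_download:
    assert url.startswith(base_url)
    url_to_target_file[url] = os.path.join(output_dir, url.replace(base_url, ''))

# The real script hands this dict to md_utils.url_utils.parallel_download_urls;
# urlretrieve keeps the sketch dependency-free
for url, target in url_to_target_file.items():
    os.makedirs(os.path.dirname(target), exist_ok=True)
    urlretrieve(url, target)

Sorting the deduplicated URL set before downloading makes reruns deterministic, which pairs well with overwrite=False skipping files that already exist.
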
data_management/lila/generate_lila_per_image_labels.py

@@ -1,19 +1,19 @@
-[old header comment, 16 lines not captured in this view]
+"""
+
+generate_lila_per_image_labels.py
+
+Generate a .csv file with one row per annotation, containing full URLs to every
+camera trap image on LILA, with taxonomically expanded labels.
+
+Typically there will be one row per image, though images with multiple annotations
+will have multiple rows.
+
+Some images may not physically exist, particularly images that are labeled as "human".
+This script does not validate image URLs.
+
+Does not include bounding box annotations.
+
+"""
 
 #%% Constants and imports
 
@@ -23,8 +23,6 @@ import pandas as pd
 import numpy as np
 import dateparser
 import csv
-import urllib
-import urllib.request
 
 from collections import defaultdict
 from tqdm import tqdm
@@ -36,7 +34,6 @@ from data_management.lila.lila_common import read_lila_metadata, \
 from md_utils import write_html_image_list
 from md_utils.path_utils import zip_file
 from md_utils.path_utils import open_file
-from md_utils.url_utils import download_url
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -107,12 +104,15 @@ for i_row,row in taxonomy_df.iterrows():
 
 # Takes several hours
 
-[2 lines not captured in this view]
+# The order of these headers needs to match the order in which fields are added later in this cell;
+# don't mess with this order.
+header = ['dataset_name','url_gcp','url_aws','url_azure',
+          'image_id','sequence_id','location_id','frame_num',
+          'original_label','scientific_name','common_name','datetime','annotation_level']
 
 taxonomy_levels_to_include = \
    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies'
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
     'variety']
 
 header.extend(taxonomy_levels_to_include)
@@ -179,10 +179,17 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
            break
 
        file_name = im['file_name'].replace('\\','/')
-[3 lines not captured in this view]
+        base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
+        base_url_aws = metadata_table[ds_name]['image_base_url_aws']
+        base_url_azure = metadata_table[ds_name]['image_base_url_azure']
+        assert not base_url_gcp.endswith('/')
+        assert not base_url_aws.endswith('/')
+        assert not base_url_azure.endswith('/')
 
+        url_gcp = base_url_gcp + '/' + file_name
+        url_aws = base_url_aws + '/' + file_name
+        url_azure = base_url_azure + '/' + file_name
+
        for k in im.keys():
            if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                raise ValueError('Unrecognized datetime field')
@@ -297,7 +304,9 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
 
        row = []
        row.append(ds_name)
-        row.append([... truncated in this view]
+        row.append(url_gcp)
+        row.append(url_aws)
+        row.append(url_azure)
        row.append(image_id)
        row.append(sequence_id)
        row.append(location_id)
@@ -338,7 +347,7 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
 
 # ...with open()
 
-print('Processed {} [... truncated in this view]
+print('Processed {} datasets'.format(len(metadata_table)))
 
 
 #%% Read the .csv back
@@ -365,7 +374,8 @@ dataset_name_to_locations = defaultdict(set)
 def check_row(row):
 
    assert row['dataset_name'] in metadata_table.keys()
-[1 line not captured in this view]
+    for url_column in ['url_gcp','url_aws','url_azure']:
+        assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
    assert ' : ' in row['image_id']
    assert 'seq' not in row['location_id'].lower()
    assert row['annotation_level'] in valid_annotation_levels
@@ -446,28 +456,31 @@ for ds_name in metadata_table.keys():
 print('Selected {} total images'.format(len(images_to_download)))
 
 
-#%% Download images
+#%% Download images (prep)
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-[2 lines not captured in this view]
+preferred_cloud = 'aws'
+
+url_to_target_file = {}
+
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
 
-    url = image['[... truncated in this view]
+    url = image['url_' + preferred_cloud]
    ext = os.path.splitext(url)[1]
-[8 lines not captured in this view]
-    image['relative_file'] = None
+    fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
+    fn_abs = os.path.join(preview_folder,fn_relative)
+    image['relative_file'] = fn_relative
+    image['url'] = url
+    url_to_target_file[url] = fn_abs
+
+
+#%% Download images (execution)
 
-[1 line not captured in this view]
+from md_utils.url_utils import parallel_download_urls
+download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
+                                          n_workers=20,pool_type='thread')
 
 
 #%% Write preview HTML
@@ -499,4 +512,4 @@ open_file(html_filename)
 
 zipped_output_file = zip_file(output_file,verbose=True)
 
-print('Zipped {} to {}'.format(output_file,zipped_output_file))
\ No newline at end of file
+print('Zipped {} to {}'.format(output_file,zipped_output_file))
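
The substantive change in generate_lila_per_image_labels.py is that the output .csv now carries three URL columns (url_gcp, url_aws, url_azure) instead of one, each built by joining a per-provider base URL with the image's relative path. A sketch of that construction, using a hypothetical metadata_table entry in place of the one returned by read_lila_metadata (only the GCP and Azure hosts below match hosts that appear in this diff; the AWS URL is illustrative):

# Hypothetical dataset entry; real values come from read_lila_metadata()
metadata_table = {
    'example-dataset': {
        'image_base_url_gcp': 'https://storage.googleapis.com/public-datasets-lila/example-dataset',
        'image_base_url_aws': 'https://example-bucket.s3.amazonaws.com/example-dataset',
        'image_base_url_azure': 'https://lilablobssc.blob.core.windows.net/example-dataset',
    }
}

ds_name = 'example-dataset'
file_name = 'animals/0001/0001.jpg'

row_urls = {}
for provider in ('gcp', 'aws', 'azure'):
    base_url = metadata_table[ds_name]['image_base_url_' + provider]
    # The script asserts this invariant before joining with '/'
    assert not base_url.endswith('/')
    row_urls['url_' + provider] = base_url + '/' + file_name

# check_row() later verifies that every URL column starts with http(s)://
for url in row_urls.values():
    assert url.startswith('https://') or url.startswith('http://')

Keeping all three columns lets downstream consumers pick whichever cloud is closest or cheapest, which is why check_row() now validates all three rather than a single URL field.
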
data_management/lila/get_lila_annotation_counts.py

@@ -1,16 +1,16 @@
-[old header comment, 13 lines not captured in this view]
+"""
+
+get_lila_annotation_counts.py
+
+Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+that exist for that dataset, with counts for the number of occurrences of each category
+(the number of *annotations* for each category, not the number of *images*).
+
+Also loads the taxonomy mapping file, to include scientific names for each category.
+
+get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+"""
 
 #%% Constants and imports
 
@@ -20,6 +20,9 @@ import os
 from data_management.lila.lila_common import read_lila_metadata,\
    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
 
+# cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+preferred_cloud = 'gcp'
+
 # array to fill for output
 category_list = []
 
@@ -96,9 +99,9 @@ for ds_name in metadata_table.keys():
        print('Warning: taxonomy mapping not available for {}'.format(ds_name))
 
    print('Finding categories in {}'.format(ds_name))
-
+
    json_filename = metadata_table[ds_name]['json_filename']
-    base_url = metadata_table[ds_name]['[... truncated in this view]
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
    assert not base_url.endswith('/')
 
    # Open the metadata file
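
As the new docstring says, this script counts *annotations* per category, while its sibling get_lila_image_counts.py counts *images*. A minimal sketch of annotation counting over a COCO Camera Traps metadata dict, with a hypothetical in-memory example standing in for the metadata file fetched via read_metadata_file_for_dataset:

from collections import defaultdict

# Hypothetical COCO Camera Traps metadata; the real dict is loaded from json_filename
data = {
    'categories': [{'id': 0, 'name': 'empty'}, {'id': 1, 'name': 'deer'}],
    'annotations': [
        {'image_id': 'a', 'category_id': 1},
        {'image_id': 'a', 'category_id': 1},
        {'image_id': 'b', 'category_id': 0},
    ],
}

cat_id_to_name = {c['id']: c['name'] for c in data['categories']}
category_to_annotation_count = defaultdict(int)
for ann in data['annotations']:
    category_to_annotation_count[cat_id_to_name[ann['category_id']]] += 1

# 'deer' counts 2 here even though only one *image* contains deer;
# that's exactly the annotation-vs-image distinction the docstring calls out
print(dict(category_to_annotation_count))  # {'deer': 2, 'empty': 1}
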
data_management/lila/get_lila_image_counts.py

@@ -1,14 +1,14 @@
-[old header comment, 11 lines not captured in this view]
+"""
+
+get_lila_image_counts.py
+
+Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+This script doesn't write these counts out anywhere other than the console, it's just intended
+as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+"""
 
 #%% Constants and imports
 
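
Only the docstring change is visible for this last file; the rest of its diff does not appear in this view. For contrast with get_lila_annotation_counts.py, here is a sketch of counting *images* per category, where each image is counted once per category regardless of how many annotations it carries (same hypothetical metadata shape as in the previous example):

from collections import defaultdict

# Hypothetical COCO Camera Traps metadata, as above
data = {
    'categories': [{'id': 0, 'name': 'empty'}, {'id': 1, 'name': 'deer'}],
    'annotations': [
        {'image_id': 'a', 'category_id': 1},
        {'image_id': 'a', 'category_id': 1},
        {'image_id': 'b', 'category_id': 0},
    ],
}

cat_id_to_name = {c['id']: c['name'] for c in data['categories']}
category_to_image_ids = defaultdict(set)
for ann in data['annotations']:
    # A set deduplicates repeated annotations on the same image
    category_to_image_ids[cat_id_to_name[ann['category_id']]].add(ann['image_id'])

category_to_image_count = {name: len(ids) for name, ids in category_to_image_ids.items()}
print(category_to_image_count)  # {'deer': 1, 'empty': 1}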