megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +93 -79
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
- api/batch_processing/postprocessing/compare_batch_results.py +114 -44
- api/batch_processing/postprocessing/convert_output_format.py +62 -19
- api/batch_processing/postprocessing/load_api_results.py +17 -20
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +165 -68
- api/batch_processing/postprocessing/merge_detections.py +40 -15
- api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
- api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +107 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -0
- data_management/coco_to_yolo.py +86 -62
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +130 -83
- data_management/databases/subset_json_db.py +25 -16
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -144
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -160
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +8 -8
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +309 -159
- data_management/labelme_to_yolo.py +103 -60
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +114 -31
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +92 -90
- data_management/lila/generate_lila_per_image_labels.py +56 -43
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +103 -70
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +161 -99
- data_management/remap_coco_categories.py +84 -0
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +32 -44
- data_management/wi_download_csv_to_coco.py +246 -0
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +535 -95
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +189 -114
- detection/run_inference_with_yolov5_val.py +118 -51
- detection/run_tiled_inference.py +113 -42
- detection/tf_detector.py +51 -28
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +249 -70
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -862
- md_utils/path_utils.py +655 -155
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +208 -27
- md_utils/write_html_image_list.py +51 -35
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +908 -311
- md_visualization/visualize_db.py +109 -58
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
- megadetector-5.0.9.dist-info/RECORD +224 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- md_visualization/visualize_megadb.py +0 -183
- megadetector-5.0.7.dist-info/RECORD +0 -202
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
data_management/lila/lila_common.py (+103 -70)

```diff
@@ -1,10 +1,10 @@
-
-
-
-
-
-
-
+"""
+
+lila_common.py
+
+Common constants and functions related to LILA data management/retrieval.
+
+"""

 #%% Imports and constants

@@ -12,12 +12,12 @@ import os
 import json
 import zipfile
 import pandas as pd
-import numpy as np

 from urllib.parse import urlparse

 from md_utils.url_utils import download_url
 from md_utils.path_utils import unzip_file
+from md_utils.ct_utils import is_empty

 # LILA camera trap primary metadata file
 lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
@@ -31,9 +31,21 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
 wildlife_insights_taxonomy_local_csv_filename = \
     wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

-
-
-
+# Filenames are consistent across clouds relative to these URLs
+lila_base_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+}
+
+lila_cloud_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'gs://public-datasets-lila/',
+    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+}
+
+for url in lila_base_urls.values():
+    assert url.endswith('/')


 #%% Common functions
```
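The `lila_base_urls`/`lila_cloud_urls` pair generalizes the Azure-to-GCP URL converters that are removed later in this file (see the hunk at old lines 198-225). As a minimal sketch, not part of the package, of the same conversion built on the new constants, assuming only that relative paths are identical across clouds (per the comment above); the relative path below is hypothetical:

```python
from data_management.lila.lila_common import lila_base_urls, lila_cloud_urls

def http_to_cloud_uri(url, cloud='gcp'):
    # Swap the public http(s) base for the same cloud's native URI scheme;
    # only the prefix changes, since relative paths match across clouds
    assert url.startswith(lila_base_urls[cloud])
    return url.replace(lila_base_urls[cloud], lila_cloud_urls[cloud], 1)

# Hypothetical relative path, for illustration only
print(http_to_cloud_uri(lila_base_urls['gcp'] + 'example-dataset/image_0001.jpg'))
# gs://public-datasets-lila/example-dataset/image_0001.jpg
```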
```diff
@@ -42,7 +54,11 @@ def read_wildlife_insights_taxonomy_mapping(metadata_dir):
     """
     Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.

-
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.dataframe: A DataFrame with taxonomy information
     """

     wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
@@ -81,7 +97,11 @@ def read_lila_taxonomy_mapping(metadata_dir):
     """
     Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.

-
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with one row per identification
     """

     p = urlparse(lila_taxonomy_mapping_url)
@@ -93,24 +113,38 @@ def read_lila_taxonomy_mapping(metadata_dir):
     return df


-def is_empty(v):
-    if v is None:
-        return True
-    if isinstance(v,str) and v == '':
-        return True
-    if isinstance(v,float) and np.isnan(v):
-        return True
-    return False
-
-
 def read_lila_metadata(metadata_dir):
     """
-    Reads LILA metadata (URLs to each dataset), downloading the
+    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.

-
-
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+        with keys corresponding to the headers in the .csv file, currently:

-
+        - name
+        - short_name
+        - continent
+        - country
+        - region
+        - image_base_url_relative
+        - metadata_url_relative
+        - bbox_url_relative
+        - image_base_url_gcp
+        - metadata_url_gcp
+        - bbox_url_gcp
+        - image_base_url_aws
+        - metadata_url_aws
+        - bbox_url_aws
+        - image_base_url_azure
+        - metadata_url_azure
+        - box_url_azure
+        - mdv4_results_raw
+        - mdv5b_results_raw
+        - md_results_with_rde
+        - json_filename
     """

     # Put the master metadata file in the same folder where we're putting images
```
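The `is_empty` helper removed in the hunk above was relocated rather than deleted: the first hunk of this file adds `from md_utils.ct_utils import is_empty`. A quick sketch (not from the package) of equivalent usage after the move, assuming the relocated helper keeps the behavior visible in the removed code:

```python
from md_utils.ct_utils import is_empty

# Per the removed implementation: None, the empty string, and NaN are "empty"
assert is_empty(None)
assert is_empty('')
assert is_empty(float('nan'))
assert not is_empty('cat')
```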
```diff
@@ -144,6 +178,12 @@ def read_lila_all_images_file(metadata_dir):
     """
     Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
     all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
     """

     p = urlparse(lila_all_images_url)
@@ -165,18 +205,37 @@ def read_lila_all_images_file(metadata_dir):
     return df


-def read_metadata_file_for_dataset(ds_name,
+def read_metadata_file_for_dataset(ds_name,
+                                   metadata_dir,
+                                   metadata_table=None,
+                                   json_url=None,
+                                   preferred_cloud='gcp'):
     """
     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
-
+
+    Args:
+        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+            "Caltech Camera Traps")
+        metadata_dir (str): folder to use for temporary LILA metadata files
+        metadata_table (dict, optional): an optional dictionary already loaded via
+            read_lila_metadata()
+        json_url (str, optional): the URL of the metadata file, if None will be retrieved
+            via read_lila_metadata()
+        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+
+    Returns:
+        str: the .json filename on the local disk
+
     """

+    assert preferred_cloud in lila_base_urls.keys()
+
     if json_url is None:

         if metadata_table is None:
             metadata_table = read_lila_metadata(metadata_dir)

-        json_url = metadata_table[ds_name]['
+        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]

     p = urlparse(json_url)
     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
```
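`read_metadata_file_for_dataset` now takes a `preferred_cloud` argument and builds the metadata URL from the per-cloud columns of the metadata table. A minimal usage sketch (mine, not from the package), with an illustrative metadata directory; "Caltech Camera Traps" is the example dataset name used in the docstrings above:

```python
import os

from data_management.lila.lila_common import read_lila_metadata, \
    read_metadata_file_for_dataset

metadata_dir = os.path.expanduser('~/lila/metadata')  # illustrative path
os.makedirs(metadata_dir, exist_ok=True)

metadata_table = read_lila_metadata(metadata_dir)

# New in 5.0.9: choose which cloud's copy of the metadata file to download
json_filename = read_metadata_file_for_dataset('Caltech Camera Traps',
                                               metadata_dir,
                                               metadata_table=metadata_table,
                                               preferred_cloud='gcp')
print(json_filename)  # the .json filename on the local disk
```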
```diff
@@ -198,28 +257,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json
     return json_filename


-def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP http:// url.
-    """
-
-    if error_if_not_azure_url:
-        assert url.startswith(lila_azure_storage_account)
-    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-    return gcp_url
-
-
-def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP gs:// url.
-    """
-
-    return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
-        replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
-
-
 #%% Interactive test driver

 if False:
@@ -233,7 +270,8 @@ if False:

     from md_utils import url_utils

-    status_codes = url_utils.test_urls(urls)
+    status_codes = url_utils.test_urls(urls,timeout=2.0)
+    assert all([code == 200 for code in status_codes])


     #%% Verify that the metadata URLs exist for individual datasets
@@ -243,25 +281,20 @@ if False:
     dataset_metadata = read_lila_metadata(metadata_dir)

     urls_to_test = []
+
     # ds_name = next(iter(dataset_metadata.keys()))
     for ds_name in dataset_metadata.keys():

         ds_info = dataset_metadata[ds_name]
-
-
-
+        for cloud_name in lila_base_urls.keys():
+            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+            if ds_info['bbox_url_relative'] != None:
+                urls_to_test.append(ds_info['bbox_url_' + cloud_name])

-    status_codes = url_utils.test_urls(urls_to_test
-
-
-
-
-
-
-    # url = urls_to_test[0]
-    for url in urls_to_test:
-        assert url.startswith(lila_azure_storage_account)
-        gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-        gcp_urls.append(gcp_url)
-
-    status_codes = url_utils.test_urls(gcp_urls)
+    status_codes = url_utils.test_urls(urls_to_test,
+                                       error_on_failure=True,
+                                       n_workers=10,
+                                       pool_type='process',
+                                       timeout=2.0)
+    assert all([code == 200 for code in status_codes])
+
```
data_management/lila/test_lila_metadata_urls.py (+132 -116)

This script was rewritten wholesale; the old side of the diff survives only as fragments in this view, so only the new version is reproduced.

```diff
@@ -1,116 +1,132 @@
+"""
+
+test_lila_metadata_urls.py
+
+Test that all the metadata URLs for LILA camera trap datasets are valid, including MegaDetector
+results files.
+
+Also pick an arbitrary image from each dataset and make sure that URL is valid.
+
+Also picks an arbitrary image from each dataset's MD results and make sure the corresponding URL is valid.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+preferred_cloud = 'gcp' # 'azure', 'aws'
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+# Takes ~15 mins, since it has to open all the giant .json files
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404),
+# so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not image_base_url.endswith('/')
+    # Download a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # Grab an image from the MegaDetector results
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+            del md_results
+    del data
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from md_utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,
+                         error_on_failure=False,
+                         pool_type='thread',
+                         n_workers=10,
+                         timeout=2.0)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))
```