megadetector 5.0.5__py3-none-any.whl → 5.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- api/batch_processing/data_preparation/manage_local_batch.py +302 -263
- api/batch_processing/data_preparation/manage_video_batch.py +81 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
- api/batch_processing/postprocessing/compare_batch_results.py +110 -60
- api/batch_processing/postprocessing/load_api_results.py +56 -70
- api/batch_processing/postprocessing/md_to_coco.py +1 -1
- api/batch_processing/postprocessing/md_to_labelme.py +2 -1
- api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
- classification/prepare_classification_script.py +191 -191
- data_management/coco_to_yolo.py +68 -45
- data_management/databases/integrity_check_json_db.py +7 -5
- data_management/generate_crops_from_cct.py +3 -3
- data_management/get_image_sizes.py +8 -6
- data_management/importers/add_timestamps_to_icct.py +79 -0
- data_management/importers/animl_results_to_md_results.py +160 -0
- data_management/importers/auckland_doc_test_to_json.py +4 -4
- data_management/importers/auckland_doc_to_json.py +1 -1
- data_management/importers/awc_to_json.py +5 -5
- data_management/importers/bellevue_to_json.py +5 -5
- data_management/importers/carrizo_shrubfree_2018.py +5 -5
- data_management/importers/carrizo_trail_cam_2017.py +5 -5
- data_management/importers/cct_field_adjustments.py +2 -3
- data_management/importers/channel_islands_to_cct.py +4 -4
- data_management/importers/ena24_to_json.py +5 -5
- data_management/importers/helena_to_cct.py +10 -10
- data_management/importers/idaho-camera-traps.py +12 -12
- data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
- data_management/importers/jb_csv_to_json.py +4 -4
- data_management/importers/missouri_to_json.py +1 -1
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/importers/pc_to_json.py +5 -5
- data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
- data_management/importers/prepare_zsl_imerit.py +5 -5
- data_management/importers/rspb_to_json.py +4 -4
- data_management/importers/save_the_elephants_survey_A.py +5 -5
- data_management/importers/save_the_elephants_survey_B.py +6 -6
- data_management/importers/snapshot_safari_importer.py +9 -9
- data_management/importers/snapshot_serengeti_lila.py +9 -9
- data_management/importers/timelapse_csv_set_to_json.py +5 -7
- data_management/importers/ubc_to_json.py +4 -4
- data_management/importers/umn_to_json.py +4 -4
- data_management/importers/wellington_to_json.py +1 -1
- data_management/importers/wi_to_json.py +2 -2
- data_management/importers/zamba_results_to_md_results.py +181 -0
- data_management/labelme_to_coco.py +35 -7
- data_management/labelme_to_yolo.py +229 -0
- data_management/lila/add_locations_to_island_camera_traps.py +1 -1
- data_management/lila/add_locations_to_nacti.py +147 -0
- data_management/lila/create_lila_blank_set.py +474 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +106 -0
- data_management/lila/download_lila_subset.py +46 -21
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +17 -11
- data_management/lila/lila_common.py +14 -11
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/ocr_tools.py +829 -0
- data_management/resize_coco_dataset.py +13 -11
- data_management/yolo_output_to_md_output.py +84 -12
- data_management/yolo_to_coco.py +38 -20
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +23 -8
- detection/run_detector.py +76 -19
- detection/run_detector_batch.py +178 -63
- detection/run_inference_with_yolov5_val.py +326 -57
- detection/run_tiled_inference.py +153 -43
- detection/video_utils.py +34 -8
- md_utils/ct_utils.py +172 -1
- md_utils/md_tests.py +372 -51
- md_utils/path_utils.py +167 -39
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +0 -2
- md_utils/write_html_image_list.py +9 -26
- md_visualization/plot_utils.py +12 -8
- md_visualization/visualization_utils.py +106 -7
- md_visualization/visualize_db.py +16 -8
- md_visualization/visualize_detector_output.py +208 -97
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- api/synchronous/api_core/yolov5/detect.py +0 -252
- api/synchronous/api_core/yolov5/export.py +0 -607
- api/synchronous/api_core/yolov5/hubconf.py +0 -146
- api/synchronous/api_core/yolov5/models/__init__.py +0 -0
- api/synchronous/api_core/yolov5/models/common.py +0 -738
- api/synchronous/api_core/yolov5/models/experimental.py +0 -104
- api/synchronous/api_core/yolov5/models/tf.py +0 -574
- api/synchronous/api_core/yolov5/models/yolo.py +0 -338
- api/synchronous/api_core/yolov5/train.py +0 -670
- api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
- api/synchronous/api_core/yolov5/utils/activations.py +0 -103
- api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
- api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
- api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
- api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
- api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
- api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
- api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
- api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
- api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
- api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
- api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
- api/synchronous/api_core/yolov5/utils/general.py +0 -1018
- api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
- api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
- api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
- api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
- api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
- api/synchronous/api_core/yolov5/utils/loss.py +0 -234
- api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
- api/synchronous/api_core/yolov5/utils/plots.py +0 -489
- api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
- api/synchronous/api_core/yolov5/val.py +0 -394
- md_utils/matlab_porting_tools.py +0 -97
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
- {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
|
@@ -24,11 +24,11 @@ from urllib.parse import urlparse
|
|
|
24
24
|
from collections import defaultdict
|
|
25
25
|
|
|
26
26
|
from data_management.lila.lila_common import \
|
|
27
|
-
read_lila_all_images_file,
|
|
27
|
+
read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
|
|
28
28
|
from md_utils.url_utils import download_url
|
|
29
29
|
|
|
30
30
|
# If any of these strings appear in the common name of a species, we'll download that image
|
|
31
|
-
species_of_interest = ['grey fox','red fox','leopard cat']
|
|
31
|
+
species_of_interest = ['grey fox','red fox','leopard cat','kiwi']
|
|
32
32
|
|
|
33
33
|
# We'll write images, metadata downloads, and temporary files here
|
|
34
34
|
lila_local_base = os.path.expanduser('~/lila')
|
|
@@ -40,30 +40,28 @@ output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
|
|
|
40
40
|
os.makedirs(output_dir,exist_ok=True)
|
|
41
41
|
|
|
42
42
|
# Number of concurrent download threads
|
|
43
|
-
n_download_threads =
|
|
43
|
+
n_download_threads = 20
|
|
44
44
|
|
|
45
45
|
max_images_per_dataset = 10 # None
|
|
46
46
|
|
|
47
47
|
# This impacts the data download, but not the metadata download
|
|
48
|
+
#
|
|
49
|
+
# "Azure" really means "Azure if available"; recent datasets are only available
|
|
50
|
+
# on GCP.
|
|
48
51
|
image_download_source = 'azure' # 'azure' or 'gcp'
|
|
49
52
|
|
|
50
53
|
random.seed(0)
|
|
51
54
|
|
|
52
55
|
|
|
53
|
-
#%% Download and open the giant table of image
|
|
56
|
+
#%% Download and open the giant table of image URLs and labels
|
|
54
57
|
|
|
55
|
-
#
|
|
58
|
+
# ~60 seconds to download, unzip, and open
|
|
56
59
|
df = read_lila_all_images_file(metadata_dir)
|
|
57
60
|
|
|
58
61
|
|
|
59
|
-
#%% Download and parse the metadata file
|
|
60
|
-
|
|
61
|
-
metadata_table = read_lila_metadata(metadata_dir)
|
|
62
|
-
|
|
63
|
-
|
|
64
62
|
#%% Find all the images we want to download
|
|
65
63
|
|
|
66
|
-
#
|
|
64
|
+
# ~2 minutes
|
|
67
65
|
|
|
68
66
|
ds_name_to_urls = defaultdict(list)
|
|
69
67
|
|
|
@@ -106,13 +104,24 @@ else:
|
|
|
106
104
|
|
|
107
105
|
#%% Download those image files
|
|
108
106
|
|
|
109
|
-
|
|
107
|
+
container_to_url_base = {
|
|
108
|
+
'lilablobssc.blob.core.windows.net':'/',
|
|
109
|
+
'storage.googleapis.com':'/public-datasets-lila/'
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
|
|
110
113
|
"""
|
|
111
114
|
Download a URL to output_base, preserving relative path
|
|
112
115
|
"""
|
|
113
116
|
|
|
117
|
+
result = {'status':'unknown','url':url,'destination_filename':None}
|
|
118
|
+
|
|
114
119
|
if url_base is None:
|
|
115
|
-
|
|
120
|
+
assert url.startswith('https://')
|
|
121
|
+
container = url.split('/')[2]
|
|
122
|
+
assert container in container_to_url_base
|
|
123
|
+
url_base = container_to_url_base[container]
|
|
124
|
+
|
|
116
125
|
assert url_base.startswith('/') and url_base.endswith('/')
|
|
117
126
|
|
|
118
127
|
p = urlparse(url)
|
|
@@ -122,29 +131,45 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None):
|
|
|
122
131
|
relative_filename = relative_filename.replace(url_base,'',1)
|
|
123
132
|
|
|
124
133
|
destination_filename = os.path.join(output_base,relative_filename)
|
|
125
|
-
|
|
134
|
+
result['destination_filename'] = destination_filename
|
|
135
|
+
|
|
136
|
+
if ((os.path.isfile(destination_filename)) and (not overwrite)):
|
|
137
|
+
result['status'] = 'skipped'
|
|
138
|
+
return result
|
|
139
|
+
try:
|
|
140
|
+
download_url(url, destination_filename, verbose=verbose)
|
|
141
|
+
except Exception as e:
|
|
142
|
+
print('Warning: error downloading URL {}: {}'.format(
|
|
143
|
+
url,str(e)))
|
|
144
|
+
result['status'] = 'error: {}'.format(str(e))
|
|
145
|
+
return result
|
|
126
146
|
|
|
147
|
+
result['status'] = 'success'
|
|
148
|
+
return result
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
|
|
127
152
|
all_urls = list(ds_name_to_urls.values())
|
|
128
153
|
all_urls = [item for sublist in all_urls for item in sublist]
|
|
129
154
|
|
|
130
|
-
url_base = '/'
|
|
131
|
-
|
|
132
155
|
# Convert Azure URLs to GCP URLs if necessary
|
|
133
156
|
if image_download_source != 'azure':
|
|
134
157
|
assert image_download_source == 'gcp'
|
|
135
|
-
url_base = '/public-datasets-lila/'
|
|
136
158
|
all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
|
|
137
159
|
|
|
138
|
-
print('Downloading {} images
|
|
160
|
+
print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
|
|
139
161
|
|
|
140
162
|
if n_download_threads <= 1:
|
|
141
163
|
|
|
164
|
+
results = []
|
|
165
|
+
|
|
142
166
|
# url = all_urls[0]
|
|
143
167
|
for url in tqdm(all_urls):
|
|
144
|
-
download_relative_filename(url,output_dir,
|
|
168
|
+
results.append(download_relative_filename(url,output_dir,url_base=None))
|
|
145
169
|
|
|
146
170
|
else:
|
|
147
171
|
|
|
148
172
|
pool = ThreadPool(n_download_threads)
|
|
149
|
-
tqdm(pool.imap(lambda s: download_relative_filename(
|
|
150
|
-
|
|
173
|
+
results = list(tqdm(pool.imap(lambda s: download_relative_filename(
|
|
174
|
+
s,output_dir,url_base=None),
|
|
175
|
+
all_urls), total=len(all_urls)))
|
|
@@ -22,6 +22,9 @@ import json
|
|
|
22
22
|
import pandas as pd
|
|
23
23
|
import numpy as np
|
|
24
24
|
import dateparser
|
|
25
|
+
import csv
|
|
26
|
+
import urllib
|
|
27
|
+
import urllib.request
|
|
25
28
|
|
|
26
29
|
from collections import defaultdict
|
|
27
30
|
from tqdm import tqdm
|
|
@@ -30,6 +33,9 @@ from data_management.lila.lila_common import read_lila_metadata, \
|
|
|
30
33
|
read_metadata_file_for_dataset, \
|
|
31
34
|
read_lila_taxonomy_mapping
|
|
32
35
|
|
|
36
|
+
from md_utils import write_html_image_list
|
|
37
|
+
from md_utils.path_utils import zip_file
|
|
38
|
+
from md_utils.path_utils import open_file
|
|
33
39
|
from md_utils.url_utils import download_url
|
|
34
40
|
|
|
35
41
|
# We'll write images, metadata downloads, and temporary files here
|
|
@@ -56,7 +62,7 @@ ds_name_to_annotation_level['NACTI'] = 'unknown'
|
|
|
56
62
|
|
|
57
63
|
known_unmapped_labels = set(['WCS Camera Traps:#ref!'])
|
|
58
64
|
|
|
59
|
-
debug_max_images_per_dataset =
|
|
65
|
+
debug_max_images_per_dataset = -1
|
|
60
66
|
if debug_max_images_per_dataset > 0:
|
|
61
67
|
print('Running in debug mode')
|
|
62
68
|
output_file = output_file.replace('.csv','_debug.csv')
|
|
@@ -72,7 +78,7 @@ if False:
|
|
|
72
78
|
metadata_table = {k:metadata_table[k]}
|
|
73
79
|
|
|
74
80
|
|
|
75
|
-
#%% Download and extract metadata for
|
|
81
|
+
#%% Download and extract metadata for each dataset
|
|
76
82
|
|
|
77
83
|
for ds_name in metadata_table.keys():
|
|
78
84
|
metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
|
|
@@ -101,8 +107,6 @@ for i_row,row in taxonomy_df.iterrows():
|
|
|
101
107
|
|
|
102
108
|
# Takes several hours
|
|
103
109
|
|
|
104
|
-
import csv
|
|
105
|
-
|
|
106
110
|
header = ['dataset_name','url','image_id','sequence_id','location_id','frame_num','original_label',\
|
|
107
111
|
'scientific_name','common_name','datetime','annotation_level']
|
|
108
112
|
|
|
@@ -122,7 +126,7 @@ def clearnan(v):
|
|
|
122
126
|
assert isinstance(v,str)
|
|
123
127
|
return v
|
|
124
128
|
|
|
125
|
-
with open(output_file,'w') as f:
|
|
129
|
+
with open(output_file,'w',encoding='utf-8',newline='') as f:
|
|
126
130
|
|
|
127
131
|
csv_writer = csv.writer(f)
|
|
128
132
|
csv_writer.writerow(header)
|
|
@@ -334,6 +338,8 @@ with open(output_file,'w') as f:
|
|
|
334
338
|
|
|
335
339
|
# ...with open()
|
|
336
340
|
|
|
341
|
+
print('Processed {} datsets'.format(len(metadata_table)))
|
|
342
|
+
|
|
337
343
|
|
|
338
344
|
#%% Read the .csv back
|
|
339
345
|
|
|
@@ -352,6 +358,8 @@ def isint(v):
|
|
|
352
358
|
|
|
353
359
|
valid_annotation_levels = set(['sequence','image','unknown'])
|
|
354
360
|
|
|
361
|
+
# Collect a list of locations within each dataset; we'll use this
|
|
362
|
+
# in the next cell to look for datasets that only have a single location
|
|
355
363
|
dataset_name_to_locations = defaultdict(set)
|
|
356
364
|
|
|
357
365
|
def check_row(row):
|
|
@@ -386,6 +394,8 @@ else:
|
|
|
386
394
|
|
|
387
395
|
#%% Check for datasets that have only one location string
|
|
388
396
|
|
|
397
|
+
# Expected: ENA24, Missouri Camera Traps
|
|
398
|
+
|
|
389
399
|
for ds_name in dataset_name_to_locations.keys():
|
|
390
400
|
if len(dataset_name_to_locations[ds_name]) == 1:
|
|
391
401
|
print('No location information for {}'.format(ds_name))
|
|
@@ -440,8 +450,8 @@ print('Selected {} total images'.format(len(images_to_download)))
|
|
|
440
450
|
|
|
441
451
|
# Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
|
|
442
452
|
|
|
443
|
-
|
|
444
|
-
|
|
453
|
+
# TODO: trivially parallelizable
|
|
454
|
+
#
|
|
445
455
|
# i_image = 10; image = images_to_download[i_image]
|
|
446
456
|
for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
|
|
447
457
|
|
|
@@ -450,17 +460,17 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
|
|
|
450
460
|
image_file = os.path.join(preview_folder,'image_{}'.format(str(i_image).zfill(4)) + ext)
|
|
451
461
|
relative_file = os.path.relpath(image_file,preview_folder)
|
|
452
462
|
try:
|
|
453
|
-
download_url(url,
|
|
463
|
+
download_url(url,image_file,verbose=False)
|
|
454
464
|
image['relative_file'] = relative_file
|
|
455
465
|
except urllib.error.HTTPError:
|
|
456
466
|
print('Image {} does not exist ({}:{})'.format(
|
|
457
467
|
i_image,image['dataset_name'],image['original_label']))
|
|
458
468
|
image['relative_file'] = None
|
|
459
469
|
|
|
470
|
+
# ...for each image we need to download
|
|
460
471
|
|
|
461
|
-
#%% Write preview HTML
|
|
462
472
|
|
|
463
|
-
|
|
473
|
+
#%% Write preview HTML
|
|
464
474
|
|
|
465
475
|
html_filename = os.path.join(preview_folder,'index.html')
|
|
466
476
|
|
|
@@ -475,19 +485,18 @@ for im in images_to_download:
|
|
|
475
485
|
output_im = {}
|
|
476
486
|
output_im['filename'] = im['relative_file']
|
|
477
487
|
output_im['linkTarget'] = im['url']
|
|
478
|
-
output_im['title'] = str(im)
|
|
488
|
+
output_im['title'] = '<b>{}: {}</b><br/><br/>'.format(im['dataset_name'],im['original_label']) + str(im)
|
|
479
489
|
output_im['imageStyle'] = 'width:600px;'
|
|
480
490
|
output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
|
|
481
491
|
html_images.append(output_im)
|
|
482
492
|
|
|
483
493
|
write_html_image_list.write_html_image_list(html_filename,html_images)
|
|
484
494
|
|
|
485
|
-
from md_utils.path_utils import open_file
|
|
486
495
|
open_file(html_filename)
|
|
487
496
|
|
|
488
497
|
|
|
489
498
|
#%% Zip output file
|
|
490
499
|
|
|
491
|
-
|
|
500
|
+
zipped_output_file = zip_file(output_file,verbose=True)
|
|
492
501
|
|
|
493
|
-
|
|
502
|
+
print('Zipped {} to {}'.format(output_file,zipped_output_file))
|
|
@@ -34,18 +34,9 @@ os.makedirs(metadata_dir,exist_ok=True)
|
|
|
34
34
|
|
|
35
35
|
output_file = os.path.join(output_dir,'lila_dataset_to_categories.json')
|
|
36
36
|
|
|
37
|
-
# Created by get_lila_category_list.py... contains counts for each category
|
|
38
|
-
category_list_dir = os.path.join(lila_local_base,'lila_categories_list')
|
|
39
|
-
lila_dataset_to_categories_file = os.path.join(category_list_dir,'lila_dataset_to_categories.json')
|
|
40
|
-
|
|
41
|
-
assert os.path.isfile(lila_dataset_to_categories_file)
|
|
42
|
-
|
|
43
37
|
|
|
44
38
|
#%% Load category and taxonomy files
|
|
45
39
|
|
|
46
|
-
with open(lila_dataset_to_categories_file,'r') as f:
|
|
47
|
-
lila_dataset_to_categories = json.load(f)
|
|
48
|
-
|
|
49
40
|
taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
|
|
50
41
|
|
|
51
42
|
|
|
@@ -55,9 +46,13 @@ ds_query_to_scientific_name = {}
|
|
|
55
46
|
|
|
56
47
|
unmapped_queries = set()
|
|
57
48
|
|
|
49
|
+
datasets_with_taxonomy_mapping = set()
|
|
50
|
+
|
|
58
51
|
# i_row = 1; row = taxonomy_df.iloc[i_row]; row
|
|
59
52
|
for i_row,row in taxonomy_df.iterrows():
|
|
60
53
|
|
|
54
|
+
datasets_with_taxonomy_mapping.add(row['dataset_name'])
|
|
55
|
+
|
|
61
56
|
ds_query = row['dataset_name'] + ':' + row['query']
|
|
62
57
|
ds_query = ds_query.lower()
|
|
63
58
|
|
|
@@ -68,13 +63,17 @@ for i_row,row in taxonomy_df.iterrows():
|
|
|
68
63
|
|
|
69
64
|
ds_query_to_scientific_name[ds_query] = row['scientific_name']
|
|
70
65
|
|
|
66
|
+
print('Loaded taxonomy mappings for {} datasets'.format(len(datasets_with_taxonomy_mapping)))
|
|
71
67
|
|
|
68
|
+
|
|
72
69
|
#%% Download and parse the metadata file
|
|
73
70
|
|
|
74
71
|
metadata_table = read_lila_metadata(metadata_dir)
|
|
75
72
|
|
|
73
|
+
print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
|
|
74
|
+
|
|
76
75
|
|
|
77
|
-
#%% Download and extract metadata for
|
|
76
|
+
#%% Download and extract metadata for each dataset
|
|
78
77
|
|
|
79
78
|
for ds_name in metadata_table.keys():
|
|
80
79
|
metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
|
|
@@ -91,6 +90,11 @@ dataset_to_categories = {}
|
|
|
91
90
|
# ds_name = 'NACTI'
|
|
92
91
|
for ds_name in metadata_table.keys():
|
|
93
92
|
|
|
93
|
+
taxonomy_mapping_available = (ds_name in datasets_with_taxonomy_mapping)
|
|
94
|
+
|
|
95
|
+
if not taxonomy_mapping_available:
|
|
96
|
+
print('Warning: taxonomy mapping not available for {}'.format(ds_name))
|
|
97
|
+
|
|
94
98
|
print('Finding categories in {}'.format(ds_name))
|
|
95
99
|
|
|
96
100
|
json_filename = metadata_table[ds_name]['json_filename']
|
|
@@ -122,6 +126,8 @@ for ds_name in metadata_table.keys():
|
|
|
122
126
|
# always redundant with the class-level data sets.
|
|
123
127
|
if 'bbox' in ds_name:
|
|
124
128
|
c['scientific_name_from_taxonomy_mapping'] = None
|
|
129
|
+
elif not taxonomy_mapping_available:
|
|
130
|
+
c['scientific_name_from_taxonomy_mapping'] = None
|
|
125
131
|
else:
|
|
126
132
|
taxonomy_query_string = ds_name.lower().strip() + ':' + c['name'].lower()
|
|
127
133
|
if taxonomy_query_string not in ds_query_to_scientific_name:
|
|
@@ -158,4 +164,4 @@ for ds_name in dataset_to_categories:
|
|
|
158
164
|
#%% Save the results
|
|
159
165
|
|
|
160
166
|
with open(output_file, 'w') as f:
|
|
161
|
-
json.dump(dataset_to_categories,f,indent=
|
|
167
|
+
json.dump(dataset_to_categories,f,indent=1)
|
|
@@ -21,7 +21,7 @@ from md_utils.path_utils import unzip_file
|
|
|
21
21
|
|
|
22
22
|
# LILA camera trap primary metadata file
|
|
23
23
|
lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
|
|
24
|
-
lila_taxonomy_mapping_url = 'https://lila.science/
|
|
24
|
+
lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
|
|
25
25
|
lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'
|
|
26
26
|
|
|
27
27
|
wildlife_insights_page_size = 30000
|
|
@@ -165,16 +165,18 @@ def read_lila_all_images_file(metadata_dir):
|
|
|
165
165
|
return df
|
|
166
166
|
|
|
167
167
|
|
|
168
|
-
def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None):
|
|
168
|
+
def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json_url=None):
|
|
169
169
|
"""
|
|
170
170
|
Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
|
|
171
171
|
Returns the .json filename on the local disk.
|
|
172
172
|
"""
|
|
173
173
|
|
|
174
|
-
if
|
|
175
|
-
metadata_table = read_lila_metadata(metadata_dir)
|
|
174
|
+
if json_url is None:
|
|
176
175
|
|
|
177
|
-
|
|
176
|
+
if metadata_table is None:
|
|
177
|
+
metadata_table = read_lila_metadata(metadata_dir)
|
|
178
|
+
|
|
179
|
+
json_url = metadata_table[ds_name]['metadata_url']
|
|
178
180
|
|
|
179
181
|
p = urlparse(json_url)
|
|
180
182
|
json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
|
|
@@ -196,25 +198,26 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None):
|
|
|
196
198
|
return json_filename
|
|
197
199
|
|
|
198
200
|
|
|
199
|
-
def azure_url_to_gcp_http_url(url):
|
|
201
|
+
def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
|
|
200
202
|
"""
|
|
201
203
|
Most URLs point to Azure by default, but most files are available on both Azure and GCP.
|
|
202
204
|
This function converts an Azure URL to the corresponding GCP http:// url.
|
|
203
205
|
"""
|
|
204
206
|
|
|
205
|
-
|
|
207
|
+
if error_if_not_azure_url:
|
|
208
|
+
assert url.startswith(lila_azure_storage_account)
|
|
206
209
|
gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
|
|
207
210
|
return gcp_url
|
|
208
211
|
|
|
209
212
|
|
|
210
|
-
def azure_url_to_gcp_gs_url(url):
|
|
213
|
+
def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
|
|
211
214
|
"""
|
|
212
215
|
Most URLs point to Azure by default, but most files are available on both Azure and GCP.
|
|
213
216
|
This function converts an Azure URL to the corresponding GCP gs:// url.
|
|
214
217
|
"""
|
|
215
218
|
|
|
216
|
-
return azure_url_to_gcp_http_url(url)
|
|
217
|
-
|
|
219
|
+
return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
|
|
220
|
+
replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
|
|
218
221
|
|
|
219
222
|
|
|
220
223
|
#%% Interactive test driver
|
|
@@ -261,4 +264,4 @@ if False:
|
|
|
261
264
|
gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
|
|
262
265
|
gcp_urls.append(gcp_url)
|
|
263
266
|
|
|
264
|
-
status_codes = url_utils.test_urls(gcp_urls)
|
|
267
|
+
status_codes = url_utils.test_urls(gcp_urls)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
########
|
|
2
|
+
#
|
|
3
|
+
# test_lila_metadata_urls.py
|
|
4
|
+
#
|
|
5
|
+
# Test that all the metadata URLs for LILA camera trap datasets are valid, and
|
|
6
|
+
# test that at least one image within each URL is valid, including MegaDetector results
|
|
7
|
+
# files.
|
|
8
|
+
#
|
|
9
|
+
########
|
|
10
|
+
|
|
11
|
+
#%% Constants and imports
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
from data_management.lila.lila_common import read_lila_metadata,\
|
|
17
|
+
read_metadata_file_for_dataset, read_lila_taxonomy_mapping
|
|
18
|
+
|
|
19
|
+
# We'll write images, metadata downloads, and temporary files here
|
|
20
|
+
lila_local_base = os.path.expanduser('~/lila')
|
|
21
|
+
|
|
22
|
+
output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
|
|
23
|
+
os.makedirs(output_dir,exist_ok=True)
|
|
24
|
+
|
|
25
|
+
metadata_dir = os.path.join(lila_local_base,'metadata')
|
|
26
|
+
os.makedirs(metadata_dir,exist_ok=True)
|
|
27
|
+
|
|
28
|
+
md_results_dir = os.path.join(lila_local_base,'md_results')
|
|
29
|
+
os.makedirs(md_results_dir,exist_ok=True)
|
|
30
|
+
|
|
31
|
+
md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
#%% Load category and taxonomy files
|
|
35
|
+
|
|
36
|
+
taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
#%% Download and parse the metadata file
|
|
40
|
+
|
|
41
|
+
metadata_table = read_lila_metadata(metadata_dir)
|
|
42
|
+
|
|
43
|
+
print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
#%% Download and extract metadata and MD results for each dataset
|
|
47
|
+
|
|
48
|
+
for ds_name in metadata_table.keys():
|
|
49
|
+
|
|
50
|
+
metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
|
|
51
|
+
metadata_dir=metadata_dir,
|
|
52
|
+
metadata_table=metadata_table)
|
|
53
|
+
for k in md_results_keys:
|
|
54
|
+
md_results_url = metadata_table[ds_name][k]
|
|
55
|
+
if md_results_url is None:
|
|
56
|
+
metadata_table[ds_name][k + '_filename'] = None
|
|
57
|
+
else:
|
|
58
|
+
metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
|
|
59
|
+
metadata_dir=md_results_dir,
|
|
60
|
+
json_url=md_results_url)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
#%% Build up a list of URLs to test
|
|
64
|
+
|
|
65
|
+
url_to_source = {}
|
|
66
|
+
|
|
67
|
+
# The first image in a dataset is disproportionately likely to be human (and thus 404)
|
|
68
|
+
image_index = 1000
|
|
69
|
+
|
|
70
|
+
# ds_name = list(metadata_table.keys())[0]
|
|
71
|
+
for ds_name in metadata_table.keys():
|
|
72
|
+
|
|
73
|
+
if 'bbox' in ds_name:
|
|
74
|
+
print('Skipping bbox dataset {}'.format(ds_name))
|
|
75
|
+
continue
|
|
76
|
+
|
|
77
|
+
print('Processing dataset {}'.format(ds_name))
|
|
78
|
+
|
|
79
|
+
json_filename = metadata_table[ds_name]['json_filename']
|
|
80
|
+
with open(json_filename, 'r') as f:
|
|
81
|
+
data = json.load(f)
|
|
82
|
+
|
|
83
|
+
image_base_url = metadata_table[ds_name]['image_base_url']
|
|
84
|
+
assert not image_base_url.endswith('/')
|
|
85
|
+
# Download a test image
|
|
86
|
+
test_image_relative_path = data['images'][image_index]['file_name']
|
|
87
|
+
test_image_url = image_base_url + '/' + test_image_relative_path
|
|
88
|
+
|
|
89
|
+
url_to_source[test_image_url] = ds_name + ' metadata'
|
|
90
|
+
|
|
91
|
+
# k = md_results_keys[2]
|
|
92
|
+
for k in md_results_keys:
|
|
93
|
+
k_fn = k + '_filename'
|
|
94
|
+
if metadata_table[ds_name][k_fn] is not None:
|
|
95
|
+
with open(metadata_table[ds_name][k_fn],'r') as f:
|
|
96
|
+
md_results = json.load(f)
|
|
97
|
+
im = md_results['images'][image_index]
|
|
98
|
+
md_image_url = image_base_url + '/' + im['file']
|
|
99
|
+
url_to_source[md_image_url] = ds_name + ' ' + k
|
|
100
|
+
|
|
101
|
+
# ...for each dataset
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
#%% Test URLs
|
|
105
|
+
|
|
106
|
+
from md_utils.url_utils import test_urls
|
|
107
|
+
|
|
108
|
+
urls_to_test = sorted(url_to_source.keys())
|
|
109
|
+
urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
|
|
110
|
+
|
|
111
|
+
status_codes = test_urls(urls_to_test,error_on_failure=False)
|
|
112
|
+
|
|
113
|
+
for i_url,url in enumerate(urls_to_test):
|
|
114
|
+
if status_codes[i_url] != 200:
|
|
115
|
+
print('Status {} for {} ({})'.format(
|
|
116
|
+
status_codes[i_url],url,url_to_source[url]))
|