megadetector-10.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/__init__.py +0 -0
- megadetector/api/__init__.py +0 -0
- megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
- megadetector/classification/__init__.py +0 -0
- megadetector/classification/aggregate_classifier_probs.py +108 -0
- megadetector/classification/analyze_failed_images.py +227 -0
- megadetector/classification/cache_batchapi_outputs.py +198 -0
- megadetector/classification/create_classification_dataset.py +626 -0
- megadetector/classification/crop_detections.py +516 -0
- megadetector/classification/csv_to_json.py +226 -0
- megadetector/classification/detect_and_crop.py +853 -0
- megadetector/classification/efficientnet/__init__.py +9 -0
- megadetector/classification/efficientnet/model.py +415 -0
- megadetector/classification/efficientnet/utils.py +608 -0
- megadetector/classification/evaluate_model.py +520 -0
- megadetector/classification/identify_mislabeled_candidates.py +152 -0
- megadetector/classification/json_to_azcopy_list.py +63 -0
- megadetector/classification/json_validator.py +696 -0
- megadetector/classification/map_classification_categories.py +276 -0
- megadetector/classification/merge_classification_detection_output.py +509 -0
- megadetector/classification/prepare_classification_script.py +194 -0
- megadetector/classification/prepare_classification_script_mc.py +228 -0
- megadetector/classification/run_classifier.py +287 -0
- megadetector/classification/save_mislabeled.py +110 -0
- megadetector/classification/train_classifier.py +827 -0
- megadetector/classification/train_classifier_tf.py +725 -0
- megadetector/classification/train_utils.py +323 -0
- megadetector/data_management/__init__.py +0 -0
- megadetector/data_management/animl_to_md.py +161 -0
- megadetector/data_management/annotations/__init__.py +0 -0
- megadetector/data_management/annotations/annotation_constants.py +33 -0
- megadetector/data_management/camtrap_dp_to_coco.py +270 -0
- megadetector/data_management/cct_json_utils.py +566 -0
- megadetector/data_management/cct_to_md.py +184 -0
- megadetector/data_management/cct_to_wi.py +293 -0
- megadetector/data_management/coco_to_labelme.py +284 -0
- megadetector/data_management/coco_to_yolo.py +701 -0
- megadetector/data_management/databases/__init__.py +0 -0
- megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
- megadetector/data_management/databases/integrity_check_json_db.py +563 -0
- megadetector/data_management/databases/subset_json_db.py +195 -0
- megadetector/data_management/generate_crops_from_cct.py +200 -0
- megadetector/data_management/get_image_sizes.py +164 -0
- megadetector/data_management/labelme_to_coco.py +559 -0
- megadetector/data_management/labelme_to_yolo.py +349 -0
- megadetector/data_management/lila/__init__.py +0 -0
- megadetector/data_management/lila/create_lila_blank_set.py +556 -0
- megadetector/data_management/lila/create_lila_test_set.py +192 -0
- megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
- megadetector/data_management/lila/download_lila_subset.py +182 -0
- megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
- megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
- megadetector/data_management/lila/get_lila_image_counts.py +112 -0
- megadetector/data_management/lila/lila_common.py +319 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
- megadetector/data_management/mewc_to_md.py +344 -0
- megadetector/data_management/ocr_tools.py +873 -0
- megadetector/data_management/read_exif.py +964 -0
- megadetector/data_management/remap_coco_categories.py +195 -0
- megadetector/data_management/remove_exif.py +156 -0
- megadetector/data_management/rename_images.py +194 -0
- megadetector/data_management/resize_coco_dataset.py +665 -0
- megadetector/data_management/speciesnet_to_md.py +41 -0
- megadetector/data_management/wi_download_csv_to_coco.py +247 -0
- megadetector/data_management/yolo_output_to_md_output.py +594 -0
- megadetector/data_management/yolo_to_coco.py +984 -0
- megadetector/data_management/zamba_to_md.py +188 -0
- megadetector/detection/__init__.py +0 -0
- megadetector/detection/change_detection.py +840 -0
- megadetector/detection/process_video.py +479 -0
- megadetector/detection/pytorch_detector.py +1451 -0
- megadetector/detection/run_detector.py +1267 -0
- megadetector/detection/run_detector_batch.py +2172 -0
- megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
- megadetector/detection/run_md_and_speciesnet.py +1604 -0
- megadetector/detection/run_tiled_inference.py +1044 -0
- megadetector/detection/tf_detector.py +209 -0
- megadetector/detection/video_utils.py +1379 -0
- megadetector/postprocessing/__init__.py +0 -0
- megadetector/postprocessing/add_max_conf.py +72 -0
- megadetector/postprocessing/categorize_detections_by_size.py +166 -0
- megadetector/postprocessing/classification_postprocessing.py +1943 -0
- megadetector/postprocessing/combine_batch_outputs.py +249 -0
- megadetector/postprocessing/compare_batch_results.py +2110 -0
- megadetector/postprocessing/convert_output_format.py +403 -0
- megadetector/postprocessing/create_crop_folder.py +629 -0
- megadetector/postprocessing/detector_calibration.py +570 -0
- megadetector/postprocessing/generate_csv_report.py +522 -0
- megadetector/postprocessing/load_api_results.py +223 -0
- megadetector/postprocessing/md_to_coco.py +428 -0
- megadetector/postprocessing/md_to_labelme.py +351 -0
- megadetector/postprocessing/md_to_wi.py +41 -0
- megadetector/postprocessing/merge_detections.py +392 -0
- megadetector/postprocessing/postprocess_batch_results.py +2140 -0
- megadetector/postprocessing/remap_detection_categories.py +226 -0
- megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
- megadetector/postprocessing/separate_detections_into_folders.py +795 -0
- megadetector/postprocessing/subset_json_detector_output.py +964 -0
- megadetector/postprocessing/top_folders_to_bottom.py +238 -0
- megadetector/postprocessing/validate_batch_results.py +332 -0
- megadetector/taxonomy_mapping/__init__.py +0 -0
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
- megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
- megadetector/taxonomy_mapping/simple_image_download.py +231 -0
- megadetector/taxonomy_mapping/species_lookup.py +1008 -0
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
- megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
- megadetector/tests/__init__.py +0 -0
- megadetector/tests/test_nms_synthetic.py +335 -0
- megadetector/utils/__init__.py +0 -0
- megadetector/utils/ct_utils.py +1857 -0
- megadetector/utils/directory_listing.py +199 -0
- megadetector/utils/extract_frames_from_video.py +307 -0
- megadetector/utils/gpu_test.py +125 -0
- megadetector/utils/md_tests.py +2072 -0
- megadetector/utils/path_utils.py +2872 -0
- megadetector/utils/process_utils.py +172 -0
- megadetector/utils/split_locations_into_train_val.py +237 -0
- megadetector/utils/string_utils.py +234 -0
- megadetector/utils/url_utils.py +825 -0
- megadetector/utils/wi_platform_utils.py +968 -0
- megadetector/utils/wi_taxonomy_utils.py +1766 -0
- megadetector/utils/write_html_image_list.py +239 -0
- megadetector/visualization/__init__.py +0 -0
- megadetector/visualization/plot_utils.py +309 -0
- megadetector/visualization/render_images_with_thumbnails.py +243 -0
- megadetector/visualization/visualization_utils.py +1973 -0
- megadetector/visualization/visualize_db.py +630 -0
- megadetector/visualization/visualize_detector_output.py +498 -0
- megadetector/visualization/visualize_video_output.py +705 -0
- megadetector-10.0.15.dist-info/METADATA +115 -0
- megadetector-10.0.15.dist-info/RECORD +147 -0
- megadetector-10.0.15.dist-info/WHEEL +5 -0
- megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
- megadetector-10.0.15.dist-info/top_level.txt +1 -0

megadetector/data_management/lila/get_lila_annotation_counts.py
@@ -0,0 +1,174 @@

"""

get_lila_annotation_counts.py

Generates a .json-formatted dictionary mapping each LILA dataset to all categories
that exist for that dataset, with counts for the number of occurrences of each category
(the number of *annotations* for each category, not the number of *images*).

Also loads the taxonomy mapping file, to include scientific names for each category.

get_lila_image_counts.py counts the number of *images* for each category in each dataset.

"""

#%% Constants and imports

import json
import os

from collections import defaultdict

from megadetector.data_management.lila.lila_common import \
    read_lila_metadata, read_metadata_file_for_dataset, read_lila_taxonomy_mapping
from megadetector.utils import ct_utils

# cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
preferred_cloud = 'gcp'

# array to fill for output
category_list = []

# We'll write images, metadata downloads, and temporary files here
lila_local_base = os.path.expanduser('~/lila')

output_dir = os.path.join(lila_local_base,'lila_categories_list')
os.makedirs(output_dir,exist_ok=True)

metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)

output_file = os.path.join(output_dir,'lila_dataset_to_categories.json')


#%% Load category and taxonomy files

taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)


#%% Map dataset names and category names to scientific names

ds_query_to_scientific_name = {}

unmapped_queries = set()

datasets_with_taxonomy_mapping = set()

# i_row = 1; row = taxonomy_df.iloc[i_row]; row
for i_row,row in taxonomy_df.iterrows():

    datasets_with_taxonomy_mapping.add(row['dataset_name'])

    ds_query = row['dataset_name'] + ':' + row['query']
    ds_query = ds_query.lower()

    if not isinstance(row['scientific_name'],str):
        unmapped_queries.add(ds_query)
        ds_query_to_scientific_name[ds_query] = 'unmapped'
        continue

    ds_query_to_scientific_name[ds_query] = row['scientific_name']

print('Loaded taxonomy mappings for {} datasets'.format(len(datasets_with_taxonomy_mapping)))


#%% Download and parse the metadata file

metadata_table = read_lila_metadata(metadata_dir)

print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))


#%% Download and extract metadata for each dataset

for ds_name in metadata_table.keys():
    metadata_table[ds_name]['json_filename'] = \
        read_metadata_file_for_dataset(ds_name=ds_name,
                                       metadata_dir=metadata_dir,
                                       metadata_table=metadata_table,
                                       preferred_cloud=preferred_cloud)


#%% Get category names and counts for each dataset

# Takes ~5 minutes

dataset_to_categories = {}

# ds_name = 'NACTI'
for ds_name in metadata_table.keys():

    taxonomy_mapping_available = (ds_name in datasets_with_taxonomy_mapping)

    if not taxonomy_mapping_available:
        print('Warning: taxonomy mapping not available for {}'.format(ds_name))

    print('Finding categories in {}'.format(ds_name))

    json_filename = metadata_table[ds_name]['json_filename']
    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
    assert not base_url.endswith('/')

    # Open the metadata file
    with open(json_filename, 'r') as f:
        data = json.load(f)

    # Collect list of categories and mappings to category name
    categories = data['categories']

    category_id_to_count = defaultdict(int)
    annotations = data['annotations']

    # ann = annotations[0]
    for ann in annotations:
        category_id_to_count[ann['category_id']] = category_id_to_count[ann['category_id']] + 1

    # c = categories[0]
    for c in categories:
        count = category_id_to_count[c['id']]
        if 'count' in c:
            assert 'bbox' in ds_name or c['count'] == count
        c['count'] = count

        # Don't do taxonomy mapping for bbox data sets, which are sometimes just binary and are
        # always redundant with the class-level data sets.
        if 'bbox' in ds_name:
            c['scientific_name_from_taxonomy_mapping'] = None
        elif not taxonomy_mapping_available:
            c['scientific_name_from_taxonomy_mapping'] = None
        else:
            taxonomy_query_string = ds_name.lower().strip() + ':' + c['name'].lower()
            if taxonomy_query_string not in ds_query_to_scientific_name:
                print('No match for query string {}'.format(taxonomy_query_string))
                # As of right now, this is the only quirky case
                assert '#ref!' in taxonomy_query_string and 'wcs' in ds_name.lower()
                c['scientific_name_from_taxonomy_mapping'] = None
            else:
                sn = ds_query_to_scientific_name[taxonomy_query_string]
                assert sn is not None and len(sn) > 0
                c['scientific_name_from_taxonomy_mapping'] = sn

    dataset_to_categories[ds_name] = categories

# ...for each dataset


#%% Print the results

# ds_name = list(dataset_to_categories.keys())[0]
for ds_name in dataset_to_categories:

    print('\n** Category counts for {} **\n'.format(ds_name))

    categories = dataset_to_categories[ds_name]
    categories = sorted(categories, key=lambda x: x['count'], reverse=True)

    for c in categories:
        print('{} ({}): {}'.format(c['name'],c['scientific_name_from_taxonomy_mapping'],c['count']))

# ...for each dataset


#%% Save the results

ct_utils.write_json(output_file, dataset_to_categories)
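
For reference, a minimal sketch of how the resulting lila_dataset_to_categories.json file might be consumed downstream. This is illustrative only and not part of the package; it assumes the script above has already been run, and relies only on the output structure established there (dataset name mapped to a list of category dicts carrying 'name', 'count', and 'scientific_name_from_taxonomy_mapping').

import json
import os
from collections import defaultdict

# Path written by get_lila_annotation_counts.py above
output_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')

with open(output_file, 'r') as f:
    dataset_to_categories = json.load(f)

# Aggregate annotation counts across the class-level (non-bbox) datasets,
# keyed by mapped scientific name
scientific_name_to_count = defaultdict(int)
for ds_name, categories in dataset_to_categories.items():
    if 'bbox' in ds_name:
        continue
    for c in categories:
        sn = c['scientific_name_from_taxonomy_mapping']
        if sn is not None:
            scientific_name_to_count[sn] += c['count']

# Print the ten most common taxa
for sn in sorted(scientific_name_to_count, key=scientific_name_to_count.get, reverse=True)[:10]:
    print('{}: {}'.format(sn, scientific_name_to_count[sn]))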
megadetector/data_management/lila/get_lila_image_counts.py
@@ -0,0 +1,112 @@

"""

get_lila_image_counts.py

Count the number of images and bounding boxes with each label in one or more LILA datasets.

This script doesn't write these counts out anywhere other than the console; it's just intended
as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
information out to a .json file, but it counts *annotations*, not *images*, for each category.

"""

#%% Constants and imports

import json
import os

from collections import defaultdict

from megadetector.data_management.lila.lila_common import \
    read_lila_metadata, read_metadata_file_for_dataset

# If None, will use all datasets
datasets_of_interest = None

# We'll write images, metadata downloads, and temporary files here
lila_local_base = os.path.expanduser('~/lila')

metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)


#%% Download and parse the metadata file

metadata_table = read_lila_metadata(metadata_dir)


#%% Download and extract metadata for the datasets we're interested in

if datasets_of_interest is None:
    datasets_of_interest = list(metadata_table.keys())

for ds_name in datasets_of_interest:
    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
                                                                              metadata_dir=metadata_dir,
                                                                              metadata_table=metadata_table)


#%% Count categories

ds_name_to_category_counts = {}

# ds_name = datasets_of_interest[0]
for ds_name in datasets_of_interest:

    category_to_image_count = {}
    category_to_bbox_count = {}

    print('Counting categories in: ' + ds_name)

    json_filename = metadata_table[ds_name]['json_filename']
    with open(json_filename, 'r') as f:
        data = json.load(f)

    categories = data['categories']
    category_ids = [c['id'] for c in categories]
    category_id_to_name = {c['id']:c['name'] for c in categories}
    annotations = data['annotations']
    images = data['images']

    for category_id in category_ids:
        category_name = category_id_to_name[category_id]
        category_to_image_count[category_name] = 0
        category_to_bbox_count[category_name] = 0

    image_id_to_category_names = defaultdict(set)

    # Go through annotations, marking each image with the categories that are present
    #
    # ann = annotations[0]
    for ann in annotations:

        category_name = category_id_to_name[ann['category_id']]
        image_id_to_category_names[ann['image_id']].add(category_name)

    # Now go through images and count categories
    category_to_count = defaultdict(int)

    # im = images[0]
    for im in images:
        categories_this_image = image_id_to_category_names[im['id']]
        for category_name in categories_this_image:
            category_to_count[category_name] += 1

    ds_name_to_category_counts[ds_name] = category_to_count

# ...for each dataset


#%% Print the results

for ds_name in ds_name_to_category_counts:

    print('\n** Category counts for {} **\n'.format(ds_name))

    category_to_count = ds_name_to_category_counts[ds_name]
    category_to_count = {k: v for k, v in sorted(category_to_count.items(), reverse=True,
                                                 key=lambda item: item[1])}

    for category_name in category_to_count.keys():
        print('{}: {}'.format(category_name,category_to_count[category_name]))
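
The docstring above mentions bounding boxes, but as written the loop only fills image counts; category_to_bbox_count is initialized and never updated. A minimal sketch of how box counts might be tallied, assuming the COCO Camera Traps convention that box-level annotations carry an optional 'bbox' field; this helper is hypothetical and not part of the package.

from collections import defaultdict

def count_boxes_per_category(annotations, category_id_to_name):
    # Tally box-level annotations per category name; annotations without a
    # 'bbox' field (i.e., image-level labels) are skipped
    category_to_bbox_count = defaultdict(int)
    for ann in annotations:
        if 'bbox' in ann:
            category_to_bbox_count[category_id_to_name[ann['category_id']]] += 1
    return category_to_bbox_count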
megadetector/data_management/lila/lila_common.py
@@ -0,0 +1,319 @@

"""

lila_common.py

Common constants and functions related to LILA data management/retrieval.

"""

#%% Imports and constants

import os
import json
import zipfile
import pandas as pd

from urllib.parse import urlparse

from megadetector.utils.url_utils import download_url
from megadetector.utils.path_utils import unzip_file
from megadetector.utils.ct_utils import is_empty

# LILA camera trap primary metadata file
lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'

wildlife_insights_page_size = 30000
wildlife_insights_taxonomy_url = 'https://api.wildlifeinsights.org/api/v1/taxonomy/taxonomies-all?fields=class,order,family,genus,species,authority,taxonomyType,uniqueIdentifier,commonNameEnglish&page[size]={}'.format(
    wildlife_insights_page_size)
wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
wildlife_insights_taxonomy_local_csv_filename = \
    wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

# Filenames are consistent across clouds relative to these URLs
lila_base_urls = {
    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
}

lila_cloud_urls = {
    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
    'gcp':'gs://public-datasets-lila/',
    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
}

for url in lila_base_urls.values():
    assert url.endswith('/')


#%% Common functions

def read_wildlife_insights_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the taxonomy mapping file
            even if the local file exists.

    Returns:
        pd.DataFrame: a DataFrame with taxonomy information
    """

    wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)

    if os.path.exists(wi_taxonomy_csv_path) and (not force_download):
        df = pd.read_csv(wi_taxonomy_csv_path)
    else:
        wi_taxonomy_json_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_json_filename)
        download_url(wildlife_insights_taxonomy_url, wi_taxonomy_json_path,
                     force_download=force_download)
        with open(wi_taxonomy_json_path,'r') as f:
            d = json.load(f)

        # We haven't implemented paging, make sure that's not an issue
        assert d['meta']['totalItems'] < wildlife_insights_page_size

        # d['data'] is a list of items that look like:
        """
        {'id': 2000003,
         'class': 'Mammalia',
         'order': 'Rodentia',
         'family': 'Abrocomidae',
         'genus': 'Abrocoma',
         'species': 'bennettii',
         'authority': 'Waterhouse, 1837',
         'commonNameEnglish': "Bennett's Chinchilla Rat",
         'taxonomyType': 'biological',
         'uniqueIdentifier': '7a6c93a5-bdf7-4182-82f9-7a67d23f7fe1'}
        """
        df = pd.DataFrame(d['data'])
        df.to_csv(wi_taxonomy_csv_path,index=False)

    return df


def read_lila_taxonomy_mapping(metadata_dir, force_download=False):
    """
    Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the taxonomy mapping file
            even if the local file exists.

    Returns:
        pd.DataFrame: a DataFrame with one row per identification
    """

    p = urlparse(lila_taxonomy_mapping_url)
    taxonomy_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_taxonomy_mapping_url, taxonomy_filename,
                 force_download=force_download)

    df = pd.read_csv(taxonomy_filename)

    return df


def read_lila_metadata(metadata_dir, force_download=False):
    """
    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the metadata file even if
            the local file exists.

    Returns:
        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
        with keys corresponding to the headers in the .csv file, currently:

        - name
        - short_name
        - continent
        - country
        - region
        - image_base_url_relative
        - bbox_url_relative
        - image_base_url_gcp
        - metadata_url_gcp
        - bbox_url_gcp
        - image_base_url_aws
        - metadata_url_aws
        - bbox_url_aws
        - image_base_url_azure
        - metadata_url_azure
        - bbox_url_azure
        - mdv4_results_raw
        - mdv5b_results_raw
        - md_results_with_rde
        - json_filename
    """

    # Put the master metadata file in the same folder where we're putting images
    p = urlparse(lila_metadata_url)
    metadata_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_metadata_url, metadata_filename, force_download=force_download)

    df = pd.read_csv(metadata_filename)

    records = df.to_dict('records')

    # Parse into a table keyed by dataset name
    metadata_table = {}

    # r = records[0]
    for r in records:
        if is_empty(r['name']):
            continue

        # Convert NaN's to None
        for k in r.keys():
            if is_empty(r[k]):
                r[k] = None

        metadata_table[r['name']] = r

    return metadata_table


def read_lila_all_images_file(metadata_dir, force_download=False):
    """
    Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
    all LILA files, and opens the resulting .csv file as a Pandas DataFrame.

    Args:
        metadata_dir (str): folder to use for temporary LILA metadata files
        force_download (bool, optional): download the metadata file even if
            the local file exists.

    Returns:
        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
    """

    p = urlparse(lila_all_images_url)
    lila_all_images_zip_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(lila_all_images_url, lila_all_images_zip_filename,
                 force_download=force_download)

    with zipfile.ZipFile(lila_all_images_zip_filename,'r') as z:
        files = z.namelist()
    assert len(files) == 1

    unzipped_csv_filename = os.path.join(metadata_dir,files[0])
    if not os.path.isfile(unzipped_csv_filename):
        unzip_file(lila_all_images_zip_filename,metadata_dir)
    else:
        print('{} already unzipped'.format(unzipped_csv_filename))

    df = pd.read_csv(unzipped_csv_filename)

    return df


def read_metadata_file_for_dataset(ds_name,
                                   metadata_dir,
                                   metadata_table=None,
                                   json_url=None,
                                   preferred_cloud='gcp',
                                   force_download=False):
    """
    Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.

    Args:
        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
            "Caltech Camera Traps")
        metadata_dir (str): folder to use for temporary LILA metadata files
        metadata_table (dict, optional): an optional dictionary already loaded via
            read_lila_metadata()
        json_url (str, optional): the URL of the metadata file; if None, will be retrieved
            via read_lila_metadata()
        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
        force_download (bool, optional): download the metadata file even if
            the local file exists.

    Returns:
        str: the .json filename on the local disk
    """

    if preferred_cloud is None:
        preferred_cloud = 'gcp'

    assert preferred_cloud in lila_base_urls.keys()

    if json_url is None:

        if metadata_table is None:
            metadata_table = read_lila_metadata(metadata_dir)

        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]

    p = urlparse(json_url)
    json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
    download_url(json_url, json_filename, force_download=force_download)

    # Unzip if necessary
    if json_filename.endswith('.zip'):

        with zipfile.ZipFile(json_filename,'r') as z:
            files = z.namelist()
        assert len(files) == 1
        unzipped_json_filename = os.path.join(metadata_dir,files[0])
        if not os.path.isfile(unzipped_json_filename):
            unzip_file(json_filename,metadata_dir)
        else:
            print('{} already unzipped'.format(unzipped_json_filename))
        json_filename = unzipped_json_filename

    return json_filename


#%% Interactive test driver

if False:

    pass

    #%% Verify that all base URLs exist

    # LILA camera trap primary metadata file
    urls = (lila_metadata_url,
            lila_taxonomy_mapping_url,
            lila_all_images_url,
            wildlife_insights_taxonomy_url)

    from megadetector.utils import url_utils

    status_codes = url_utils.test_urls(urls,timeout=2.0)
    assert all([code == 200 for code in status_codes])


    #%% Verify that the metadata URLs exist for individual datasets

    metadata_dir = os.path.expanduser('~/lila/metadata')

    dataset_metadata = read_lila_metadata(metadata_dir)

    urls_to_test = []

    # ds_name = next(iter(dataset_metadata.keys()))
    for ds_name in dataset_metadata.keys():

        ds_info = dataset_metadata[ds_name]
        for cloud_name in lila_base_urls.keys():
            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
            if ds_info['bbox_url_relative'] is not None:
                urls_to_test.append(ds_info['bbox_url_' + cloud_name])

    status_codes = url_utils.test_urls(urls_to_test,
                                       error_on_failure=True,
                                       n_workers=10,
                                       pool_type='process',
                                       timeout=2.0)
    assert all([code == 200 for code in status_codes])
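
Taken together, these helpers cover the typical retrieval flow: fetch the master metadata table, then fetch (and unzip, if needed) one dataset's COCO-style .json file. A minimal usage sketch; the dataset name is just the example from the read_metadata_file_for_dataset docstring.

import json
import os

from megadetector.data_management.lila.lila_common import \
    read_lila_metadata, read_metadata_file_for_dataset

metadata_dir = os.path.expanduser('~/lila/metadata')

# Download the master metadata table, then one dataset's metadata file
metadata_table = read_lila_metadata(metadata_dir)
json_filename = read_metadata_file_for_dataset(ds_name='Caltech Camera Traps',
                                               metadata_dir=metadata_dir,
                                               metadata_table=metadata_table,
                                               preferred_cloud='gcp')

with open(json_filename, 'r') as f:
    data = json.load(f)

print('{} images, {} annotations, {} categories'.format(
    len(data['images']), len(data['annotations']), len(data['categories'])))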