megadetector 5.0.24__py3-none-any.whl → 5.0.26__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- megadetector/data_management/cct_json_utils.py +15 -2
- megadetector/data_management/coco_to_yolo.py +53 -31
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
- megadetector/data_management/databases/integrity_check_json_db.py +2 -2
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +73 -69
- megadetector/data_management/lila/add_locations_to_nacti.py +114 -110
- megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
- megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
- megadetector/data_management/remap_coco_categories.py +60 -11
- megadetector/data_management/{wi_to_md.py → speciesnet_to_md.py} +2 -2
- megadetector/data_management/yolo_to_coco.py +45 -15
- megadetector/detection/run_detector.py +1 -0
- megadetector/detection/run_detector_batch.py +5 -4
- megadetector/postprocessing/classification_postprocessing.py +788 -524
- megadetector/postprocessing/compare_batch_results.py +176 -9
- megadetector/postprocessing/create_crop_folder.py +420 -0
- megadetector/postprocessing/load_api_results.py +4 -1
- megadetector/postprocessing/md_to_coco.py +1 -1
- megadetector/postprocessing/postprocess_batch_results.py +158 -44
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- megadetector/postprocessing/separate_detections_into_folders.py +20 -4
- megadetector/postprocessing/subset_json_detector_output.py +180 -15
- megadetector/postprocessing/validate_batch_results.py +13 -5
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
- megadetector/taxonomy_mapping/species_lookup.py +45 -2
- megadetector/utils/ct_utils.py +76 -3
- megadetector/utils/directory_listing.py +4 -4
- megadetector/utils/gpu_test.py +21 -3
- megadetector/utils/md_tests.py +142 -49
- megadetector/utils/path_utils.py +342 -19
- megadetector/utils/wi_utils.py +1286 -212
- megadetector/visualization/visualization_utils.py +16 -4
- megadetector/visualization/visualize_db.py +1 -1
- megadetector/visualization/visualize_detector_output.py +1 -4
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/METADATA +6 -3
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/RECORD +41 -40
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/WHEEL +1 -1
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info/licenses}/LICENSE +0 -0
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/subset_json_detector_output.py CHANGED

@@ -61,9 +61,11 @@ import os
 import re
 
 from tqdm import tqdm
+from collections import defaultdict
 
 from megadetector.utils.ct_utils import args_to_object, get_max_conf, invert_dictionary
 from megadetector.utils.path_utils import top_level_folder
+from megadetector.utils.path_utils import recursive_file_list
 
 
 #%% Helper classes
@@ -136,7 +138,18 @@ class SubsetJsonDetectorOutputOptions:
 
         #: Set to >0 during testing to limit the number of images that get processed.
         self.debug_max_images = -1
+
+        #: Keep only files in this list, which can be a .json results file or a folder.
+        #
+        #: Assumes that the input .json file contains relative paths when comparing to a folder.
+        self.keep_files_in_list = None
+
+        #: Remove classification with <= N instances. Does not re-map categories
+        #: to be contiguous. Set to 1 to remove empty categories only.
+        self.remove_classification_categories_below_count = None
 
+# ...class SubsetJsonDetectorOutputOptions
+
 
 #%% Main function
 
@@ -156,11 +169,104 @@ def _write_detection_results(data, output_filename, options):
     else:
         os.makedirs(basedir, exist_ok=True)
 
-
-
+    n_images = len(data['images'])
+
+    print('Writing detection output (with {} images) to {}'.format(n_images,output_filename))
+    with open(output_filename, 'w', newline='\n') as f:
         json.dump(data,f,indent=1)
 
-# ..._write_detection_results()
+# ...def _write_detection_results(...)
+
+
+def remove_classification_categories_below_count(data, options):
+    """
+    Removes all classification categories below a threshold count. Does not re-map
+    classification category IDs.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.remove_classification_categories_below_count is None:
+        return data
+    if 'classification_categories' not in data:
+        return data
+
+    classification_category_id_to_count = {}
+
+    for classification_category_id in data['classification_categories']:
+        classification_category_id_to_count[classification_category_id] = 0
+
+    # Count the number of occurrences of each classification category
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            for classification in det['classifications']:
+                classification_category_id_to_count[classification[0]] = \
+                    classification_category_id_to_count[classification[0]] + 1
+
+
+    # Which categories have above-threshold counts?
+    classification_category_ids_to_keep = set()
+
+    for classification_category_id in classification_category_id_to_count:
+        if classification_category_id_to_count[classification_category_id] > \
+            options.remove_classification_categories_below_count:
+            classification_category_ids_to_keep.add(classification_category_id)
+
+    n_categories_removed = \
+        len(classification_category_id_to_count) - \
+        len(classification_category_ids_to_keep)
+
+    print('Removing {} of {} classification categories'.format(
+        n_categories_removed,len(classification_category_id_to_count)))
+
+    if n_categories_removed == 0:
+        return data
+
+
+    # Filter the category list
+    output_classification_categories = {}
+    for category_id in data['classification_categories']:
+        if category_id in classification_category_ids_to_keep:
+            output_classification_categories[category_id] = \
+                data['classification_categories'][category_id]
+    data['classification_categories'] = output_classification_categories
+    assert len(data['classification_categories']) == len(classification_category_ids_to_keep)
+
+
+    # If necessary, filter the category descriptions
+    if 'classification_category_descriptions' in data:
+        output_classification_category_descriptions = {}
+        for category_id in data['classification_category_descriptions']:
+            if category_id in classification_category_ids_to_keep:
+                output_classification_category_descriptions[category_id] = \
+                    data['classification_category_descriptions'][category_id]
+        data['classification_category_descriptions'] = output_classification_category_descriptions
+
+    # Filter images
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            classifications_to_keep = []
+            for classification in det['classifications']:
+                if classification[0] in classification_category_ids_to_keep:
+                    classifications_to_keep.append(classification)
+            det['classifications'] = classifications_to_keep
+
+    return data
+
+# ...def remove_classification_categories_below_count(...)
 
 
 def subset_json_detector_output_by_confidence(data, options):
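For orientation, here is a minimal sketch of how the new count-based filter above might be driven programmatically. The class and functions are the ones defined in this file; the filenames and the threshold value are hypothetical.

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, subset_json_detector_output

options = SubsetJsonDetectorOutputOptions()

# Drop classification categories with <= 10 instances; per the docstring,
# category IDs are deliberately not re-mapped, so downstream tools will see
# a sparse ID space.
options.remove_classification_categories_below_count = 10

# 'md-results.json' is a hypothetical MD results file
subset_json_detector_output('md-results.json', 'md-results-subset.json', options)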
@@ -172,7 +278,7 @@ def subset_json_detector_output_by_confidence(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     if options.confidence_threshold is None:
@@ -234,9 +340,55 @@ def subset_json_detector_output_by_confidence(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_confidence()
+# ...def subset_json_detector_output_by_confidence(...)
+
+
+def subset_json_detector_output_by_list(data, options):
+    """
+    Keeps only files in options.keep_files_in_list, which can be a .json results file or a folder.
+    Assumes that the input .json file contains relative paths when comparing to a folder.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.keep_files_in_list is None:
+        return
+
+    files_to_keep = None
+
+    if os.path.isfile(options.keep_files_in_list):
+        with open(options.keep_files_in_list,'r') as f:
+            d = json.load(f)
+        files_to_keep = [im['file'] for im in d['images']]
+    elif os.path.isdir(options.keep_files_in_list):
+        files_to_keep = \
+            recursive_file_list(options.keep_files_in_list,return_relative_paths=True)
+    else:
+        raise ValueError('Subsetting .json file by list: {} is neither a .json results file nor a folder'.format(
+            options.keep_files_in_list))
+
+    files_to_keep = [fn.replace('\\','/') for fn in files_to_keep]
+    files_to_keep_set = set(files_to_keep)
+
+    images_to_keep = []
+
+    for im in data['images']:
+        fn = im['file'].replace('\\','/')
+        if fn in files_to_keep_set:
+            images_to_keep.append(im)
+
+    data['images'] = images_to_keep
+
+    return data
 
+# ...def subset_json_detector_output_by_list(...)
 
+
 def subset_json_detector_output_by_categories(data, options):
     """
     Removes all detections without detections above a threshold for specific categories.
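A companion sketch for the list-based filter added above, continuing the previous example with hypothetical paths: a folder argument keeps only images that exist on disk (relative paths assumed), while a .json argument keeps only images present in that results file.

options = SubsetJsonDetectorOutputOptions()
options.keep_files_in_list = '/data/camera-traps/deployment-01'  # hypothetical folder
subset_json_detector_output('md-results.json', 'md-results-filtered.json', options)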
@@ -246,7 +398,7 @@ def subset_json_detector_output_by_categories(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     # If categories_to_keep is supplied as a list, convert to a dict
@@ -342,7 +494,7 @@ def subset_json_detector_output_by_categories(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_categories()
+# ...def subset_json_detector_output_by_categories(...)
 
 
 def remove_failed_images(data,options):
@@ -354,7 +506,7 @@ def remove_failed_images(data,options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -381,7 +533,7 @@ def remove_failed_images(data,options):
 
     return data
 
-# ...remove_failed_images()
+# ...def remove_failed_images(...)
 
 
 def subset_json_detector_output_by_query(data, options):
@@ -394,7 +546,7 @@ def subset_json_detector_output_by_query(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -441,7 +593,7 @@ def subset_json_detector_output_by_query(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_query()
+# ...def subset_json_detector_output_by_query(...)
 
 
 def subset_json_detector_output(input_filename, output_filename, options, data=None):
@@ -481,10 +633,10 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
         raise ValueError('When splitting by folders, output must be a valid directory name, you specified an existing file')
 
     if data is None:
-        print('Reading
+        print('Reading file {}'.format(input_filename))
         with open(input_filename) as f:
             data = json.load(f)
-        print('
+        print('Read {} images'.format(len(data['images'])))
     if options.debug_max_images > 0:
         print('Trimming to {} images'.format(options.debug_max_images))
         data['images'] = data['images'][:options.debug_max_images]
@@ -500,7 +652,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if options.remove_failed_images:
 
         data = remove_failed_images(data, options)
-
+
     if options.confidence_threshold is not None:
 
         data = subset_json_detector_output_by_confidence(data, options)
@@ -508,6 +660,14 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if (options.categories_to_keep is not None) or (options.category_names_to_keep is not None):
 
         data = subset_json_detector_output_by_categories(data, options)
+
+    if options.remove_classification_categories_below_count is not None:
+
+        data = remove_classification_categories_below_count(data, options)
+
+    if options.keep_files_in_list is not None:
+
+        data = subset_json_detector_output_by_list(data, options)
 
     if not options.split_folders:
 
@@ -615,7 +775,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
 
     # ...if we're splitting folders
 
-# ...subset_json_detector_output()
+# ...def subset_json_detector_output(...)
 
 
 #%% Interactive driver
@@ -676,6 +836,9 @@ def main():
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--keep_files_in_list', type=str, default=None,
+                        help='Keep only files in this list, which can be a .json results file or a folder.' + \
+                             ' Assumes that the input .json file contains relative paths when comparing to a folder.')
     parser.add_argument('--split_folders', action='store_true',
                         help='Split .json files by leaf-node folder')
     parser.add_argument('--split_folder_param', type=int,
@@ -690,6 +853,8 @@ def main():
                         help='When using split_folders and make_folder_relative, copy jsons to their corresponding folders (relative to output_file)')
     parser.add_argument('--create_folders', action='store_true',
                         help='When using copy_jsons_to_folders, create folders that don''t exist')
+    parser.add_argument('--remove_classification_categories_below_count', type=int, default=None,
+                        help='Remove classification categories with less than this many instances (no removal by default)')
 
     if len(sys.argv[1:]) == 0:
        parser.print_help()
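Per the wiring shown above, the two new options compose with the existing ones: the count filter runs after category subsetting, and the list filter runs after that. A combined sketch, again with hypothetical filenames:

options = SubsetJsonDetectorOutputOptions()
options.remove_classification_categories_below_count = 1  # drop empty categories only
options.keep_files_in_list = 'ground-truth-results.json'  # hypothetical results file
subset_json_detector_output('md-results.json', 'md-results-subset.json', options)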
megadetector/postprocessing/validate_batch_results.py CHANGED

@@ -20,11 +20,19 @@ from tqdm import tqdm
 from megadetector.detection.video_utils import is_video_file
 from megadetector.utils.ct_utils import args_to_object, is_list_sorted # noqa
 
-typical_info_fields = ['detector',
-                       '
-                       '
-
-
+typical_info_fields = ['detector',
+                       'detection_completion_time',
+                       'classifier',
+                       'classification_completion_time',
+                       'detection_metadata',
+                       'classifier_metadata']
+
+required_keys = ['info',
+                 'images',
+                 'detection_categories']
+
+typical_keys = ['classification_categories',
+                'classification_category_descriptions']
 
 
 #%% Classes
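These module-level lists read as the validation contract: required_keys must be present in a results file, while typical_keys and typical_info_fields are merely expected. A hedged sketch of that kind of check, not necessarily how validate_batch_results.py itself consumes the lists:

import json

with open('md-results.json') as f:  # hypothetical filename
    data = json.load(f)

for k in required_keys:
    assert k in data, 'Results file is missing required key {}'.format(k)

for k in data.keys():
    if (k not in required_keys) and (k not in typical_keys):
        print('Warning: unexpected key {}'.format(k))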
megadetector/taxonomy_mapping/map_new_lila_datasets.py CHANGED

@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/
+output_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 datasets_to_map = [
-    '
+    'UNSW Predators'
 ]
 
 
@@ -125,6 +125,8 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)
 
+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+
 
 #%% Manual lookup
 
@@ -138,10 +140,8 @@ if False:
 
     #%%
 
-
-
-    # q = 'notamacropus'
-    q = 'insects'
+    q = 'dasyurus maculatus'
+
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
megadetector/taxonomy_mapping/preview_lila_taxonomy.py CHANGED

@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -72,65 +72,10 @@ from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup()
 
 
-#%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
-if False:
-
-    #%%
-
-    source_mappings = ['gbif','manual']
-    target_mapping = 'inat'
-    valid_mappings = ['gbif','inat','manual']
-
-    assert target_mapping in valid_mappings
-    for source_mapping in source_mappings:
-        assert source_mapping != target_mapping and \
-            source_mapping in valid_mappings
-
-    n_remappings = 0
-
-    # i_row = 1; row = df.iloc[i_row]; row
-    for i_row,row in df.iterrows():
-
-        if row['source'] not in source_mappings:
-            continue
-
-        scientific_name = row['scientific_name']
-        old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-        m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-        if m is None or m.source != target_mapping:
-            print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-            continue
-
-        assert m.scientific_name == row['scientific_name']
-
-        if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-            pass
-        else:
-            assert m.taxonomic_level == row['taxonomy_level']
-
-        new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-        if row['taxonomy_string'] != m.taxonomy_string:
-            print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-            n_remappings += 1
-            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-            if row['source'] != 'manual':
-                df.loc[i_row,'source'] = m.source
-
-    # This should be zero for the release .csv
-    print('Made {} remappings'.format(n_remappings))
-
-    #%%
-
-    df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
 #%% Check for mappings that disagree with the taxonomy string
 
+# For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
 df = pd.read_csv(lila_taxonomy_file)
 
 n_taxonomy_changes = 0
megadetector/taxonomy_mapping/species_lookup.py CHANGED

@@ -602,8 +602,17 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for
-    preferences that are specific to our scenario.
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    and preferences that are specific to our scenario.
+
+    Args:
+        query (str): The common or scientific name we want to look up
+        taxonomy_preference (str, optional): 'inat' or 'gbif'
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
+            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
+
+    Returns:
+        TaxonomicMatch: the best taxonomic match, or None
     """
 
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
@@ -616,6 +625,36 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     return m
 
 
+def validate_and_convert(data):
+    """
+    Recursively validates that all elements in the nested structure are only
+    tuples, lists, ints, or np.int64, and converts np.int64 to int.
+
+    Args:
+        data: The nested structure to validate and convert
+
+    Returns:
+        The validated and converted structure
+
+    Raises:
+        TypeError: If an invalid type is encountered
+    """
+
+    if isinstance(data, np.int64):
+        return int(data)
+    elif isinstance(data, int) or isinstance(data, str):
+        return data
+    elif isinstance(data, (list, tuple)):
+        # Process lists and tuples recursively
+        container_type = type(data)
+        return container_type(validate_and_convert(item) for item in data)
+    else:
+        raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                        f"Only int, np.int64, list, and tuple are allowed.")
+
+# ...def validate_and_convert(...)
+
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
 
     query = query.lower().strip().replace('_', ' ')
@@ -760,6 +799,10 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
     # ...if we needed to look in the GBIF taxonomy
 
+    # Convert np.int64's to ints
+    if match is not None:
+        match = validate_and_convert(match)
+
     taxonomy_string = str(match)
 
     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
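The practical payoff of validate_and_convert is that np.int64 values, which json.dump cannot serialize, come back as plain ints while the container shapes are preserved. A quick illustration based directly on the code above:

import numpy as np
from megadetector.taxonomy_mapping.species_lookup import validate_and_convert

nested = [np.int64(3), (1, [np.int64(7), 2])]
converted = validate_and_convert(nested)

assert converted == [3, (1, [7, 2])]
assert type(converted[0]) is int  # no longer np.int64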
megadetector/utils/ct_utils.py CHANGED

@@ -483,7 +483,9 @@ def sort_dictionary_by_key(d,reverse=False):
 def sort_dictionary_by_value(d,sort_values=None,reverse=False):
     """
     Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
-    otherwise uses the dictionary sort_values as the sorting criterion.
+    otherwise uses the dictionary sort_values as the sorting criterion. Always
+    returns a new standard dict, so if [d] is, for example, a defaultdict, the
+    returned value is not.
 
     Args:
         d (dict): dictionary to sort
@@ -492,7 +494,7 @@ def sort_dictionary_by_value(d,sort_values=None,reverse=False):
         reverse (bool, optional): whether to sort in reverse (descending) order
 
     Returns:
-        dict: sorted copy of [d
+        dict: sorted copy of [d
     """
 
     if sort_values is None:
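The clarified docstring is easy to confirm: even when the input is a defaultdict, the sorted result comes back as a plain dict.

from collections import defaultdict
from megadetector.utils.ct_utils import sort_dictionary_by_value

d = defaultdict(int)
d['a'] = 2
d['b'] = 1

sorted_d = sort_dictionary_by_value(d)
assert list(sorted_d.keys()) == ['b', 'a']  # ascending by value
assert type(sorted_d) is dict  # not a defaultdict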
@@ -517,6 +519,52 @@ def invert_dictionary(d):
     return {v: k for k, v in d.items()}
 
 
+def round_floats_in_nested_dict(obj, decimal_places=5):
+    """
+    Recursively rounds all floating point values in a nested structure to the
+    specified number of decimal places. Handles dictionaries, lists, tuples,
+    sets, and other iterables. Modifies mutable objects in place.
+
+    Args:
+        obj: The object to process (can be a dict, list, set, tuple, or primitive value)
+        decimal_places: Number of decimal places to round to (default: 5)
+
+    Returns:
+        The processed object (useful for recursive calls)
+    """
+    if isinstance(obj, dict):
+        for key in obj:
+            obj[key] = round_floats_in_nested_dict(obj[key], decimal_places)
+        return obj
+
+    elif isinstance(obj, list):
+        for i in range(len(obj)):
+            obj[i] = round_floats_in_nested_dict(obj[i], decimal_places)
+        return obj
+
+    elif isinstance(obj, tuple):
+        # Tuples are immutable, so we create a new one
+        return tuple(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, set):
+        # Sets are mutable but we can't modify elements in-place
+        # Convert to list, process, and convert back to set
+        return set(round_floats_in_nested_dict(list(obj), decimal_places))
+
+    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
+        # Handle other iterable types - convert to list, process, and convert back
+        return type(obj)(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, float):
+        return round(obj, decimal_places)
+
+    else:
+        # For other types (int, str, bool, None, etc.), return as is
+        return obj
+
+# ...def round_floats_in_nested_dict(...)
+
+
 def image_file_to_camera_folder(image_fn):
     r"""
     Removes common overflow folders (e.g. RECNX101, RECNX102) from paths, i.e. turn:
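One subtlety worth noting in round_floats_in_nested_dict: dicts and lists are modified in place, while tuples, sets, and other iterables are rebuilt, so a caller holding a reference to a nested list sees it change.

from megadetector.utils.ct_utils import round_floats_in_nested_dict

scores = [0.123456789, 0.987654321]
d = {'scores': scores}

round_floats_in_nested_dict(d, decimal_places=3)
assert scores == [0.123, 0.988]  # the original list was rounded in place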
@@ -780,7 +828,7 @@ def dict_to_kvp_list(d,
     if len(d) == 0:
         return ''
 
-    s =
+    s = None
     for k in d.keys():
         assert isinstance(k,str), 'Input {} is not a str <--> str dict'.format(str(d))
         v = d[k]
@@ -800,6 +848,9 @@ def dict_to_kvp_list(d,
             s += item_separator
         s += k + kv_separator + v
 
+    if s is None:
+        s = ''
+
 
     return s
 
@@ -856,3 +907,25 @@ def __module_test__():
     L = [{'a':5},{'a':0},{'a':10}]
     k = 'a'
     sort_list_of_dicts_by_key(L, k, reverse=True)
+
+
+    ##%% Test float rounding
+
+    # Example with mixed collection types
+    data = {
+        "name": "Project X",
+        "values": [1.23456789, 2.3456789],
+        "tuple_values": (3.45678901, 4.56789012),
+        "set_values": {5.67890123, 6.78901234},
+        "metrics": {
+            "score": 98.7654321,
+            "components": [5.6789012, 6.7890123]
+        }
+    }
+
+    result = round_floats_in_nested_dict(data)
+    assert result['values'][0] == 1.23457
+    assert result['tuple_values'][0] == 3.45679
+    assert min(list(result['set_values'])) == 5.6789
+
megadetector/utils/directory_listing.py CHANGED

@@ -17,9 +17,6 @@ import sys
 import argparse
 import re
 
-import azure.common
-from azure.storage.blob import BlobServiceClient, ContentSettings
-
 from megadetector.utils.path_utils import is_image_file
 
 
@@ -139,6 +136,8 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
     # If we want to set the content type in blob storage using a SAS URL
     if sas_url:
 
+        from azure.storage.blob import BlobServiceClient, ContentSettings
+
         # Example: sas_url = 'https://accname.blob.core.windows.net/bname/path/to/folder?st=...&se=...&sp=...&...'
         if '?' in sas_url:
             # 'https://accname.blob.core.windows.net/bname/path/to/folder' and 'st=...&se=...&sp=...&...'
@@ -196,6 +195,7 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
 
     # Set content type in blob storage
     if sas_url:
+        import azure.common
         if container_folder:
             output_blob_path = container_folder + '/' + output_file[len(dir) + 1:]
         else:
@@ -237,7 +237,7 @@ def main():
     args = parser.parse_args()
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
-    assert re.match('https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
+    assert re.match(r'https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
         "match the format https://accname.blob.core.windows.net/bname/path/to/folder?..."
 
     traverse_and_create_index(args.directory, overwrite_files=args.enable_overwrite, sas_url=args.sas_url, basepath=args.basepath)
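Two small notes on this last file: moving the azure imports into the sas_url branches means the module now imports cleanly without the Azure SDK installed, and the r'' prefix on the regex avoids the invalid-escape-sequence warning that newer Python versions emit for \. in a non-raw string. A minimal sketch of the deferred-import pattern, using a hypothetical helper name:

def _set_content_types(sas_url=None):
    # Hypothetical helper illustrating the pattern used above: the Azure SDK
    # is only imported when a SAS URL is actually supplied, so everything
    # else in the module works without azure-storage-blob installed.
    if not sas_url:
        return
    from azure.storage.blob import BlobServiceClient, ContentSettings
    ...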