megadetector 5.0.24__py3-none-any.whl → 5.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (41)
  1. megadetector/data_management/cct_json_utils.py +15 -2
  2. megadetector/data_management/coco_to_yolo.py +53 -31
  3. megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
  4. megadetector/data_management/databases/integrity_check_json_db.py +2 -2
  5. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +73 -69
  6. megadetector/data_management/lila/add_locations_to_nacti.py +114 -110
  7. megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
  8. megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
  9. megadetector/data_management/remap_coco_categories.py +60 -11
  10. megadetector/data_management/{wi_to_md.py → speciesnet_to_md.py} +2 -2
  11. megadetector/data_management/yolo_to_coco.py +45 -15
  12. megadetector/detection/run_detector.py +1 -0
  13. megadetector/detection/run_detector_batch.py +5 -4
  14. megadetector/postprocessing/classification_postprocessing.py +788 -524
  15. megadetector/postprocessing/compare_batch_results.py +176 -9
  16. megadetector/postprocessing/create_crop_folder.py +420 -0
  17. megadetector/postprocessing/load_api_results.py +4 -1
  18. megadetector/postprocessing/md_to_coco.py +1 -1
  19. megadetector/postprocessing/postprocess_batch_results.py +158 -44
  20. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
  21. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  22. megadetector/postprocessing/separate_detections_into_folders.py +20 -4
  23. megadetector/postprocessing/subset_json_detector_output.py +180 -15
  24. megadetector/postprocessing/validate_batch_results.py +13 -5
  25. megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
  26. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
  27. megadetector/taxonomy_mapping/species_lookup.py +45 -2
  28. megadetector/utils/ct_utils.py +76 -3
  29. megadetector/utils/directory_listing.py +4 -4
  30. megadetector/utils/gpu_test.py +21 -3
  31. megadetector/utils/md_tests.py +142 -49
  32. megadetector/utils/path_utils.py +342 -19
  33. megadetector/utils/wi_utils.py +1286 -212
  34. megadetector/visualization/visualization_utils.py +16 -4
  35. megadetector/visualization/visualize_db.py +1 -1
  36. megadetector/visualization/visualize_detector_output.py +1 -4
  37. {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/METADATA +6 -3
  38. {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/RECORD +41 -40
  39. {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/WHEEL +1 -1
  40. {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info/licenses}/LICENSE +0 -0
  41. {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/top_level.txt +0 -0
@@ -61,9 +61,11 @@ import os
 import re
 
 from tqdm import tqdm
+from collections import defaultdict
 
 from megadetector.utils.ct_utils import args_to_object, get_max_conf, invert_dictionary
 from megadetector.utils.path_utils import top_level_folder
+from megadetector.utils.path_utils import recursive_file_list
 
 
 #%% Helper classes
@@ -136,7 +138,18 @@ class SubsetJsonDetectorOutputOptions:
 
         #: Set to >0 during testing to limit the number of images that get processed.
         self.debug_max_images = -1
+
+        #: Keep only files in this list, which can be a .json results file or a folder.
+        #
+        #: Assumes that the input .json file contains relative paths when comparing to a folder.
+        self.keep_files_in_list = None
+
+        #: Remove classification with <= N instances. Does not re-map categories
+        #: to be contiguous. Set to 1 to remove empty categories only.
+        self.remove_classification_categories_below_count = None
 
+# ...class SubsetJsonDetectorOutputOptions
+
 
 #%% Main function
 
@@ -156,11 +169,104 @@ def _write_detection_results(data, output_filename, options):
     else:
         os.makedirs(basedir, exist_ok=True)
 
-    print('Writing detection output to {}'.format(output_filename))
-    with open(output_filename, 'w') as f:
+    n_images = len(data['images'])
+
+    print('Writing detection output (with {} images) to {}'.format(n_images,output_filename))
+    with open(output_filename, 'w', newline='\n') as f:
         json.dump(data,f,indent=1)
 
-# ..._write_detection_results()
+# ...def _write_detection_results(...)
+
+
+def remove_classification_categories_below_count(data, options):
+    """
+    Removes all classification categories below a threshold count. Does not re-map
+    classification category IDs.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.remove_classification_categories_below_count is None:
+        return data
+    if 'classification_categories' not in data:
+        return data
+
+    classification_category_id_to_count = {}
+
+    for classification_category_id in data['classification_categories']:
+        classification_category_id_to_count[classification_category_id] = 0
+
+    # Count the number of occurrences of each classification category
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            for classification in det['classifications']:
+                classification_category_id_to_count[classification[0]] = \
+                    classification_category_id_to_count[classification[0]] + 1
+
+
+    # Which categories have above-threshold counts?
+    classification_category_ids_to_keep = set()
+
+    for classification_category_id in classification_category_id_to_count:
+        if classification_category_id_to_count[classification_category_id] > \
+            options.remove_classification_categories_below_count:
+            classification_category_ids_to_keep.add(classification_category_id)
+
+    n_categories_removed = \
+        len(classification_category_id_to_count) - \
+        len(classification_category_ids_to_keep)
+
+    print('Removing {} of {} classification categories'.format(
+        n_categories_removed,len(classification_category_id_to_count)))
+
+    if n_categories_removed == 0:
+        return data
+
+
+    # Filter the category list
+    output_classification_categories = {}
+    for category_id in data['classification_categories']:
+        if category_id in classification_category_ids_to_keep:
+            output_classification_categories[category_id] = \
+                data['classification_categories'][category_id]
+    data['classification_categories'] = output_classification_categories
+    assert len(data['classification_categories']) == len(classification_category_ids_to_keep)
+
+
+    # If necessary, filter the category descriptions
+    if 'classification_category_descriptions' in data:
+        output_classification_category_descriptions = {}
+        for category_id in data['classification_category_descriptions']:
+            if category_id in classification_category_ids_to_keep:
+                output_classification_category_descriptions[category_id] = \
+                    data['classification_category_descriptions'][category_id]
+        data['classification_category_descriptions'] = output_classification_category_descriptions
+
+    # Filter images
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            classifications_to_keep = []
+            for classification in det['classifications']:
+                if classification[0] in classification_category_ids_to_keep:
+                    classifications_to_keep.append(classification)
+            det['classifications'] = classifications_to_keep
+
+    return data
+
+# ...def remove_classification_categories_below_count(...)
 
 
 def subset_json_detector_output_by_confidence(data, options):
@@ -172,7 +278,7 @@ def subset_json_detector_output_by_confidence(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     if options.confidence_threshold is None:
@@ -234,9 +340,55 @@ def subset_json_detector_output_by_confidence(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_confidence()
+# ...def subset_json_detector_output_by_confidence(...)
+
+
+def subset_json_detector_output_by_list(data, options):
+    """
+    Keeps only files in options.keep_files_in_list, which can be a .json results file or a folder.
+    Assumes that the input .json file contains relative paths when comparing to a folder.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.keep_files_in_list is None:
+        return
+
+    files_to_keep = None
+
+    if os.path.isfile(options.keep_files_in_list):
+        with open(options.keep_files_in_list,'r') as f:
+            d = json.load(f)
+        files_to_keep = [im['file'] for im in d['images']]
+    elif os.path.isdir(options.keep_files_in_list):
+        files_to_keep = \
+            recursive_file_list(options.keep_files_in_list,return_relative_paths=True)
+    else:
+        raise ValueError('Subsetting .json file by list: {} is neither a .json results file nor a folder'.format(
+            options.keep_files_in_list))
+
+    files_to_keep = [fn.replace('\\','/') for fn in files_to_keep]
+    files_to_keep_set = set(files_to_keep)
+
+    images_to_keep = []
+
+    for im in data['images']:
+        fn = im['file'].replace('\\','/')
+        if fn in files_to_keep_set:
+            images_to_keep.append(im)
+
+    data['images'] = images_to_keep
+
+    return data
 
+# ...def subset_json_detector_output_by_list(...)
 
+
 def subset_json_detector_output_by_categories(data, options):
     """
     Removes all detections without detections above a threshold for specific categories.
@@ -246,7 +398,7 @@ def subset_json_detector_output_by_categories(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     # If categories_to_keep is supplied as a list, convert to a dict
@@ -342,7 +494,7 @@ def subset_json_detector_output_by_categories(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_categories()
+# ...def subset_json_detector_output_by_categories(...)
 
 
 def remove_failed_images(data,options):
@@ -354,7 +506,7 @@ def remove_failed_images(data,options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -381,7 +533,7 @@ def remove_failed_images(data,options):
 
     return data
 
-# ...remove_failed_images()
+# ...def remove_failed_images(...)
 
 
 def subset_json_detector_output_by_query(data, options):
@@ -394,7 +546,7 @@ def subset_json_detector_output_by_query(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -441,7 +593,7 @@ def subset_json_detector_output_by_query(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_query()
+# ...def subset_json_detector_output_by_query(...)
 
 
 def subset_json_detector_output(input_filename, output_filename, options, data=None):
@@ -481,10 +633,10 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
             raise ValueError('When splitting by folders, output must be a valid directory name, you specified an existing file')
 
     if data is None:
-        print('Reading json...', end='')
+        print('Reading file {}'.format(input_filename))
         with open(input_filename) as f:
             data = json.load(f)
-        print(' ...done, read {} images'.format(len(data['images'])))
+        print('Read {} images'.format(len(data['images'])))
         if options.debug_max_images > 0:
             print('Trimming to {} images'.format(options.debug_max_images))
             data['images'] = data['images'][:options.debug_max_images]
@@ -500,7 +652,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
     if options.remove_failed_images:
 
         data = remove_failed_images(data, options)
-
+
     if options.confidence_threshold is not None:
 
         data = subset_json_detector_output_by_confidence(data, options)
@@ -508,6 +660,14 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
     if (options.categories_to_keep is not None) or (options.category_names_to_keep is not None):
 
         data = subset_json_detector_output_by_categories(data, options)
+
+    if options.remove_classification_categories_below_count is not None:
+
+        data = remove_classification_categories_below_count(data, options)
+
+    if options.keep_files_in_list is not None:
+
+        data = subset_json_detector_output_by_list(data, options)
 
     if not options.split_folders:
 
@@ -615,7 +775,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
 
     # ...if we're splitting folders
 
-# ...subset_json_detector_output()
+# ...def subset_json_detector_output(...)
 
 
 #%% Interactive driver
@@ -676,6 +836,9 @@ def main():
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--keep_files_in_list', type=str, default=None,
+                        help='Keep only files in this list, which can be a .json results file or a folder.' + \
+                        ' Assumes that the input .json file contains relative paths when comparing to a folder.')
     parser.add_argument('--split_folders', action='store_true',
                         help='Split .json files by leaf-node folder')
     parser.add_argument('--split_folder_param', type=int,
@@ -690,6 +853,8 @@ def main():
                         help='When using split_folders and make_folder_relative, copy jsons to their corresponding folders (relative to output_file)')
     parser.add_argument('--create_folders', action='store_true',
                         help='When using copy_jsons_to_folders, create folders that don''t exist')
+    parser.add_argument('--remove_classification_categories_below_count', type=int, default=None,
+                        help='Remove classification categories with less than this many instances (no removal by default)')
 
     if len(sys.argv[1:]) == 0:
         parser.print_help()
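
The two options added above (keep_files_in_list and remove_classification_categories_below_count) can also be used programmatically. A minimal sketch, using the class, function, and attribute names from the diff above; the input and output paths are hypothetical:

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, subset_json_detector_output

options = SubsetJsonDetectorOutputOptions()

# Keep only images whose (relative) filenames also appear in another results file or folder
options.keep_files_in_list = 'other_results.json'

# Remove classification categories with <= 1 instance
options.remove_classification_categories_below_count = 1

subset_json_detector_output('md_results.json', 'md_results_subset.json', options)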
@@ -20,11 +20,19 @@ from tqdm import tqdm
 from megadetector.detection.video_utils import is_video_file
 from megadetector.utils.ct_utils import args_to_object, is_list_sorted # noqa
 
-typical_info_fields = ['detector','detection_completion_time',
-                       'classifier','classification_completion_time',
-                       'detection_metadata','classifier_metadata']
-required_keys = ['info','images','detection_categories']
-typical_keys = ['classification_categories']
+typical_info_fields = ['detector',
+                       'detection_completion_time',
+                       'classifier',
+                       'classification_completion_time',
+                       'detection_metadata',
+                       'classifier_metadata']
+
+required_keys = ['info',
+                 'images',
+                 'detection_categories']
+typical_keys = ['classification_categories',
+                'classification_category_descriptions']
+
 
 
 #%% Classes
@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
+output_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 datasets_to_map = [
-    'Seattle(ish) Camera Traps'
+    'UNSW Predators'
     ]
 
 
@@ -125,6 +125,8 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)
 
+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+
 
 
 #%% Manual lookup
@@ -138,10 +140,8 @@ if False:
 
     #%%
 
-    # q = 'white-throated monkey'
-    # q = 'cingulata'
-    # q = 'notamacropus'
-    q = 'insects'
+    q = 'dasyurus maculatus'
+
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -72,65 +72,10 @@ from megadetector.taxonomy_mapping.species_lookup import \
 initialize_taxonomy_lookup()
 
 
-#%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
-if False:
-
-    #%%
-
-    source_mappings = ['gbif','manual']
-    target_mapping = 'inat'
-    valid_mappings = ['gbif','inat','manual']
-
-    assert target_mapping in valid_mappings
-    for source_mapping in source_mappings:
-        assert source_mapping != target_mapping and \
-            source_mapping in valid_mappings
-
-    n_remappings = 0
-
-    # i_row = 1; row = df.iloc[i_row]; row
-    for i_row,row in df.iterrows():
-
-        if row['source'] not in source_mappings:
-            continue
-
-        scientific_name = row['scientific_name']
-        old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-        m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-        if m is None or m.source != target_mapping:
-            print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-            continue
-
-        assert m.scientific_name == row['scientific_name']
-
-        if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-            pass
-        else:
-            assert m.taxonomic_level == row['taxonomy_level']
-
-        new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-        if row['taxonomy_string'] != m.taxonomy_string:
-            print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-            n_remappings += 1
-            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-            if row['source'] != 'manual':
-                df.loc[i_row,'source'] = m.source
-
-    # This should be zero for the release .csv
-    print('Made {} remappings'.format(n_remappings))
-
-    #%%
-
-    df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
 #%% Check for mappings that disagree with the taxonomy string
 
+# For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
 df = pd.read_csv(lila_taxonomy_file)
 n_taxonomy_changes = 0
  n_taxonomy_changes = 0
@@ -602,8 +602,17 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for species_lookup.py, but expressing a variety of heuristics and
-    preferences that are specific to our scenario.
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    and preferences that are specific to our scenario.
+
+    Args:
+        query (str): The common or scientific name we want to look up
+        taxonomy_preference (str, optional): 'inat' or 'gbif'
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
+            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
+
+    Returns:
+        TaxonomicMatch: the best taxonomic match, or None
     """
 
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
@@ -616,6 +625,36 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
     return m
 
 
+def validate_and_convert(data):
+    """
+    Recursively validates that all elements in the nested structure are only
+    tuples, lists, ints, or np.int64, and converts np.int64 to int.
+
+    Args:
+        data: The nested structure to validate and convert
+
+    Returns:
+        The validated and converted structure
+
+    Raises:
+        TypeError: If an invalid type is encountered
+    """
+
+    if isinstance(data, np.int64):
+        return int(data)
+    elif isinstance(data, int) or isinstance(data, str):
+        return data
+    elif isinstance(data, (list, tuple)):
+        # Process lists and tuples recursively
+        container_type = type(data)
+        return container_type(validate_and_convert(item) for item in data)
+    else:
+        raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                        f"Only int, np.int64, list, and tuple are allowed.")
+
+# ...def validate_and_convert(...)
+
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
 
     query = query.lower().strip().replace('_', ' ')
@@ -760,6 +799,10 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
     # ...if we needed to look in the GBIF taxonomy
 
+    # Convert np.int64's to ints
+    if match is not None:
+        match = validate_and_convert(match)
+
     taxonomy_string = str(match)
 
     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
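
The expanded get_preferred_taxonomic_match docstring above documents the lookup entry point. A minimal usage sketch, assuming the taxonomy tables are initialized the way the preview script above does it; the query string is the one used in map_new_lila_datasets.py:

from megadetector.taxonomy_mapping.species_lookup import \
    initialize_taxonomy_lookup, get_preferred_taxonomic_match

initialize_taxonomy_lookup()

m = get_preferred_taxonomic_match('dasyurus maculatus', taxonomy_preference='inat')
if m is not None:
    print(m.scientific_name)
    print(m.taxonomy_string)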
@@ -483,7 +483,9 @@ def sort_dictionary_by_key(d,reverse=False):
 def sort_dictionary_by_value(d,sort_values=None,reverse=False):
     """
     Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
-    otherwise uses the dictionary sort_values as the sorting criterion.
+    otherwise uses the dictionary sort_values as the sorting criterion. Always
+    returns a new standard dict, so if [d] is, for example, a defaultdict, the
+    returned value is not.
 
     Args:
         d (dict): dictionary to sort
@@ -492,7 +494,7 @@ def sort_dictionary_by_value(d,sort_values=None,reverse=False):
         reverse (bool, optional): whether to sort in reverse (descending) order
 
     Returns:
-        dict: sorted copy of [d]
+        dict: sorted copy of [d
     """
 
     if sort_values is None:
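
The behavioral note added to sort_dictionary_by_value above (the result is always a plain dict, even when the input is a defaultdict) is easiest to see in isolation. A small sketch with hypothetical keys and values:

from collections import defaultdict
from megadetector.utils.ct_utils import sort_dictionary_by_value

d = defaultdict(int)
d['a'] += 3
d['b'] += 1

sorted_d = sort_dictionary_by_value(d, reverse=True)

# Sorted by value (descending), and no longer a defaultdict
assert list(sorted_d.keys()) == ['a', 'b']
assert isinstance(sorted_d, dict) and not isinstance(sorted_d, defaultdict)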
@@ -517,6 +519,52 @@ def invert_dictionary(d):
     return {v: k for k, v in d.items()}
 
 
+def round_floats_in_nested_dict(obj, decimal_places=5):
+    """
+    Recursively rounds all floating point values in a nested structure to the
+    specified number of decimal places. Handles dictionaries, lists, tuples,
+    sets, and other iterables. Modifies mutable objects in place.
+
+    Args:
+        obj: The object to process (can be a dict, list, set, tuple, or primitive value)
+        decimal_places: Number of decimal places to round to (default: 5)
+
+    Returns:
+        The processed object (useful for recursive calls)
+    """
+    if isinstance(obj, dict):
+        for key in obj:
+            obj[key] = round_floats_in_nested_dict(obj[key], decimal_places)
+        return obj
+
+    elif isinstance(obj, list):
+        for i in range(len(obj)):
+            obj[i] = round_floats_in_nested_dict(obj[i], decimal_places)
+        return obj
+
+    elif isinstance(obj, tuple):
+        # Tuples are immutable, so we create a new one
+        return tuple(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, set):
+        # Sets are mutable but we can't modify elements in-place
+        # Convert to list, process, and convert back to set
+        return set(round_floats_in_nested_dict(list(obj), decimal_places))
+
+    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
+        # Handle other iterable types - convert to list, process, and convert back
+        return type(obj)(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, float):
+        return round(obj, decimal_places)
+
+    else:
+        # For other types (int, str, bool, None, etc.), return as is
+        return obj
+
+# ...def round_floats_in_nested_dict(...)
+
+
 def image_file_to_camera_folder(image_fn):
     r"""
     Removes common overflow folders (e.g. RECNX101, RECNX102) from paths, i.e. turn:
@@ -780,7 +828,7 @@ def dict_to_kvp_list(d,
     if len(d) == 0:
         return ''
 
-    s = ''
+    s = None
     for k in d.keys():
         assert isinstance(k,str), 'Input {} is not a str <--> str dict'.format(str(d))
         v = d[k]
@@ -800,6 +848,9 @@ def dict_to_kvp_list(d,
             s += item_separator
         s += k + kv_separator + v
 
+    if s is None:
+        s = ''
+
     return s
 
 
@@ -856,3 +907,25 @@ def __module_test__():
     L = [{'a':5},{'a':0},{'a':10}]
     k = 'a'
     sort_list_of_dicts_by_key(L, k, reverse=True)
+
+
+    ##%% Test float rounding
+
+    # Example with mixed collection types
+    data = {
+        "name": "Project X",
+        "values": [1.23456789, 2.3456789],
+        "tuple_values": (3.45678901, 4.56789012),
+        "set_values": {5.67890123, 6.78901234},
+        "metrics": {
+            "score": 98.7654321,
+            "components": [5.6789012, 6.7890123]
+        }
+    }
+
+    result = round_floats_in_nested_dict(data)
+    assert result['values'][0] == 1.23457
+    assert result['tuple_values'][0] == 3.45679
+    assert min(list(result['set_values'])) == 5.6789
+
+
@@ -17,9 +17,6 @@ import sys
 import argparse
 import re
 
-import azure.common
-from azure.storage.blob import BlobServiceClient, ContentSettings
-
 from megadetector.utils.path_utils import is_image_file
 
 
@@ -139,6 +136,8 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
     # If we want to set the content type in blob storage using a SAS URL
     if sas_url:
 
+        from azure.storage.blob import BlobServiceClient, ContentSettings
+
         # Example: sas_url = 'https://accname.blob.core.windows.net/bname/path/to/folder?st=...&se=...&sp=...&...'
         if '?' in sas_url:
             # 'https://accname.blob.core.windows.net/bname/path/to/folder' and 'st=...&se=...&sp=...&...'
@@ -196,6 +195,7 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
 
         # Set content type in blob storage
         if sas_url:
+            import azure.common
             if container_folder:
                 output_blob_path = container_folder + '/' + output_file[len(dir) + 1:]
             else:
@@ -237,7 +237,7 @@ def main():
     args = parser.parse_args()
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
-    assert re.match('https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
+    assert re.match(r'https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
         "match the format https://accname.blob.core.windows.net/bname/path/to/folder?..."
 
     traverse_and_create_index(args.directory, overwrite_files=args.enable_overwrite, sas_url=args.sas_url, basepath=args.basepath)