megadetector-5.0.25-py3-none-any.whl → megadetector-5.0.26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (34)
  1. megadetector/data_management/cct_json_utils.py +15 -2
  2. megadetector/data_management/coco_to_yolo.py +53 -31
  3. megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
  4. megadetector/data_management/databases/integrity_check_json_db.py +2 -2
  5. megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
  6. megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
  7. megadetector/data_management/remap_coco_categories.py +60 -11
  8. megadetector/data_management/yolo_to_coco.py +45 -15
  9. megadetector/postprocessing/classification_postprocessing.py +788 -524
  10. megadetector/postprocessing/create_crop_folder.py +95 -33
  11. megadetector/postprocessing/load_api_results.py +4 -1
  12. megadetector/postprocessing/md_to_coco.py +1 -1
  13. megadetector/postprocessing/postprocess_batch_results.py +156 -42
  14. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
  15. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  16. megadetector/postprocessing/separate_detections_into_folders.py +20 -4
  17. megadetector/postprocessing/subset_json_detector_output.py +180 -15
  18. megadetector/postprocessing/validate_batch_results.py +13 -5
  19. megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
  20. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
  21. megadetector/taxonomy_mapping/species_lookup.py +45 -2
  22. megadetector/utils/ct_utils.py +4 -2
  23. megadetector/utils/directory_listing.py +1 -1
  24. megadetector/utils/md_tests.py +2 -1
  25. megadetector/utils/path_utils.py +308 -19
  26. megadetector/utils/wi_utils.py +363 -186
  27. megadetector/visualization/visualization_utils.py +2 -1
  28. megadetector/visualization/visualize_db.py +1 -1
  29. megadetector/visualization/visualize_detector_output.py +1 -4
  30. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/METADATA +4 -3
  31. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/RECORD +34 -34
  32. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/WHEEL +1 -1
  33. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info/licenses}/LICENSE +0 -0
  34. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py

@@ -181,7 +181,7 @@ class RepeatDetectionOptions:
         #: Original size is preserved if this is None.
         #:
         #: This does *not* include the tile image grid.
-        self.maxOutputImageWidth = None
+        self.maxOutputImageWidth = 2000

         #: Line thickness (in pixels) for box rendering
         self.lineThickness = 10

@@ -256,7 +256,7 @@ class RepeatDetectionOptions:
         self.detectionTilesPrimaryImageLocation = 'right'

         #: Maximum number of individual detection instances to include in the mosaic
-        self.detectionTilesMaxCrops = 250
+        self.detectionTilesMaxCrops = 150

         #: If bRenderOtherDetections is True, what color should we use to render the
         #: (hopefully pretty subtle) non-target detections?
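Both changes affect rendering defaults only: preview images are now capped at 2000 pixels wide, and detection mosaics include at most 150 crops. A minimal sketch for restoring the pre-5.0.26 behavior (import path inferred from the file list above; attribute names as shown in the diff):

    from megadetector.postprocessing.repeat_detection_elimination.repeat_detections_core import \
        RepeatDetectionOptions

    options = RepeatDetectionOptions()
    # Pre-5.0.26 defaults: no width cap, up to 250 crops per mosaic
    options.maxOutputImageWidth = None
    options.detectionTilesMaxCrops = 250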
megadetector/postprocessing/separate_detections_into_folders.py

@@ -86,6 +86,7 @@ from functools import partial
 from tqdm import tqdm

 from megadetector.utils.ct_utils import args_to_object, is_float
+from megadetector.utils.path_utils import remove_empty_folders
 from megadetector.detection.run_detector import get_typical_confidence_threshold_from_results
 from megadetector.visualization import visualization_utils as vis_utils
 from megadetector.visualization.visualization_utils import blur_detections

@@ -167,7 +168,7 @@ class SeparateDetectionsIntoFoldersOptions:
         #:
         #: deer=0.75,cow=0.75
         #:
-        #: Converted internally to a dict mapping name:threshold
+        #: String, converted internally to a dict mapping name:threshold
         self.classification_thresholds = None

         ## Debug or internal attributes

@@ -194,6 +195,10 @@ class SeparateDetectionsIntoFoldersOptions:
         #: Can also be a comma-separated list.
         self.category_names_to_blur = None

+        #: Remove all empty folders from the target folder at the end of the process,
+        #: whether or not they were created by this script
+        self.remove_empty_folders = False
+
     # ...__init__()

 # ...class SeparateDetectionsIntoFoldersOptions

@@ -319,7 +324,7 @@ def _process_detections(im,options):
             classification_category_id = classification[0]
             classification_confidence = classification[1]
-
+
             # Do we have a threshold for this category, and if so, is
             # this classification above threshold?
             assert options.classification_category_id_to_name is not None

@@ -521,7 +526,11 @@ def separate_detections_into_folders(options):
     for category_name in category_names:

         # Do we have a custom threshold for this category?
-        assert category_name in options.category_name_to_threshold
+        if category_name not in options.category_name_to_threshold:
+            print('Warning: category {} in detection file, but not in threshold mapping'.format(
+                category_name))
+            options.category_name_to_threshold[category_name] = None
+
         if options.category_name_to_threshold[category_name] is None:
             options.category_name_to_threshold[category_name] = default_threshold

@@ -584,7 +593,7 @@ def separate_detections_into_folders(options):

     # ...for each token

-    options.classification_thresholds = classification_thresholds
+    options.classification_thresholds = classification_thresholds

     # ...if classification thresholds are still in string format

@@ -611,6 +620,10 @@ def separate_detections_into_folders(options):
         pool = ThreadPool(options.n_threads)
         process_detections_with_options = partial(_process_detections, options=options)
         _ = list(tqdm(pool.imap(process_detections_with_options, images), total=len(images)))
+
+    if options.remove_empty_folders:
+        print('Removing empty folders from {}'.format(options.base_output_folder))
+        remove_empty_folders(options.base_output_folder)

 # ...def separate_detections_into_folders

@@ -715,6 +728,9 @@ def main():
                             default_box_expansion))
     parser.add_argument('--category_names_to_blur', type=str, default=None,
                         help='Comma-separated list of category names to blur (or a single category name, e.g. "person")')
+    parser.add_argument('--remove_empty_folders', action='store_true',
+                        help='Remove all empty folders from the target folder at the end of the process, ' + \
+                             'whether or not they were created by this script')

     if len(sys.argv[1:])==0:
         parser.print_help()
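The new cleanup option is also available programmatically; a hedged sketch (input-file options are omitted for brevity, and only the attribute names that appear in the diff above are assumed):

    from megadetector.postprocessing.separate_detections_into_folders import \
        SeparateDetectionsIntoFoldersOptions, separate_detections_into_folders

    options = SeparateDetectionsIntoFoldersOptions()
    options.base_output_folder = '/path/to/output'
    # New in 5.0.26: remove empty folders from the target folder when finished
    options.remove_empty_folders = True
    separate_detections_into_folders(options)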
megadetector/postprocessing/subset_json_detector_output.py

@@ -61,9 +61,11 @@ import os
 import re

 from tqdm import tqdm
+from collections import defaultdict

 from megadetector.utils.ct_utils import args_to_object, get_max_conf, invert_dictionary
 from megadetector.utils.path_utils import top_level_folder
+from megadetector.utils.path_utils import recursive_file_list


 #%% Helper classes

@@ -136,7 +138,18 @@ class SubsetJsonDetectorOutputOptions:

         #: Set to >0 during testing to limit the number of images that get processed.
         self.debug_max_images = -1
+
+        #: Keep only files in this list, which can be a .json results file or a folder.
+        #
+        #: Assumes that the input .json file contains relative paths when comparing to a folder.
+        self.keep_files_in_list = None
+
+        #: Remove classification categories with <= N instances. Does not re-map categories
+        #: to be contiguous. Set to 1 to remove empty categories only.
+        self.remove_classification_categories_below_count = None

+# ...class SubsetJsonDetectorOutputOptions
+

 #%% Main function
@@ -156,11 +169,104 @@ def _write_detection_results(data, output_filename, options):
     else:
         os.makedirs(basedir, exist_ok=True)

-    print('Writing detection output to {}'.format(output_filename))
-    with open(output_filename, 'w') as f:
+    n_images = len(data['images'])
+
+    print('Writing detection output (with {} images) to {}'.format(n_images,output_filename))
+    with open(output_filename, 'w', newline='\n') as f:
         json.dump(data,f,indent=1)

-# ..._write_detection_results()
+# ...def _write_detection_results(...)
+
+
+def remove_classification_categories_below_count(data, options):
+    """
+    Removes all classification categories below a threshold count. Does not re-map
+    classification category IDs.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.remove_classification_categories_below_count is None:
+        return data
+    if 'classification_categories' not in data:
+        return data
+
+    classification_category_id_to_count = {}
+
+    for classification_category_id in data['classification_categories']:
+        classification_category_id_to_count[classification_category_id] = 0
+
+    # Count the number of occurrences of each classification category
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            for classification in det['classifications']:
+                classification_category_id_to_count[classification[0]] = \
+                    classification_category_id_to_count[classification[0]] + 1
+
+    # Which categories have above-threshold counts?
+    classification_category_ids_to_keep = set()
+
+    for classification_category_id in classification_category_id_to_count:
+        if classification_category_id_to_count[classification_category_id] > \
+           options.remove_classification_categories_below_count:
+            classification_category_ids_to_keep.add(classification_category_id)
+
+    n_categories_removed = \
+        len(classification_category_id_to_count) - \
+        len(classification_category_ids_to_keep)
+
+    print('Removing {} of {} classification categories'.format(
+        n_categories_removed,len(classification_category_id_to_count)))
+
+    if n_categories_removed == 0:
+        return data
+
+    # Filter the category list
+    output_classification_categories = {}
+    for category_id in data['classification_categories']:
+        if category_id in classification_category_ids_to_keep:
+            output_classification_categories[category_id] = \
+                data['classification_categories'][category_id]
+    data['classification_categories'] = output_classification_categories
+    assert len(data['classification_categories']) == len(classification_category_ids_to_keep)
+
+    # If necessary, filter the category descriptions
+    if 'classification_category_descriptions' in data:
+        output_classification_category_descriptions = {}
+        for category_id in data['classification_category_descriptions']:
+            if category_id in classification_category_ids_to_keep:
+                output_classification_category_descriptions[category_id] = \
+                    data['classification_category_descriptions'][category_id]
+        data['classification_category_descriptions'] = output_classification_category_descriptions
+
+    # Filter images
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            classifications_to_keep = []
+            for classification in det['classifications']:
+                if classification[0] in classification_category_ids_to_keep:
+                    classifications_to_keep.append(classification)
+            det['classifications'] = classifications_to_keep
+
+    return data
+
+# ...def remove_classification_categories_below_count(...)


 def subset_json_detector_output_by_confidence(data, options):
@@ -172,7 +278,7 @@ def subset_json_detector_output_by_confidence(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting

     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """

     if options.confidence_threshold is None:

@@ -234,9 +340,55 @@ def subset_json_detector_output_by_confidence(data, options):

     return data

-# ...subset_json_detector_output_by_confidence()
+# ...def subset_json_detector_output_by_confidence(...)
+
+
+def subset_json_detector_output_by_list(data, options):
+    """
+    Keeps only files in options.keep_files_in_list, which can be a .json results file or a folder.
+    Assumes that the input .json file contains relative paths when comparing to a folder.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.keep_files_in_list is None:
+        return
+
+    files_to_keep = None
+
+    if os.path.isfile(options.keep_files_in_list):
+        with open(options.keep_files_in_list,'r') as f:
+            d = json.load(f)
+        files_to_keep = [im['file'] for im in d['images']]
+    elif os.path.isdir(options.keep_files_in_list):
+        files_to_keep = \
+            recursive_file_list(options.keep_files_in_list,return_relative_paths=True)
+    else:
+        raise ValueError('Subsetting .json file by list: {} is neither a .json results file nor a folder'.format(
+            options.keep_files_in_list))
+
+    files_to_keep = [fn.replace('\\','/') for fn in files_to_keep]
+    files_to_keep_set = set(files_to_keep)
+
+    images_to_keep = []
+
+    for im in data['images']:
+        fn = im['file'].replace('\\','/')
+        if fn in files_to_keep_set:
+            images_to_keep.append(im)
+
+    data['images'] = images_to_keep
+
     return data

+# ...def subset_json_detector_output_by_list(...)

+
 def subset_json_detector_output_by_categories(data, options):
     """
     Removes all detections without detections above a threshold for specific categories.
@@ -246,7 +398,7 @@ def subset_json_detector_output_by_categories(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting

     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """

     # If categories_to_keep is supplied as a list, convert to a dict

@@ -342,7 +494,7 @@ def subset_json_detector_output_by_categories(data, options):

     return data

-# ...subset_json_detector_output_by_categories()
+# ...def subset_json_detector_output_by_categories(...)


 def remove_failed_images(data,options):

@@ -354,7 +506,7 @@ def remove_failed_images(data,options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting

     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """

     images_in = data['images']

@@ -381,7 +533,7 @@ def remove_failed_images(data,options):

     return data

-# ...remove_failed_images()
+# ...def remove_failed_images(...)


 def subset_json_detector_output_by_query(data, options):

@@ -394,7 +546,7 @@ def subset_json_detector_output_by_query(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting

     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """

     images_in = data['images']

@@ -441,7 +593,7 @@ def subset_json_detector_output_by_query(data, options):

     return data

-# ...subset_json_detector_output_by_query()
+# ...def subset_json_detector_output_by_query(...)


 def subset_json_detector_output(input_filename, output_filename, options, data=None):
@@ -481,10 +633,10 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
         raise ValueError('When splitting by folders, output must be a valid directory name, you specified an existing file')

     if data is None:
-        print('Reading json...', end='')
+        print('Reading file {}'.format(input_filename))
         with open(input_filename) as f:
             data = json.load(f)
-        print(' ...done, read {} images'.format(len(data['images'])))
+        print('Read {} images'.format(len(data['images'])))
         if options.debug_max_images > 0:
             print('Trimming to {} images'.format(options.debug_max_images))
             data['images'] = data['images'][:options.debug_max_images]

@@ -500,7 +652,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if options.remove_failed_images:

         data = remove_failed_images(data, options)
-
+
     if options.confidence_threshold is not None:

         data = subset_json_detector_output_by_confidence(data, options)

@@ -508,6 +660,14 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if (options.categories_to_keep is not None) or (options.category_names_to_keep is not None):

         data = subset_json_detector_output_by_categories(data, options)
+
+    if options.remove_classification_categories_below_count is not None:
+
+        data = remove_classification_categories_below_count(data, options)
+
+    if options.keep_files_in_list is not None:
+
+        data = subset_json_detector_output_by_list(data, options)

     if not options.split_folders:

@@ -615,7 +775,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):

     # ...if we're splitting folders

-# ...subset_json_detector_output()
+# ...def subset_json_detector_output(...)


 #%% Interactive driver

@@ -676,6 +836,9 @@ def main():
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--keep_files_in_list', type=str, default=None,
+                        help='Keep only files in this list, which can be a .json results file or a folder.' + \
+                             ' Assumes that the input .json file contains relative paths when comparing to a folder.')
     parser.add_argument('--split_folders', action='store_true',
                         help='Split .json files by leaf-node folder')
     parser.add_argument('--split_folder_param', type=int,

@@ -690,6 +853,8 @@ def main():
                         help='When using split_folders and make_folder_relative, copy jsons to their corresponding folders (relative to output_file)')
     parser.add_argument('--create_folders', action='store_true',
                         help='When using copy_jsons_to_folders, create folders that don''t exist')
+    parser.add_argument('--remove_classification_categories_below_count', type=int, default=None,
+                        help='Remove classification categories with less than this many instances (no removal by default)')

     if len(sys.argv[1:]) == 0:
         parser.print_help()
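Taken together, the two new subsetting features can be exercised either via the new CLI flags or via the options object; a minimal sketch (file paths are illustrative):

    from megadetector.postprocessing.subset_json_detector_output import \
        SubsetJsonDetectorOutputOptions, subset_json_detector_output

    options = SubsetJsonDetectorOutputOptions()
    # Keep only images that also exist in this folder (relative paths assumed)
    options.keep_files_in_list = '/path/to/image/folder'
    # Per the option's comment above, 1 removes empty categories only
    options.remove_classification_categories_below_count = 1
    subset_json_detector_output('results.json', 'results_subset.json', options)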
megadetector/postprocessing/validate_batch_results.py

@@ -20,11 +20,19 @@ from tqdm import tqdm
 from megadetector.detection.video_utils import is_video_file
 from megadetector.utils.ct_utils import args_to_object, is_list_sorted # noqa

-typical_info_fields = ['detector','detection_completion_time',
-                       'classifier','classification_completion_time',
-                       'detection_metadata','classifier_metadata']
-required_keys = ['info','images','detection_categories']
-typical_keys = ['classification_categories']
+typical_info_fields = ['detector',
+                       'detection_completion_time',
+                       'classifier',
+                       'classification_completion_time',
+                       'detection_metadata',
+                       'classifier_metadata']
+
+required_keys = ['info',
+                 'images',
+                 'detection_categories']
+
+typical_keys = ['classification_categories',
+                'classification_category_descriptions']


 #%% Classes
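For reference, a minimal results structure that satisfies required_keys would look roughly like this (field values are illustrative; real results are read from a .json file):

    minimal_results = {
        'info': {'detector': 'md_v5a.0.0.pt'},
        'detection_categories': {'1': 'animal', '2': 'person', '3': 'vehicle'},
        'images': [{'file': 'a/b.jpg', 'detections': []}]
    }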
megadetector/taxonomy_mapping/map_new_lila_datasets.py

@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')

-output_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
+output_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')

 datasets_to_map = [
-    'Seattle(ish) Camera Traps'
+    'UNSW Predators'
 ]


@@ -125,6 +125,8 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)

+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+

 #%% Manual lookup

@@ -138,10 +140,8 @@ if False:

     #%%

-    # q = 'white-throated monkey'
-    # q = 'cingulata'
-    # q = 'notamacropus'
-    q = 'insects'
+    q = 'dasyurus maculatus'
+
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
megadetector/taxonomy_mapping/preview_lila_taxonomy.py

@@ -16,7 +16,7 @@ import os
 import pandas as pd

 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')

 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)

@@ -72,65 +72,10 @@ from megadetector.taxonomy_mapping.species_lookup import \
 initialize_taxonomy_lookup()


-#%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
-if False:
-
-    #%%
-
-    source_mappings = ['gbif','manual']
-    target_mapping = 'inat'
-    valid_mappings = ['gbif','inat','manual']
-
-    assert target_mapping in valid_mappings
-    for source_mapping in source_mappings:
-        assert source_mapping != target_mapping and \
-            source_mapping in valid_mappings
-
-    n_remappings = 0
-
-    # i_row = 1; row = df.iloc[i_row]; row
-    for i_row,row in df.iterrows():
-
-        if row['source'] not in source_mappings:
-            continue
-
-        scientific_name = row['scientific_name']
-        old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-        m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-        if m is None or m.source != target_mapping:
-            print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-            continue
-
-        assert m.scientific_name == row['scientific_name']
-
-        if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-            pass
-        else:
-            assert m.taxonomic_level == row['taxonomy_level']
-
-        new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-        if row['taxonomy_string'] != m.taxonomy_string:
-            print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-            n_remappings += 1
-            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-            if row['source'] != 'manual':
-                df.loc[i_row,'source'] = m.source
-
-    # This should be zero for the release .csv
-    print('Made {} remappings'.format(n_remappings))
-
-    #%%
-
-    df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
 #%% Check for mappings that disagree with the taxonomy string

+# For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
 df = pd.read_csv(lila_taxonomy_file)

 n_taxonomy_changes = 0
megadetector/taxonomy_mapping/species_lookup.py

@@ -602,8 +602,17 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke

 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for species_lookup.py, but expressing a variety of heuristics and
-    preferences that are specific to our scenario.
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    and preferences that are specific to our scenario.
+
+    Args:
+        query (str): The common or scientific name we want to look up
+        taxonomy_preference (str, optional): 'inat' or 'gbif'
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
+            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
+
+    Returns:
+        TaxonomicMatch: the best taxonomic match, or None
     """

     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)

@@ -616,6 +625,36 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     return m


+def validate_and_convert(data):
+    """
+    Recursively validates that all elements in the nested structure are only
+    tuples, lists, ints, or np.int64, and converts np.int64 to int.
+
+    Args:
+        data: The nested structure to validate and convert
+
+    Returns:
+        The validated and converted structure
+
+    Raises:
+        TypeError: If an invalid type is encountered
+    """
+
+    if isinstance(data, np.int64):
+        return int(data)
+    elif isinstance(data, int) or isinstance(data, str):
+        return data
+    elif isinstance(data, (list, tuple)):
+        # Process lists and tuples recursively
+        container_type = type(data)
+        return container_type(validate_and_convert(item) for item in data)
+    else:
+        raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                        f"Only int, np.int64, list, and tuple are allowed.")
+
+# ...def validate_and_convert(...)
+
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:

     query = query.lower().strip().replace('_', ' ')
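A short usage sketch for the new helper (input data is illustrative):

    import numpy as np

    # np.int64 values (e.g. from pandas-based taxonomy lookups) become plain ints,
    # recursively, while container types are preserved
    nested = [(np.int64(3), 'species'), [np.int64(7), 'genus']]
    assert validate_and_convert(nested) == [(3, 'species'), [7, 'genus']]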
@@ -760,6 +799,10 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:

     # ...if we needed to look in the GBIF taxonomy

+    # Convert np.int64's to ints
+    if match is not None:
+        match = validate_and_convert(match)
+
     taxonomy_string = str(match)

     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
megadetector/utils/ct_utils.py

@@ -483,7 +483,9 @@ def sort_dictionary_by_key(d,reverse=False):
 def sort_dictionary_by_value(d,sort_values=None,reverse=False):
     """
     Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
-    otherwise uses the dictionary sort_values as the sorting criterion.
+    otherwise uses the dictionary sort_values as the sorting criterion. Always
+    returns a new standard dict, so if [d] is, for example, a defaultdict, the
+    returned value is not.

     Args:
         d (dict): dictionary to sort

@@ -492,7 +494,7 @@ def sort_dictionary_by_value(d,sort_values=None,reverse=False):
         reverse (bool, optional): whether to sort in reverse (descending) order

     Returns:
-        dict: sorted copy of [d]
+        dict: sorted copy of [d]
     """

     if sort_values is None:
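A small sketch of the documented behavior (dictionary contents are illustrative):

    from collections import defaultdict
    from megadetector.utils.ct_utils import sort_dictionary_by_value

    counts = defaultdict(int)
    counts['deer'] += 3
    counts['cow'] += 1

    # A plain dict sorted by value comes back, even though the input is a defaultdict
    sorted_counts = sort_dictionary_by_value(counts, reverse=True)
    assert type(sorted_counts) is dict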
megadetector/utils/directory_listing.py

@@ -237,7 +237,7 @@ def main():
     args = parser.parse_args()

     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
-    assert re.match('https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
+    assert re.match(r'https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
         "match the format https://accname.blob.core.windows.net/bname/path/to/folder?..."

     traverse_and_create_index(args.directory, overwrite_files=args.enable_overwrite, sas_url=args.sas_url, basepath=args.basepath)
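The only change here is the r-prefix on the regex literal; without it, sequences like '\.' are invalid string escapes, which recent Python versions flag (a SyntaxWarning as of Python 3.12). A quick illustration:

    import re

    # Raw string: '\.' reaches the regex engine as an escaped dot
    pattern = r'https?://[^\.]+\.blob\.core\.windows\.net/.+'
    assert re.match(pattern, 'https://accname.blob.core.windows.net/bname/folder?sv=...')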
megadetector/utils/md_tests.py

@@ -1173,6 +1173,7 @@ def run_cli_tests(options):
     ## Return early if we're not running torch-related tests

     if options.test_mode == 'utils-only':
+        print('utils-only tests finished, returning')
         return


@@ -1828,7 +1829,7 @@ def main():
     parser.add_argument(
         '--test_mode',
         type=str,
-        default='utils-only',
+        default='all',
         help='Test mode: "all" or "utils-only"'
     )
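With the new default, a full test pass (including torch-dependent tests) runs unless utils-only mode is requested explicitly; assuming the module is invoked as a script, something like:

    python -m megadetector.utils.md_tests --test_mode utils-only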