megadetector-10.0.9-py3-none-any.whl → megadetector-10.0.11-py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (84)
  1. megadetector/data_management/animl_to_md.py +5 -2
  2. megadetector/data_management/cct_json_utils.py +4 -2
  3. megadetector/data_management/cct_to_md.py +5 -4
  4. megadetector/data_management/cct_to_wi.py +5 -1
  5. megadetector/data_management/coco_to_yolo.py +3 -2
  6. megadetector/data_management/databases/combine_coco_camera_traps_files.py +4 -4
  7. megadetector/data_management/databases/integrity_check_json_db.py +2 -2
  8. megadetector/data_management/databases/subset_json_db.py +0 -3
  9. megadetector/data_management/generate_crops_from_cct.py +6 -4
  10. megadetector/data_management/get_image_sizes.py +5 -35
  11. megadetector/data_management/labelme_to_coco.py +10 -6
  12. megadetector/data_management/labelme_to_yolo.py +19 -28
  13. megadetector/data_management/lila/create_lila_test_set.py +22 -2
  14. megadetector/data_management/lila/generate_lila_per_image_labels.py +7 -5
  15. megadetector/data_management/lila/lila_common.py +2 -2
  16. megadetector/data_management/lila/test_lila_metadata_urls.py +0 -1
  17. megadetector/data_management/ocr_tools.py +6 -10
  18. megadetector/data_management/read_exif.py +69 -13
  19. megadetector/data_management/remap_coco_categories.py +1 -1
  20. megadetector/data_management/remove_exif.py +10 -5
  21. megadetector/data_management/rename_images.py +20 -13
  22. megadetector/data_management/resize_coco_dataset.py +10 -4
  23. megadetector/data_management/speciesnet_to_md.py +3 -3
  24. megadetector/data_management/yolo_output_to_md_output.py +3 -1
  25. megadetector/data_management/yolo_to_coco.py +28 -19
  26. megadetector/detection/change_detection.py +26 -18
  27. megadetector/detection/process_video.py +1 -1
  28. megadetector/detection/pytorch_detector.py +5 -5
  29. megadetector/detection/run_detector.py +34 -10
  30. megadetector/detection/run_detector_batch.py +60 -42
  31. megadetector/detection/run_inference_with_yolov5_val.py +3 -1
  32. megadetector/detection/run_md_and_speciesnet.py +282 -110
  33. megadetector/detection/run_tiled_inference.py +7 -7
  34. megadetector/detection/tf_detector.py +4 -6
  35. megadetector/detection/video_utils.py +9 -6
  36. megadetector/postprocessing/add_max_conf.py +4 -4
  37. megadetector/postprocessing/categorize_detections_by_size.py +3 -2
  38. megadetector/postprocessing/classification_postprocessing.py +19 -21
  39. megadetector/postprocessing/combine_batch_outputs.py +3 -2
  40. megadetector/postprocessing/compare_batch_results.py +49 -27
  41. megadetector/postprocessing/convert_output_format.py +8 -6
  42. megadetector/postprocessing/create_crop_folder.py +13 -4
  43. megadetector/postprocessing/generate_csv_report.py +22 -8
  44. megadetector/postprocessing/load_api_results.py +8 -4
  45. megadetector/postprocessing/md_to_coco.py +2 -3
  46. megadetector/postprocessing/md_to_labelme.py +12 -8
  47. megadetector/postprocessing/md_to_wi.py +2 -1
  48. megadetector/postprocessing/merge_detections.py +4 -6
  49. megadetector/postprocessing/postprocess_batch_results.py +4 -3
  50. megadetector/postprocessing/remap_detection_categories.py +6 -3
  51. megadetector/postprocessing/render_detection_confusion_matrix.py +18 -10
  52. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  53. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +5 -3
  54. megadetector/postprocessing/separate_detections_into_folders.py +10 -4
  55. megadetector/postprocessing/subset_json_detector_output.py +1 -1
  56. megadetector/postprocessing/top_folders_to_bottom.py +22 -7
  57. megadetector/postprocessing/validate_batch_results.py +1 -1
  58. megadetector/taxonomy_mapping/map_new_lila_datasets.py +59 -3
  59. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +1 -1
  60. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +26 -17
  61. megadetector/taxonomy_mapping/species_lookup.py +51 -2
  62. megadetector/utils/ct_utils.py +9 -4
  63. megadetector/utils/directory_listing.py +3 -0
  64. megadetector/utils/extract_frames_from_video.py +4 -0
  65. megadetector/utils/gpu_test.py +6 -6
  66. megadetector/utils/md_tests.py +21 -21
  67. megadetector/utils/path_utils.py +171 -36
  68. megadetector/utils/split_locations_into_train_val.py +0 -4
  69. megadetector/utils/string_utils.py +21 -0
  70. megadetector/utils/url_utils.py +5 -3
  71. megadetector/utils/wi_platform_utils.py +168 -24
  72. megadetector/utils/wi_taxonomy_utils.py +38 -8
  73. megadetector/utils/write_html_image_list.py +1 -2
  74. megadetector/visualization/plot_utils.py +31 -19
  75. megadetector/visualization/render_images_with_thumbnails.py +3 -0
  76. megadetector/visualization/visualization_utils.py +18 -7
  77. megadetector/visualization/visualize_db.py +9 -26
  78. megadetector/visualization/visualize_detector_output.py +1 -0
  79. megadetector/visualization/visualize_video_output.py +14 -2
  80. {megadetector-10.0.9.dist-info → megadetector-10.0.11.dist-info}/METADATA +1 -1
  81. {megadetector-10.0.9.dist-info → megadetector-10.0.11.dist-info}/RECORD +84 -84
  82. {megadetector-10.0.9.dist-info → megadetector-10.0.11.dist-info}/WHEEL +0 -0
  83. {megadetector-10.0.9.dist-info → megadetector-10.0.11.dist-info}/licenses/LICENSE +0 -0
  84. {megadetector-10.0.9.dist-info → megadetector-10.0.11.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/md_to_labelme.py

@@ -28,6 +28,7 @@ from functools import partial
 
 from megadetector.visualization.visualization_utils import open_image
 from megadetector.utils.ct_utils import round_float
+from megadetector.utils.ct_utils import write_json
 from megadetector.detection.run_detector import DEFAULT_DETECTOR_LABEL_MAP, FAILURE_IMAGE_OPEN
 
 output_precision = 3
@@ -36,8 +37,11 @@ default_confidence_threshold = 0.15
 
 #%% Functions
 
-def get_labelme_dict_for_image(im,image_base_name=None,category_id_to_name=None,
-                               info=None,confidence_threshold=None):
+def get_labelme_dict_for_image(im,
+                               image_base_name=None,
+                               category_id_to_name=None,
+                               info=None,
+                               confidence_threshold=None):
     """
     For the given image struct in MD results format, reformat the detections into
     labelme format.
@@ -60,7 +64,7 @@ def get_labelme_dict_for_image(im,image_base_name=None,category_id_to_name=None,
     if image_base_name is None:
         image_base_name = os.path.basename(im['file'])
 
-    if category_id_to_name:
+    if category_id_to_name is None:
         category_id_to_name = DEFAULT_DETECTOR_LABEL_MAP
 
     if confidence_threshold is None:
@@ -138,8 +142,7 @@ def _write_output_for_image(im,
                             info=info,
                             confidence_threshold=confidence_threshold)
 
-    with open(json_path,'w') as f:
-        json.dump(output_dict,f,indent=1)
+    write_json(json_path,output_dict)
 
 # ...def write_output_for_image(...)
 
@@ -256,9 +259,10 @@ def md_to_labelme(results_file,
                                           md_results['images']),
                                           total=len(md_results['images'])))
         finally:
-            pool.close()
-            pool.join()
-            print("Pool closed and joined for labelme file writes")
+            if pool is not None:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for labelme file writes")
 
     # ...for each image
 
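Note: this is the first of several hunks in this release (see also merge_detections.py and remap_detection_categories.py below) that replace inline open()/json.dump() calls with ct_utils.write_json. The helper itself is not part of this diff; judging only from the call sites shown (write_json(path, data)), it is presumably a thin wrapper along these lines. This is a hedged sketch, not the actual ct_utils implementation:

    # Hypothetical sketch of write_json(path, data), inferred from call sites in
    # this diff; the real megadetector.utils.ct_utils.write_json may differ
    # (e.g. in encoding, indent level, or extra keyword arguments).
    import json

    def write_json(path, data, indent=1):
        """Serialize [data] to [path] as JSON."""
        with open(path, 'w') as f:
            json.dump(data, f, indent=indent)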
megadetector/postprocessing/md_to_wi.py

@@ -10,6 +10,7 @@ Converts the MD .json format to the WI predictions.json format.
 
 import sys
 import argparse
+
 from megadetector.utils.wi_taxonomy_utils import generate_predictions_json_from_md_results
 
 
@@ -34,7 +35,7 @@ def main(): # noqa
 
     generate_predictions_json_from_md_results(args.md_results_file,
                                               args.predictions_json_file,
-                                              base_folder=None)
+                                              base_folder=args.base_folder)
 
 if __name__ == '__main__':
     main()
megadetector/postprocessing/merge_detections.py

@@ -23,6 +23,7 @@ import os
 from tqdm import tqdm
 
 from megadetector.utils.ct_utils import get_iou
+from megadetector.utils.ct_utils import write_json
 
 
 #%% Structs
@@ -121,8 +122,6 @@ def merge_detections(source_files,target_file,output_file,options=None):
 
     assert os.path.isfile(target_file)
 
-    os.makedirs(os.path.dirname(output_file),exist_ok=True)
-
     with open(target_file,'r') as f:
         output_data = json.load(f)
 
@@ -290,8 +289,7 @@ def merge_detections(source_files,target_file,output_file,options=None):
 
     # ...for each source file
 
-    with open(output_file,'w') as f:
-        json.dump(output_data,f,indent=1)
+    write_json(output_file,output_data)
 
     print('Saved merged results to {}'.format(output_file))
 
@@ -308,7 +306,7 @@ def main():
 
     default_options = MergeDetectionsOptions()
 
     parser = argparse.ArgumentParser(
-        description='Merge detections from one or more MegaDetector results files into an existing reuslts file')
+        description='Merge detections from one or more MegaDetector results files into an existing results file')
     parser.add_argument(
         'source_files',
         nargs='+',
@@ -359,7 +357,7 @@
         type=int,
         nargs='+',
         default=None,
-        help='List of numeric detection categories to include')
+        help='List of numeric detection categories to exclude')
     parser.add_argument(
         '--merge_empty_only',
         action='store_true',
megadetector/postprocessing/postprocess_batch_results.py

@@ -1889,8 +1889,9 @@ def process_batch_results(options):
     if options.include_classification_category_report:
 
         # TODO: it's only for silly historical reasons that we re-read
-        # the input file in this case; we're not currently carrying the json
-        # representation around, only the Pandas representation.
+        # the input file in this case; because this module has used Pandas
+        # forever, we're not currently carrying the json representation around,
+        # only the Pandas representation.
 
         print('Generating classification category report')
 
@@ -1905,7 +1906,7 @@
                 if ('classifications' in det) and (len(det['classifications']) > 0):
                     class_id = det['classifications'][0][0]
                     if class_id not in classification_category_to_count:
-                        classification_category_to_count[class_id] = 0
+                        classification_category_to_count[class_id] = 1
                     else:
                         classification_category_to_count[class_id] = \
                             classification_category_to_count[class_id] + 1
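The second hunk above fixes an off-by-one in the classification category report: a newly-seen category was initialized to 0 and only incremented on later occurrences, so every category was undercounted by one. A minimal illustration of the corrected counting pattern, reusing the hunk's variable names with made-up class IDs (the surrounding detection loop is assumed):

    # After the fix, the first sighting of a category counts as 1
    classification_category_to_count = {}
    for class_id in ['deer', 'deer', 'fox']:  # stand-in for per-detection class IDs
        if class_id not in classification_category_to_count:
            classification_category_to_count[class_id] = 1
        else:
            classification_category_to_count[class_id] = \
                classification_category_to_count[class_id] + 1
    # classification_category_to_count == {'deer': 2, 'fox': 1}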
megadetector/postprocessing/remap_detection_categories.py

@@ -18,6 +18,7 @@ import argparse
 from tqdm import tqdm
 
 from megadetector.utils.ct_utils import invert_dictionary
+from megadetector.utils.ct_utils import write_json
 
 
 #%% Main function
@@ -132,14 +133,16 @@ def remap_detection_categories(input_file,
         for det in im['detections']:
             det['category'] = input_category_id_to_output_category_id[det['category']]
 
-    input_data['detection_categories'] = target_category_map
+    # ...for each image
 
-    with open(output_file,'w') as f:
-        json.dump(input_data,f,indent=1)
+    input_data['detection_categories'] = target_category_map
 
+    write_json(output_file,input_data)
 
     print('Saved remapped results to {}'.format(output_file))
 
+# ...def remap_detection_categories(...)
+
 
 #%% Interactive driver
 
megadetector/postprocessing/render_detection_confusion_matrix.py

@@ -252,9 +252,10 @@ def render_detection_confusion_matrix(ground_truth_file,
                                               md_formatted_results['images']),
                                               total=len(md_formatted_results['images'])))
         finally:
-            pool.close()
-            pool.join()
-            print("Pool closed and joined for confusion matrix rendering")
+            if pool is not None:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for confusion matrix rendering")
 
     else:
 
@@ -369,11 +370,15 @@
 
         # If there were no detections at all, call this image empty
         if len(results_im['detections']) == 0:
+
             predicted_category_name = empty_category_name
+
         # Otherwise look for above-threshold detections
         else:
+
             results_category_name_to_confidence = defaultdict(int)
             for det in results_im['detections']:
+
                 category_name = results_category_id_to_name[det['category']]
                 detection_threshold = confidence_thresholds['default']
                 if category_name in confidence_thresholds:
@@ -381,12 +386,15 @@
                 if det['conf'] > detection_threshold:
                     results_category_name_to_confidence[category_name] = max(
                         results_category_name_to_confidence[category_name],det['conf'])
-            # If there were no detections above threshold
-            if len(results_category_name_to_confidence) == 0:
-                predicted_category_name = empty_category_name
-            else:
-                predicted_category_name = max(results_category_name_to_confidence,
-                                              key=results_category_name_to_confidence.get)
+
+            # ...for each detection
+
+            # If there were no detections above threshold
+            if len(results_category_name_to_confidence) == 0:
+                predicted_category_name = empty_category_name
+            else:
+                predicted_category_name = max(results_category_name_to_confidence,
+                                              key=results_category_name_to_confidence.get)
 
         ground_truth_category_index = gt_category_name_to_category_index[ground_truth_category_name]
         predicted_category_index = gt_category_name_to_category_index[predicted_category_name]
@@ -396,7 +404,7 @@
 
         confusion_matrix[ground_truth_category_index,predicted_category_index] += 1
 
-    # ...for each file
+    # ...for each ground truth file
 
     plt.ioff()
 
megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py

@@ -37,7 +37,7 @@ def remove_repeat_detections(input_file,output_file,filtering_dir):
     """
 
     assert os.path.isfile(input_file), "Can't find file {}".format(input_file)
-    assert os.path.isdir(filtering_dir), "Can't find folder {}".format(filtering_dir)
+    assert os.path.exists(filtering_dir), "Can't find input file/folder {}".format(filtering_dir)
     options = repeat_detections_core.RepeatDetectionOptions()
     if os.path.isfile(filtering_dir):
         options.filterFileToLoad = filtering_dir
megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py

@@ -869,7 +869,7 @@ def _update_detection_table(repeat_detection_results, options, output_file_name=
            detection_to_modify = row_detections[instance.i_detection]
 
            # Make sure the bounding box matches
-            assert (instance_bbox[0:3] == detection_to_modify['bbox'][0:3])
+            assert (instance_bbox[0:4] == detection_to_modify['bbox'][0:4])
 
            # Make the probability negative, if it hasn't been switched by
            # another bounding box
@@ -1149,7 +1149,8 @@ def find_repeat_detections(input_filename, output_file_name=None, options=None):
 
        # Load the filtering file
        detection_index_file_name = options.filterFileToLoad
-        s_in = open(detection_index_file_name, 'r').read()
+        with open(detection_index_file_name, 'r') as f:
+            s_in = f.read()
        detection_info = jsonpickle.decode(s_in)
        filtering_base_dir = os.path.dirname(options.filterFileToLoad)
        suspicious_detections = detection_info['suspicious_detections']
@@ -1382,7 +1383,8 @@ def find_repeat_detections(input_filename, output_file_name=None, options=None):
 
    # candidate_detection_file = all_candidate_detection_files[0]
    for candidate_detection_file in all_candidate_detection_files:
-        s = open(candidate_detection_file, 'r').read()
+        with open(candidate_detection_file, 'r') as f:
+            s = f.read()
        candidate_detections_this_file = jsonpickle.decode(s)
        all_candidate_detections.append(candidate_detections_this_file)
 
megadetector/postprocessing/separate_detections_into_folders.py

@@ -494,7 +494,8 @@ def separate_detections_into_folders(options):
 
    # Load detection results
    print('Loading detection results')
-    results = json.load(open(options.results_file))
+    with open(options.results_file,'r') as f:
+        results = json.load(f)
    images = results['images']
 
    for im in images:
@@ -618,8 +619,13 @@
 
    print('Starting a pool with {} threads'.format(options.n_threads))
    pool = ThreadPool(options.n_threads)
-    process_detections_with_options = partial(_process_detections, options=options)
-    _ = list(tqdm(pool.imap(process_detections_with_options, images), total=len(images)))
+    try:
+        process_detections_with_options = partial(_process_detections, options=options)
+        _ = list(tqdm(pool.imap(process_detections_with_options, images), total=len(images)))
+    finally:
+        pool.close()
+        pool.join()
+        print('Pool closed and joined for folder separation')
 
    if options.remove_empty_folders:
        print('Removing empty folders from {}'.format(options.base_output_folder))
@@ -736,7 +742,7 @@ def main(): # noqa
                        help='Line thickness (in pixels) for rendering, only meaningful if ' + \
                             'using render_boxes (defaults to {})'.format(
                             default_line_thickness))
-    parser.add_argument('--box_expansion', type=int, default=default_line_thickness,
+    parser.add_argument('--box_expansion', type=int, default=default_box_expansion,
                        help='Box expansion (in pixels) for rendering, only meaningful if ' + \
                             'using render_boxes (defaults to {})'.format(
                             default_box_expansion))
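This is one of several hunks in the release (see also md_to_labelme.py, render_detection_confusion_matrix.py, and top_folders_to_bottom.py below) that wrap worker-pool loops in try/finally so the pool is always closed and joined, even when a worker raises. A stripped-down sketch of that pattern, using placeholder work items rather than the module's real options object:

    # Generic version of the pool-cleanup pattern adopted in this release;
    # process_item and items are placeholders, not MegaDetector functions.
    from multiprocessing.pool import ThreadPool
    from tqdm import tqdm

    def process_item(item):
        return item * 2

    items = list(range(10))
    pool = ThreadPool(4)
    try:
        results = list(tqdm(pool.imap(process_item, items), total=len(items)))
    finally:
        # Runs whether or not the map raised, so worker threads are not leaked
        pool.close()
        pool.join()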
megadetector/postprocessing/subset_json_detector_output.py

@@ -433,7 +433,7 @@ def subset_json_detector_output_by_list(data, options):
    """
 
    if options.keep_files_in_list is None:
-        return
+        return data
 
    files_to_keep = None
 
megadetector/postprocessing/top_folders_to_bottom.py

@@ -45,7 +45,12 @@ class TopFoldersToBottomOptions:
    Options used to parameterize top_folders_to_bottom()
    """
 
-    def __init__(self,input_folder,output_folder,copy=True,n_threads=1):
+    def __init__(self,
+                 input_folder,
+                 output_folder,
+                 copy=True,
+                 n_threads=1,
+                 overwrite=False):
 
        #: Whether to copy (True) vs. move (False) false when re-organizing
        self.copy = copy
@@ -60,7 +65,7 @@
        self.output_folder = output_folder
 
        #: If this is False and an output file exists, throw an error
-        self.overwrite = False
+        self.overwrite = overwrite
 
 
 #%% Main functions
@@ -130,6 +135,7 @@ def top_folders_to_bottom(options):
        options (TopFoldersToBottomOptions): See TopFoldersToBottomOptions for parameter details.
 
    """
+
    os.makedirs(options.output_folder,exist_ok=True)
 
    # Enumerate input folder
@@ -167,10 +173,15 @@
 
    print('Starting a pool with {} threads'.format(options.n_threads))
    pool = ThreadPool(options.n_threads)
-    process_file_with_options = partial(_process_file, options=options)
-    _ = list(tqdm(pool.imap(process_file_with_options, relative_files), total=len(relative_files)))
+    try:
+        process_file_with_options = partial(_process_file, options=options)
+        _ = list(tqdm(pool.imap(process_file_with_options, relative_files), total=len(relative_files)))
+    finally:
+        pool.close()
+        pool.join()
+        print('Pool closed and join for folder inversion')
 
-# ...def top_folders_to_bottom()
+# ...def top_folders_to_bottom(...)
 
 
 #%% Interactive driver
@@ -192,7 +203,7 @@ if False:
 
 #%% Command-line driver
 
-# python top_folders_to_bottom.py "g:\temp\separated_images" "g:\temp\separated_images_inverted" --n_threads 100
+# python top_folders_to_bottom.py "g:\temp\separated_images" "g:\temp\separated_images_inverted" --n_threads 10
 
 def main(): # noqa
 
@@ -215,7 +226,11 @@ def main(): # noqa
 
    # Convert to an options object
    options = TopFoldersToBottomOptions(
-        args.input_folder,args.output_folder,copy=args.copy,n_threads=args.n_threads)
+        args.input_folder,
+        args.output_folder,
+        copy=args.copy,
+        n_threads=args.n_threads,
+        overwrite=args.overwrite)
 
    top_folders_to_bottom(options)
 
megadetector/postprocessing/validate_batch_results.py

@@ -39,7 +39,7 @@ typical_keys = ['classification_categories',
 
 class ValidateBatchResultsOptions:
    """
-    Options controlling the behavior of validate_bach_results()
+    Options controlling the behavior of validate_batch_results()
    """
 
    def __init__(self):
megadetector/taxonomy_mapping/map_new_lila_datasets.py

@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/lila_additions_2025.06.23.csv')
+output_file = os.path.expanduser('~/lila/lila_additions_2025.10.07.csv')
 
 datasets_to_map = [
-    'Nkhotakota Camera Traps'
+    'California Small Animals'
    ]
 
 
@@ -128,6 +128,52 @@ output_df.to_csv(output_file, index=None, header=True)
 # from megadetector.utils.path_utils import open_file; open_file(output_file)
 
 
+#%% Remap missing entries in the .csv file
+
+# ...typically because I made a change to the mapping code.
+
+from megadetector.utils.path_utils import insert_before_extension
+from megadetector.utils.ct_utils import is_empty
+
+remapped_file = insert_before_extension(output_file,'remapped')
+
+df = pd.read_csv(output_file)
+
+for i_row,row in df.iterrows():
+
+    # Do we need to map this row?
+    if is_empty(row['source']):
+
+        query = row['query']
+        print('Mapping {}'.format(query))
+
+        taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
+
+        if (taxonomic_match.source == taxonomy_preference):
+
+            source = taxonomic_match.source
+            taxonomy_level = taxonomic_match.taxonomic_level
+            scientific_name = taxonomic_match.scientific_name
+            common_name = taxonomic_match.common_name
+            taxonomy_string = taxonomic_match.taxonomy_string
+
+            # Write source, taxonomy_level, scientific_name, common_name, and taxonomy_string
+            # to the corresponding columns in the current row in df
+            df.loc[i_row, 'source'] = source
+            df.loc[i_row, 'taxonomy_level'] = taxonomy_level
+            df.loc[i_row, 'scientific_name'] = scientific_name
+            df.loc[i_row, 'common_name'] = common_name
+            df.loc[i_row, 'taxonomy_string'] = taxonomy_string
+
+        # ...if we found a match
+
+    # ...do we need to map this row?
+
+# ...for each row
+
+df.to_csv(remapped_file, index=None, header=True)
+
+
 #%% Manual lookup
 
 if False:
@@ -140,11 +186,19 @@
 
 #%%
 
-q = 'animalia'
+from megadetector.taxonomy_mapping.species_lookup import pop_levels
+
+# Use this when an iNat match includes an empty subgenus with the same name as the genus
+n_levels_to_pop = 0
+q = 'sus scrofa'
 
 taxonomy_preference = 'inat'
 m = get_preferred_taxonomic_match(q,taxonomy_preference)
+if n_levels_to_pop > 0:
+    m = pop_levels(m,n_levels_to_pop)
+
 # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+# common_name = eval(m.__dict__['taxonomy_string'])[0][-1][0]; print(common_name); clipboard.copy(common_name)
 
 if (m is None) or (len(m.taxonomy_string) == 0):
    print('No match')
@@ -155,3 +209,5 @@
    print(m.source)
    print(m.taxonomy_string)
    import clipboard; clipboard.copy(m.taxonomy_string)
+
+
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py

@@ -162,4 +162,4 @@ if False:
 
    print('Wrote final output to {}'.format(release_taxonomy_file))
 
-    # ...if False
+
megadetector/taxonomy_mapping/preview_lila_taxonomy.py

@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.06.23.csv')
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.10.07.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -56,11 +56,6 @@
    return level
 
 
-#%% Read the taxonomy mapping file
-
-df = pd.read_csv(lila_taxonomy_file)
-
-
 #%% Prepare taxonomy lookup
 
 from megadetector.taxonomy_mapping.species_lookup import \
@@ -95,20 +90,29 @@ taxonomy_preference = 'inat'
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in tqdm(df.iterrows(),total=len(df)):
 
-    sn = row['scientific_name']
-    if not isinstance(sn,str):
-        continue
+    try:
+
+        sn = row['scientific_name']
+        if not isinstance(sn,str):
+            continue
 
-    m = get_preferred_taxonomic_match(sn,taxonomy_preference)
-    assert m.scientific_name == sn
+        m = get_preferred_taxonomic_match(sn,taxonomy_preference)
+        assert m.scientific_name == sn
 
-    ts = row['taxonomy_string']
-    assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
-        row['dataset_name'],ts,m.taxonomy_string)
+        ts = row['taxonomy_string']
+        assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
+            row['dataset_name'],ts,m.taxonomy_string)
+
+        if ts != m.taxonomy_string:
+            n_taxonomy_changes += 1
+            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+    except Exception as e:
 
-    if ts != m.taxonomy_string:
-        n_taxonomy_changes += 1
-        df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+        print('Error at row {}: {}'.format(i_row,str(e)))
+        raise
+
+# ...for each row
 
 print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
 
@@ -325,6 +329,11 @@ for i_row,row in df.iterrows():
 
 #%% Download sample images for all scientific names
 
+# You might have to do this:
+#
+# pip install python-magic
+# pip install python-magic-bin
+
 # Takes ~1 minute per 10 rows
 
 remapped_queries = {'papio':'papio+baboon',
megadetector/taxonomy_mapping/species_lookup.py

@@ -560,6 +560,7 @@ def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
    Main entry point: get taxonomic matches from both taxonomies for [query],
    which may be a scientific or common name.
    """
+
    query = query.strip().lower()
    # print("Finding taxonomy information for: {0}".format(query))
 
@@ -682,6 +683,35 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke
                    'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
                    'necked']
 
+def pop_levels(m, n_levels=1):
+    """
+    Remove [n_levels] levels from the bottom of the TaxonomicMatch object m, typically used to remove
+    silly subgenera.
+    """
+
+    v = eval(m.taxonomy_string)
+    assert v[0][1] == m.taxonomic_level
+    assert v[0][2] == m.scientific_name
+    popped_v = v[n_levels:]
+    taxonomic_level = popped_v[0][1]
+    scientific_name = popped_v[0][2]
+    common_name = popped_v[0][3]
+    if len(common_name) == 0:
+        common_name = ''
+    else:
+        common_name = common_name[0]
+    taxonomy_string = str(popped_v)
+    source = m.source
+    return TaxonomicMatch(scientific_name=scientific_name,
+                          common_name=common_name,
+                          taxonomic_level=taxonomic_level,
+                          source=source,
+                          taxonomy_string=taxonomy_string,
+                          match=None)
+
+# ...def pop_levels(...)
+
+
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
    """
    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
@@ -704,6 +734,17 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
    for s in hyphenated_terms:
        query = query.replace(' ' + s,'-' + s)
    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
+
+    if (len(m.scientific_name) > 0) or (not retry):
+        return m
+
+    query = query.replace(' species','')
+    query = query.replace(' order','')
+    query = query.replace(' genus','')
+    query = query.replace(' family','')
+    query = query.replace(' subfamily','')
+    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
+
    return m
 
 
@@ -887,8 +928,16 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
    taxonomy_string = str(match)
 
-    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
-                          taxonomy_string, match),query
+    m = TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
+                       taxonomy_string, match)
+
+    if (m.taxonomic_level == 'subgenus' and \
+        match[1][1] == 'genus' and \
+        match[1][2] == m.scientific_name):
+        print('Removing redundant subgenus {}'.format(scientific_name))
+        m = pop_levels(m,1)
+
+    return m,query
 
 # ...def _get_preferred_taxonomic_match()
 