megadetector-5.0.6-py3-none-any.whl → megadetector-5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
api/batch_processing/postprocessing/postprocess_batch_results.py

@@ -23,7 +23,6 @@ import collections
 import copy
 import errno
 import io
-import itertools
 import os
 import sys
 import time
@@ -54,6 +53,7 @@ from md_utils import path_utils
 from data_management.cct_json_utils import (CameraTrapJsonUtils, IndexedJsonDb)
 from api.batch_processing.postprocessing.load_api_results import load_api_results
 from md_utils.ct_utils import args_to_object
+from md_utils.ct_utils import invert_dictionary
 
 from detection.run_detector import get_typical_confidence_threshold_from_results
 
@@ -114,10 +114,18 @@ class PostProcessingOptions:
     # detections_animal, detections_person, detections_vehicle
     rendering_bypass_sets = []
 
-    # By default, choose a confidence threshold based on the detector version
+    # If this is None, choose a confidence threshold based on the detector version.
+    #
+    # This can either be a float or a dictionary mapping category names (not IDs) to
+    # thresholds. The category "default" can be used to specify thresholds for
+    # other categories. Currently the use of a dict here is not supported when
+    # ground truth is supplied.
     confidence_threshold = None
 
     # Confidence threshold to apply to classification (not detection) results
+    #
+    # Only a float is supported here (unlike the "confidence_threshold" parameter, which
+    # can be a dict).
     classification_confidence_threshold = 0.5
 
     # Used for summary statistics only
@@ -163,6 +171,9 @@ class PostProcessingOptions:
     #
     # Currently only supported when ground truth is unavailable
     include_almost_detections = False
+
+    # Only a float is supported here (unlike the "confidence_threshold" parameter, which
+    # can be a dict).
     almost_detection_confidence_threshold = None
 
     # Control rendering parallelization
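
For orientation (not part of the diff): a minimal sketch of how the new per-category threshold option could be populated. The attribute names and import path come from this release; the category names and threshold values below are hypothetical.

    # Hypothetical configuration sketch for the new per-category thresholds.
    from api.batch_processing.postprocessing.postprocess_batch_results import PostProcessingOptions

    options = PostProcessingOptions()

    # A single float applies the same threshold to every detection category...
    options.confidence_threshold = 0.2

    # ...or a dict maps detection category *names* to thresholds, with "default"
    # covering any category not listed explicitly (values here are made up).
    options.confidence_threshold = {'animal': 0.15, 'person': 0.3, 'default': 0.2}

    # Classification thresholds still take a plain float only.
    options.classification_confidence_threshold = 0.5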
@@ -427,12 +438,25 @@ def render_bounding_boxes(
     vis_utils.render_db_bounding_boxes(ground_truth_boxes, gt_classes, image,
                                        original_size=original_size,label_map=label_map,
                                        thickness=4,expansion=4)
+
+    # render_detection_bounding_boxes expects either a float or a dict mapping
+    # category IDs to names.
+    if isinstance(options.confidence_threshold,float):
+        rendering_confidence_threshold = options.confidence_threshold
+    else:
+        category_ids = set()
+        for d in detections:
+            category_ids.add(d['category'])
+        rendering_confidence_threshold = {}
+        for category_id in category_ids:
+            rendering_confidence_threshold[category_id] = \
+                get_threshold_for_category_id(category_id, options, detection_categories)
 
     vis_utils.render_detection_bounding_boxes(
         detections, image,
         label_map=detection_categories,
         classification_label_map=classification_categories,
-        confidence_threshold=options.confidence_threshold,
+        confidence_threshold=rendering_confidence_threshold,
         thickness=options.line_thickness,
         expansion=options.box_expansion)
 
@@ -535,15 +559,68 @@ def prepare_html_subpages(images_html, output_dir, options=None):
 
 # ...prepare_html_subpages()
 
-# Get unique categories above the threshold for this image
-def get_positive_categories(detections,options):
+
+# Determine the confidence threshold we should use for a specific category name
+def get_threshold_for_category_name(category_name,options):
+
+    if isinstance(options.confidence_threshold,float):
+        return options.confidence_threshold
+    else:
+        assert isinstance(options.confidence_threshold,dict), \
+            'confidence_threshold must either be a float or a dict'
+
+        if category_name in options.confidence_threshold:
+
+            return options.confidence_threshold[category_name]
+
+        else:
+            assert 'default' in options.confidence_threshold, \
+                'category {} not in confidence_threshold dict, and no default supplied'.format(
+                    category_name)
+            return options.confidence_threshold['default']
+
+
+# Determine the confidence threshold we should use for a specific category ID
+#
+# detection_categories is a dict mapping category IDs to names.
+def get_threshold_for_category_id(category_id,options,detection_categories):
+
+    if isinstance(options.confidence_threshold,float):
+        return options.confidence_threshold
+
+    assert category_id in detection_categories, \
+        'Invalid category ID {}'.format(category_id)
+
+    category_name = detection_categories[category_id]
+
+    return get_threshold_for_category_name(category_name,options)
+
+
+# Get a sorted list of unique categories (as string IDs) above the threshold for this image
+#
+# "detection_categories" is a dict mapping category IDs to names.
+def get_positive_categories(detections,options,detection_categories):
     positive_categories = set()
     for d in detections:
-        if d['conf'] >= options.confidence_threshold:
+        threshold = get_threshold_for_category_id(d['category'], options, detection_categories)
+        if d['conf'] >= threshold:
             positive_categories.add(d['category'])
     return sorted(positive_categories)
 
 
+# Determine whether any positive detections are present in the detection list
+# [detections].
+def has_positive_detection(detections,options,detection_categories):
+
+    found_positive_detection = False
+    for d in detections:
+        threshold = get_threshold_for_category_id(d['category'], options, detection_categories)
+        if d['conf'] >= threshold:
+            found_positive_detection = True
+            break
+    return found_positive_detection
+
+
 # Render an image (with no ground truth information)
 #
 # Returns a list of rendering structs, where the first item is a category (e.g. "detections_animal"),
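
For orientation (not part of the diff): a standalone sketch of the lookup order used by the new threshold helpers, i.e. the exact category name first, then the "default" entry. The category map and threshold values below are hypothetical.

    # Standalone illustration of per-category threshold resolution.
    detection_categories = {'1': 'animal', '2': 'person', '3': 'vehicle'}
    confidence_threshold = {'animal': 0.15, 'default': 0.2}

    def threshold_for_category_id(category_id):
        # A float would apply uniformly; a dict is consulted by category *name*,
        # falling back to the "default" entry when the name is missing.
        if isinstance(confidence_threshold, float):
            return confidence_threshold
        name = detection_categories[category_id]
        return confidence_threshold.get(name, confidence_threshold['default'])

    assert threshold_for_category_id('1') == 0.15   # explicit entry for "animal"
    assert threshold_for_category_id('2') == 0.2    # falls back to "default"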
@@ -573,8 +650,12 @@ def render_image_no_gt(file_info,detection_categories_to_results_name,
     max_conf = file_info[1]
     detections = file_info[2]
 
+    # Determine whether any positive detections are present (using a threshold that
+    # may vary by category)
+    found_positive_detection = has_positive_detection(detections,options,detection_categories)
+
     detection_status = DetectionStatus.DS_UNASSIGNED
-    if max_conf >= options.confidence_threshold:
+    if found_positive_detection:
         detection_status = DetectionStatus.DS_POSITIVE
     else:
         if options.include_almost_detections:
@@ -587,7 +668,7 @@ def render_image_no_gt(file_info,detection_categories_to_results_name,
 
     if detection_status == DetectionStatus.DS_POSITIVE:
         if options.separate_detections_by_category:
-            positive_categories = tuple(get_positive_categories(detections,options))
+            positive_categories = tuple(get_positive_categories(detections,options,detection_categories))
             if positive_categories not in detection_categories_to_results_name:
                 raise ValueError('Error: {} not in category mapping (file {})'.format(
                     str(positive_categories),image_relative_path))
@@ -703,7 +784,7 @@ def render_image_with_gt(file_info,ground_truth_indexed_db,
                 f'ground truth status (status: {gt_status}, classes: {gt_class_summary})')
         return None
 
-    detected = max_conf > options.confidence_threshold
+    detected = has_positive_detection(detections, options, detection_categories)
 
     if gt_presence and detected:
         if '_classification_accuracy' not in image.keys():
@@ -766,6 +847,10 @@ def process_batch_results(options: PostProcessingOptions
 
     ground_truth_indexed_db = None
 
+    if (options.ground_truth_json_file is not None):
+        assert (options.confidence_threshold is None) or (isinstance(confidence_threshold,float)), \
+            'Variable confidence thresholds are not supported when supplying ground truth'
+
     if (options.ground_truth_json_file is not None) and (len(options.ground_truth_json_file) > 0):
 
         if options.separate_detections_by_category:
@@ -821,7 +906,7 @@ def process_batch_results(options: PostProcessingOptions
         n_failures = detections_df['failure'].count()
         print('Ignoring {} failed images'.format(n_failures))
         # Explicitly forcing a copy() operation here to suppress "trying to be set
-        # on a copy" # warnings (and associated risks) below.
+        # on a copy" warnings (and associated risks) below.
         detections_df = detections_df[detections_df['failure'].isna()].copy()
 
     assert other_fields is not None
@@ -836,31 +921,24 @@ def process_batch_results(options: PostProcessingOptions
             for k, v in classification_categories.items()
         }
 
-    # Add column 'pred_detection_label' to indicate predicted detection status.
-    #
-    # This column doesn't capture category information, it's just about detections,
-    # non-detections, and almost-detections.
-    det_status = 'pred_detection_label'
-    if options.include_almost_detections:
-        detections_df[det_status] = DetectionStatus.DS_ALMOST
-        confidences = detections_df['max_detection_conf']
-
-        pos_mask = (confidences >= options.confidence_threshold)
-        detections_df.loc[pos_mask, det_status] = DetectionStatus.DS_POSITIVE
-
-        neg_mask = (confidences < options.almost_detection_confidence_threshold)
-        detections_df.loc[neg_mask, det_status] = DetectionStatus.DS_NEGATIVE
-    else:
-        detections_df[det_status] = np.where(
-            detections_df['max_detection_conf'] >= options.confidence_threshold,
-            DetectionStatus.DS_POSITIVE, DetectionStatus.DS_NEGATIVE)
-
-    n_positives = sum(detections_df[det_status] == DetectionStatus.DS_POSITIVE)
+    # Count detections and almost-detections for reporting purposes
+    n_positives = 0
+    n_almosts = 0
+
+    for i_row,row in tqdm(detections_df.iterrows(),total=len(detections_df)):
+
+        detections = row['detections']
+        max_conf = row['max_detection_conf']
+        if has_positive_detection(detections, options, detection_categories):
+            n_positives += 1
+        elif (options.almost_detection_confidence_threshold is not None) and \
+            (max_conf >= options.almost_detection_confidence_threshold):
+            n_almosts += 1
+
     print(f'Finished loading and preprocessing {len(detections_df)} rows '
           f'from detector output, predicted {n_positives} positives.')
 
     if options.include_almost_detections:
-        n_almosts = sum(detections_df[det_status] == DetectionStatus.DS_ALMOST)
         print('...and {} almost-positives'.format(n_almosts))
 
 
@@ -1009,7 +1087,7 @@ def process_batch_results(options: PostProcessingOptions
             (precision_at_confidence_threshold + recall_at_confidence_threshold)
 
         print('At a confidence threshold of {:.1%}, precision={:.1%}, recall={:.1%}, f1={:.1%}'.format(
-            options.confidence_threshold, precision_at_confidence_threshold,
+            str(options.confidence_threshold), precision_at_confidence_threshold,
             recall_at_confidence_threshold, f1))
 
         ##%% Collect classification results, if they exist
@@ -1265,7 +1343,7 @@ def process_batch_results(options: PostProcessingOptions
         </div>
         """.format(
             style_header,job_name_string,model_version_string,
-            image_count, options.confidence_threshold,
+            image_count, str(options.confidence_threshold),
             all_tp_count, all_tp_count/total_count,
             image_counts['tn'], image_counts['tn']/total_count,
             image_counts['fp'], image_counts['fp']/total_count,
@@ -1279,7 +1357,7 @@ def process_batch_results(options: PostProcessingOptions
         <p><strong>Precision/recall summary for all {} images</strong></p><img src="{}"><br/>
         </div>
         """.format(
-            options.confidence_threshold, precision_at_confidence_threshold, recall_at_confidence_threshold,
+            str(options.confidence_threshold), precision_at_confidence_threshold, recall_at_confidence_threshold,
             len(detections_df), pr_figure_relative_filename
         )
 
@@ -1345,46 +1423,60 @@ def process_batch_results(options: PostProcessingOptions
     # Accumulate html image structs (in the format expected by write_html_image_list)
     # for each category
     images_html = collections.defaultdict(list)
-    images_html['non_detections']
+
 
     # Add default entries by accessing them for the first time
 
-    # Maps detection categories - e.g. "human" - to result set names, e.g.
-    # "detections_human"
+    # Maps sorted tuples of detection category IDs (string ints) - e.g. ("1"), ("1", "4", "7") - to
+    # result set names, e.g. "detections_human", "detections_cat_truck".
     detection_categories_to_results_name = {}
 
     # Keep track of which categories are single-class (e.g. "animal") and which are
     # combinations (e.g. "animal_vehicle")
     detection_categories_to_category_count = {}
-    detection_categories_to_category_count['detections'] = 0
+
+    # For the creation of a "non-detections" category
+    images_html['non_detections']
     detection_categories_to_category_count['non_detections'] = 0
-    detection_categories_to_category_count['almost_detections'] = 0
+
 
     if not options.separate_detections_by_category:
         # For the creation of a "detections" category
         images_html['detections']
+        detection_categories_to_category_count['detections'] = 0
     else:
         # Add a set of results for each category and combination of categories, e.g.
         # "detections_animal_vehicle". When we're using this script for non-MegaDetector
         # results, this can generate lots of categories, e.g. detections_bear_bird_cat_dog_pig.
         # We'll keep that huge set of combinations in this map, but we'll only write
         # out links for the ones that are non-empty.
-        keys = detection_categories.keys()
-        subsets = []
-        for L in range(1, len(keys)+1):
-            for subset in itertools.combinations(keys, L):
-                subsets.append(subset)
-        for subset in subsets:
-            sorted_subset = tuple(sorted(subset))
+        used_combinations = set()
+
+        # row = images_to_visualize.iloc[0]
+        for i_row, row in images_to_visualize.iterrows():
+            detections_this_row = row['detections']
+            above_threshold_category_ids_this_row = set()
+            for detection in detections_this_row:
+                threshold = get_threshold_for_category_id(detection['category'], options, detection_categories)
+                if detection['conf'] >= threshold:
+                    above_threshold_category_ids_this_row.add(detection['category'])
+            if len(above_threshold_category_ids_this_row) == 0:
+                continue
+            sorted_categories_this_row = tuple(sorted(above_threshold_category_ids_this_row))
+            used_combinations.add(sorted_categories_this_row)
+
+        for sorted_subset in used_combinations:
+            assert len(sorted_subset) > 0
             results_name = 'detections'
             for category_id in sorted_subset:
                 results_name = results_name + '_' + detection_categories[category_id]
             images_html[results_name]
             detection_categories_to_results_name[sorted_subset] = results_name
-            detection_categories_to_category_count[results_name] = len(sorted_subset)
+            detection_categories_to_category_count[results_name] = len(sorted_subset)
 
     if options.include_almost_detections:
         images_html['almost_detections']
+        detection_categories_to_category_count['almost_detections'] = 0
 
     # Create output directories
     for res in images_html.keys():
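
For orientation (not part of the diff): a standalone sketch of the new grouping behavior, which derives result-set names only from the category combinations that actually occur above threshold, rather than enumerating every possible subset with itertools.combinations. All IDs, names, and scores below are hypothetical.

    # Standalone sketch of result-set naming from observed category combinations.
    detection_categories = {'1': 'animal', '2': 'person', '3': 'vehicle'}
    threshold = 0.2

    images = [
        [{'category': '1', 'conf': 0.9}],                                  # animal only
        [{'category': '1', 'conf': 0.8}, {'category': '2', 'conf': 0.7}],  # animal + person
        [{'category': '3', 'conf': 0.05}],                                 # below threshold
    ]

    used_combinations = set()
    for detections in images:
        above = {d['category'] for d in detections if d['conf'] >= threshold}
        if above:
            used_combinations.add(tuple(sorted(above)))

    result_set_names = ['detections' + ''.join('_' + detection_categories[c] for c in combo)
                        for combo in sorted(used_combinations)]
    print(result_set_names)  # ['detections_animal', 'detections_animal_person']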
@@ -1495,9 +1587,15 @@ def process_batch_results(options: PostProcessingOptions
         almost_detection_string = ' (&ldquo;almost detection&rdquo; threshold at {:.1%})'.format(
             options.almost_detection_confidence_threshold)
 
+    confidence_threshold_string = ''
+    if isinstance(options.confidence_threshold,float):
+        confidence_threshold_string = '{:.1%}'.format(options.confidence_threshold)
+    else:
+        confidence_threshold_string = str(options.confidence_threshold)
+
     index_page = """<html>\n{}\n<body>\n
     <h2>Visualization of results for {}</h2>\n
-    <p>A sample of {} images (of {} total)FAILURE_PLACEHOLDER, annotated with detections above {:.1%} confidence{}.</p>\n
+    <p>A sample of {} images (of {} total)FAILURE_PLACEHOLDER, annotated with detections above confidence {}{}.</p>\n
 
     <div class="contentdiv">
     <p>Model version: {}</p>
@@ -1505,7 +1603,7 @@ def process_batch_results(options: PostProcessingOptions
 
     <h3>Sample images</h3>\n
     <div class="contentdiv">\n""".format(
-        style_header, job_name_string, image_count, len(detections_df), options.confidence_threshold,
+        style_header, job_name_string, image_count, len(detections_df), confidence_threshold_string,
         almost_detection_string, model_version_string)
 
     failure_string = ''
@@ -1521,7 +1619,17 @@ def process_batch_results(options: PostProcessingOptions
         friendly_name = friendly_name.capitalize()
         return friendly_name
 
-    for result_set_name in images_html.keys():
+    sorted_result_set_names = sorted(list(images_html.keys()))
+
+    result_set_name_to_count = {}
+    for result_set_name in sorted_result_set_names:
+        image_count = image_counts[result_set_name]
+        result_set_name_to_count[result_set_name] = image_count
+    sorted_result_set_names = sorted(sorted_result_set_names,
+                                     key=lambda x: result_set_name_to_count[x],
+                                     reverse=True)
+
+    for result_set_name in sorted_result_set_names:
 
         # Don't print classification classes here; we'll do that later with a slightly
         # different structure