megadetector 5.0.5-py3-none-any.whl → 5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (132)
  1. api/batch_processing/data_preparation/manage_local_batch.py +302 -263
  2. api/batch_processing/data_preparation/manage_video_batch.py +81 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
  5. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  6. api/batch_processing/postprocessing/load_api_results.py +56 -70
  7. api/batch_processing/postprocessing/md_to_coco.py +1 -1
  8. api/batch_processing/postprocessing/md_to_labelme.py +2 -1
  9. api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  11. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  12. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  13. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
  14. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  15. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  16. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
  17. classification/prepare_classification_script.py +191 -191
  18. data_management/coco_to_yolo.py +68 -45
  19. data_management/databases/integrity_check_json_db.py +7 -5
  20. data_management/generate_crops_from_cct.py +3 -3
  21. data_management/get_image_sizes.py +8 -6
  22. data_management/importers/add_timestamps_to_icct.py +79 -0
  23. data_management/importers/animl_results_to_md_results.py +160 -0
  24. data_management/importers/auckland_doc_test_to_json.py +4 -4
  25. data_management/importers/auckland_doc_to_json.py +1 -1
  26. data_management/importers/awc_to_json.py +5 -5
  27. data_management/importers/bellevue_to_json.py +5 -5
  28. data_management/importers/carrizo_shrubfree_2018.py +5 -5
  29. data_management/importers/carrizo_trail_cam_2017.py +5 -5
  30. data_management/importers/cct_field_adjustments.py +2 -3
  31. data_management/importers/channel_islands_to_cct.py +4 -4
  32. data_management/importers/ena24_to_json.py +5 -5
  33. data_management/importers/helena_to_cct.py +10 -10
  34. data_management/importers/idaho-camera-traps.py +12 -12
  35. data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
  36. data_management/importers/jb_csv_to_json.py +4 -4
  37. data_management/importers/missouri_to_json.py +1 -1
  38. data_management/importers/noaa_seals_2019.py +1 -1
  39. data_management/importers/pc_to_json.py +5 -5
  40. data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
  41. data_management/importers/prepare_zsl_imerit.py +5 -5
  42. data_management/importers/rspb_to_json.py +4 -4
  43. data_management/importers/save_the_elephants_survey_A.py +5 -5
  44. data_management/importers/save_the_elephants_survey_B.py +6 -6
  45. data_management/importers/snapshot_safari_importer.py +9 -9
  46. data_management/importers/snapshot_serengeti_lila.py +9 -9
  47. data_management/importers/timelapse_csv_set_to_json.py +5 -7
  48. data_management/importers/ubc_to_json.py +4 -4
  49. data_management/importers/umn_to_json.py +4 -4
  50. data_management/importers/wellington_to_json.py +1 -1
  51. data_management/importers/wi_to_json.py +2 -2
  52. data_management/importers/zamba_results_to_md_results.py +181 -0
  53. data_management/labelme_to_coco.py +35 -7
  54. data_management/labelme_to_yolo.py +229 -0
  55. data_management/lila/add_locations_to_island_camera_traps.py +1 -1
  56. data_management/lila/add_locations_to_nacti.py +147 -0
  57. data_management/lila/create_lila_blank_set.py +474 -0
  58. data_management/lila/create_lila_test_set.py +2 -1
  59. data_management/lila/create_links_to_md_results_files.py +106 -0
  60. data_management/lila/download_lila_subset.py +46 -21
  61. data_management/lila/generate_lila_per_image_labels.py +23 -14
  62. data_management/lila/get_lila_annotation_counts.py +17 -11
  63. data_management/lila/lila_common.py +14 -11
  64. data_management/lila/test_lila_metadata_urls.py +116 -0
  65. data_management/ocr_tools.py +829 -0
  66. data_management/resize_coco_dataset.py +13 -11
  67. data_management/yolo_output_to_md_output.py +84 -12
  68. data_management/yolo_to_coco.py +38 -20
  69. detection/process_video.py +36 -14
  70. detection/pytorch_detector.py +23 -8
  71. detection/run_detector.py +76 -19
  72. detection/run_detector_batch.py +178 -63
  73. detection/run_inference_with_yolov5_val.py +326 -57
  74. detection/run_tiled_inference.py +153 -43
  75. detection/video_utils.py +34 -8
  76. md_utils/ct_utils.py +172 -1
  77. md_utils/md_tests.py +372 -51
  78. md_utils/path_utils.py +167 -39
  79. md_utils/process_utils.py +26 -7
  80. md_utils/split_locations_into_train_val.py +215 -0
  81. md_utils/string_utils.py +10 -0
  82. md_utils/url_utils.py +0 -2
  83. md_utils/write_html_image_list.py +9 -26
  84. md_visualization/plot_utils.py +12 -8
  85. md_visualization/visualization_utils.py +106 -7
  86. md_visualization/visualize_db.py +16 -8
  87. md_visualization/visualize_detector_output.py +208 -97
  88. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
  89. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
  90. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  91. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  92. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  93. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  94. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  95. taxonomy_mapping/species_lookup.py +33 -13
  96. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  97. api/synchronous/api_core/yolov5/detect.py +0 -252
  98. api/synchronous/api_core/yolov5/export.py +0 -607
  99. api/synchronous/api_core/yolov5/hubconf.py +0 -146
  100. api/synchronous/api_core/yolov5/models/__init__.py +0 -0
  101. api/synchronous/api_core/yolov5/models/common.py +0 -738
  102. api/synchronous/api_core/yolov5/models/experimental.py +0 -104
  103. api/synchronous/api_core/yolov5/models/tf.py +0 -574
  104. api/synchronous/api_core/yolov5/models/yolo.py +0 -338
  105. api/synchronous/api_core/yolov5/train.py +0 -670
  106. api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
  107. api/synchronous/api_core/yolov5/utils/activations.py +0 -103
  108. api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
  109. api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
  110. api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
  111. api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
  112. api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
  113. api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
  114. api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
  115. api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
  116. api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
  117. api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
  118. api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
  119. api/synchronous/api_core/yolov5/utils/general.py +0 -1018
  120. api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
  121. api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
  122. api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
  123. api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
  124. api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
  125. api/synchronous/api_core/yolov5/utils/loss.py +0 -234
  126. api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
  127. api/synchronous/api_core/yolov5/utils/plots.py +0 -489
  128. api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
  129. api/synchronous/api_core/yolov5/val.py +0 -394
  130. md_utils/matlab_porting_tools.py +0 -97
  131. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  132. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
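The line-by-line diff reproduced below corresponds to item 13 above, api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py (+227 -75); the per-file diffs for the remaining files are not included in this excerpt.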
@@ -14,7 +14,10 @@ import warnings
  import sklearn.cluster
  import numpy as np
  import jsonpickle
+ import traceback
  import pandas as pd
+ import json
+ import shutil

  from tqdm import tqdm
  from operator import attrgetter
@@ -35,6 +38,8 @@ from api.batch_processing.postprocessing.postprocess_batch_results import relati
  from md_visualization.visualization_utils import open_image, render_detection_bounding_boxes
  from md_visualization import render_images_with_thumbnails
  from md_visualization import visualization_utils as vis_utils
+ from md_utils.path_utils import flatten_path
+ from md_utils.ct_utils import invert_dictionary

  # "PIL cannot read EXIF metainfo for the images"
  warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
@@ -42,10 +47,12 @@ warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
  # "Metadata Warning, tag 256 had too many entries: 42, expected 1"
  warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)

+ jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+

  #%% Constants

- DETECTION_INDEX_FILE_NAME = 'detectionIndex.json'
+ detection_index_file_name_base = 'detectionIndex.json'


  #%% Classes
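The hunk above moves the jsonpickle encoder configuration to module scope (it was previously set just before writing detectionIndex.json; see the hunk near the end of this diff). A minimal sketch of what those encoder options do, using only jsonpickle calls that appear in this diff; the Dummy class is invented for illustration:

    import jsonpickle

    # Same settings as the module-level call added in this diff
    jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)

    class Dummy:
        def __init__(self):
            self.b = 2
            self.a = 1

    # Keys come out sorted and indented by one space; make_refs=False (used later in
    # this file when writing per-location results) inlines repeated objects rather
    # than emitting py/id references.
    print(jsonpickle.encode(Dummy(), make_refs=False))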
@@ -74,26 +81,32 @@ class RepeatDetectionOptions:
  # How many occurrences of a single location (as defined by the IOU threshold)
  # are required before we declare it suspicious?
  occurrenceThreshold = 20
+
+ # Ignore "suspicious" detections smaller than some size
+ minSuspiciousDetectionSize = 0.0

  # Ignore "suspicious" detections larger than some size; these are often animals
  # taking up the whole image. This is expressed as a fraction of the image size.
  maxSuspiciousDetectionSize = 0.2

- # Ignore "suspicious" detections smaller than some size
- minSuspiciousDetectionSize = 0.0
-
  # Ignore folders with more than this many images in them
  maxImagesPerFolder = None

  # A list of classes we don't want to treat as suspicious. Each element is an int.
  excludeClasses = [] # [annotation_constants.detector_bbox_category_name_to_id['person']]

+ # For very large sets of results, passing chunks of results to and from workers as
+ # parameters ('memory') can be memory-intensive, so we can serialize to intermediate
+ # files instead ('file').
+ #
+ # The use of 'file' here is still experimental.
+ pass_detections_to_processes_method = 'memory'
+
  nWorkers = 10

+ # Should we use threads or processes for parallelization?
  parallelizationUsesThreads = True

- viz_target_width = 800
-
  # Load detections from a filter file rather than finding them from the detector output

  # .json file containing detections, generally this is the detectionIndex.json file in
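For context, a hedged sketch of how a caller might set the reordered and newly added options above. The attribute names are taken from this diff; the values are arbitrary examples, and the import path is an assumption based on the file list at the top of this page:

    # Illustrative only; the import path is assumed from the package layout shown above.
    from api.batch_processing.postprocessing.repeat_detection_elimination.repeat_detections_core \
        import RepeatDetectionOptions

    options = RepeatDetectionOptions()
    options.minSuspiciousDetectionSize = 0.0   # fraction of image area
    options.maxSuspiciousDetectionSize = 0.2   # boxes larger than this are ignored
    options.excludeClasses = []                # e.g. [2] to never flag person boxes (MD category 2)
    options.nWorkers = 10
    options.parallelizationUsesThreads = True
    # New (experimental) in this release: pass per-location work through intermediate
    # files instead of in memory.
    options.pass_detections_to_processes_method = 'memory'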
@@ -121,6 +134,10 @@ class RepeatDetectionOptions:
  bParallelizeComparisons = True
  bParallelizeRendering = True

+ # If this is False (default), a detection from class A is not considered to be "the same"
+ # as a detection from class B, even if they're at the same location.
+ categoryAgnosticComparisons = False
+
  # Determines whether bounding-box rendering errors (typically network errors) should
  # be treated as failures
  bFailOnRenderError = False
@@ -209,7 +226,7 @@ class RepeatDetectionResults:
  """

  # The data table (Pandas DataFrame), as loaded from the input json file via
- # load_api_results()
+ # load_api_results(). Has columns ['file', 'detections','failure'].
  detectionResults = None

  # The other fields in the input json file, loaded via load_api_results()
@@ -309,7 +326,7 @@ class DetectionLocation:
  return detection


- #%% Helper functions
+ #%% Support functions

  def enumerate_images(dirName,outputFileName=None):
  """
@@ -343,7 +360,7 @@ def render_bounding_box(detection, inputFileName, outputFileName, lineWidth=5,


  def detection_rect_to_rtree_rect(detection_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  l = detection_rect[0]
  b = detection_rect[1]
  r = detection_rect[0] + detection_rect[2]
@@ -352,7 +369,7 @@ def detection_rect_to_rtree_rect(detection_rect):


  def rtree_rect_to_detection_rect(rtree_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  x = rtree_rect[0]
  y = rtree_rect[1]
  w = rtree_rect[2] - rtree_rect[0]
@@ -360,12 +377,11 @@ def rtree_rect_to_detection_rect(rtree_rect):
  return (x,y,w,h)


- #%% Sort a list of candidate detections to make them visually easier to review
-
  def sort_detections_for_directory(candidateDetections,options):
  """
  candidateDetections is a list of DetectionLocation objects. Sorts them to
- put nearby detections next to each other, for easier visual review.
+ put nearby detections next to each other, for easier visual review. Returns
+ a sorted copy of candidateDetections, does not sort in-place.
  """

  if len(candidateDetections) <= 1 or options.smartSort is None:
@@ -458,13 +474,24 @@ def sort_detections_for_directory(candidateDetections,options):
  raise ValueError('Unrecognized sort method {}'.format(
  options.smartSort))

-
- #%% Look for matches (one directory)
+ # ...def sort_detections_for_directory(...)
+

  def find_matches_in_directory(dirNameAndRows, options):
  """
  dirNameAndRows is a tuple of (name,rows).

+ "name" is a location name, typically a folder name.
+
+ "rows" is a Pandas dataframe with one row per image in this location, with columns:
+
+ * 'file': relative file name
+ * 'detections': a list of MD detection objects, i.e. dicts with keys ['category','conf','bbox']
+ * 'max_detection_conf': maximum confidence of any detection, in any category
+
+ "rows" can also point to a .csv file, in which case the detection table will be read from that
+ .csv file, and results will be written to a .csv file rather than being returned.
+
  Find all unique detections in this directory.

  Returns a list of DetectionLocation objects.
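The expanded docstring above describes the expected shape of the 'rows' argument; a small illustrative dataframe in that shape (file names and values invented):

    import pandas as pd

    # One row per image; 'detections' holds MD-style dicts with 'category', 'conf',
    # and 'bbox' ([x, y, width, height], normalized to the image size).
    rows = pd.DataFrame([
        {
            'file': 'loc01/IMG_0001.JPG',
            'detections': [{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}],
            'max_detection_conf': 0.92
        },
        {
            'file': 'loc01/IMG_0002.JPG',
            'detections': [],
            'max_detection_conf': 0.0
        }
    ])

    dirNameAndRows = ('loc01', rows)
    # candidateDetections = find_matches_in_directory(dirNameAndRows, options)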
@@ -476,11 +503,21 @@ def find_matches_in_directory(dirNameAndRows, options):
  # Create a tree to store candidate detections
  candidateDetectionsIndex = pyqtree.Index(bbox=(-0.1,-0.1,1.1,1.1))

- assert len(dirNameAndRows) == 2
- assert isinstance(dirNameAndRows[0],str)
- dirName = dirNameAndRows[0]
+ assert len(dirNameAndRows) == 2, 'find_matches_in_directory: invalid input'
+ assert isinstance(dirNameAndRows[0],str), 'find_matches_in_directory: invalid location name'
+ dirName = dirNameAndRows[0]
  rows = dirNameAndRows[1]
-
+
+ detections_loaded_from_csv_file = None
+
+ if isinstance(rows,str):
+ detections_loaded_from_csv_file = rows
+ print('Loading results for location {} from {}'.format(
+ dirName,detections_loaded_from_csv_file))
+ rows = pd.read_csv(detections_loaded_from_csv_file)
+ # Pandas writes out detections out as strings, convert them back to lists
+ rows['detections'] = rows['detections'].apply(lambda s: json.loads(s.replace('\'','"')))
+
  if options.maxImagesPerFolder is not None and len(rows) > options.maxImagesPerFolder:
  print('Ignoring directory {} because it has {} images (limit set to {})'.format(
  dirName,len(rows),options.maxImagesPerFolder))
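The new CSV branch above relies on pandas writing the 'detections' column as a Python-repr string, then converting it back with a quote swap and json.loads. A standalone sketch of that round trip (illustrative; the quote swap assumes no apostrophes inside the serialized values):

    import json
    import pandas as pd

    df = pd.DataFrame([{
        'file': 'loc01/IMG_0001.JPG',
        'detections': [{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}]
    }])

    df.to_csv('loc01.csv', header=True, index=False)

    # After reading back, 'detections' is a string like
    # "[{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}]"
    df2 = pd.read_csv('loc01.csv')
    df2['detections'] = df2['detections'].apply(lambda s: json.loads(s.replace('\'', '"')))

    assert df2['detections'].iloc[0][0]['conf'] == 0.92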
@@ -535,7 +572,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  # }
  detections = row['detections']
  if isinstance(detections,float):
- assert isinstance(row['failure'],str)
+ assert isinstance(row['failure'],str), 'Expected failure indicator'
  print('Skipping failed image {} ({})'.format(filename,row['failure']))
  continue

@@ -550,8 +587,9 @@ def find_matches_in_directory(dirNameAndRows, options):
  print('Skipping detection {}'.format(iDetection))
  continue

- assert 'category' in detection and 'conf' in detection and \
- 'bbox' in detection
+ assert 'category' in detection and \
+ 'conf' in detection and \
+ 'bbox' in detection, 'Illegal detection'

  confidence = detection['conf']

@@ -568,7 +606,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  continue

  # Optionally exclude some classes from consideration as suspicious
- if len(options.excludeClasses) > 0:
+ if (options.excludeClasses is not None) and (len(options.excludeClasses) > 0):
  iClass = int(detection['category'])
  if iClass in options.excludeClasses:
  continue
@@ -584,8 +622,12 @@ def find_matches_in_directory(dirNameAndRows, options):

  area = h * w

+ if area < 0:
+ print('Warning: negative-area bounding box for file {}'.format(filename))
+ area = abs(area); h = abs(h); w = abs(w)
+
  assert area >= 0.0 and area <= 1.0, \
- 'Illegal bounding box area {}'.format(area)
+ 'Illegal bounding box area {} in image {}'.format(area,filename)

  if area < options.minSuspiciousDetectionSize:
  continue
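The size checks above operate on normalized box areas. A small worked example of how a detection's area relates to the min/max thresholds (values invented; the defaults match the options shown earlier in this diff):

    # MD bboxes are [x, y, width, height], normalized to image dimensions, so
    # w * h is the fraction of the image covered by the box.
    detection = {'category': '1', 'conf': 0.85, 'bbox': [0.05, 0.10, 0.40, 0.60]}

    x, y, w, h = detection['bbox']
    area = h * w   # 0.24, i.e. 24% of the image

    minSuspiciousDetectionSize = 0.0
    maxSuspiciousDetectionSize = 0.2

    # With these defaults, this box is too large to be treated as suspicious.
    is_candidate = (area >= minSuspiciousDetectionSize) and (area <= maxSuspiciousDetectionSize)
    print(is_candidate)  # False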
@@ -615,7 +657,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  overlappingCandidateDetections):

  # Don't match across categories
- if candidate.category != category:
+ if (candidate.category != category) and (not (options.categoryAgnosticComparisons)):
  continue

  # Is this a match?
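The category check above runs inside a loop over candidates returned by a pyqtree intersection query; candidate locations are stored in the quadtree keyed by their l/b/r/t rectangle (see detection_rect_to_rtree_rect earlier in this diff), so each new detection is only compared against candidates it overlaps. A self-contained sketch of that insert/query pattern, using the pyqtree calls visible in this diff plus intersect() for the lookup:

    import pyqtree

    # Detections are x/y/w/h; rtree/pyqtree rectangles are l/b/r/t.
    def detection_rect_to_rtree_rect(rect):
        x, y, w, h = rect
        return (x, y, x + w, y + h)

    index = pyqtree.Index(bbox=(-0.1, -0.1, 1.1, 1.1))

    candidate_bbox = [0.10, 0.20, 0.30, 0.40]
    index.insert(item={'category': '1', 'bbox': candidate_bbox},
                 bbox=detection_rect_to_rtree_rect(candidate_bbox))

    # A later detection at almost the same location finds the candidate via an
    # intersection query; it would then be compared by IOU, and by category unless
    # categoryAgnosticComparisons is set.
    query_bbox = [0.11, 0.21, 0.30, 0.40]
    overlapping = index.intersect(detection_rect_to_rtree_rect(query_bbox))
    print(len(overlapping))  # 1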
@@ -649,9 +691,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidate = DetectionLocation(instance=instance,
  detection=detection, relativeDir=dirName,
  category=category, id=i_iteration)
-
- # candidateDetections.append(candidate)
-
+
  # pyqtree
  candidateDetectionsIndex.insert(item=candidate,bbox=rtree_rect)

@@ -669,20 +709,45 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidateDetections.sort(
  key=lambda x: x.id, reverse=False)

- return candidateDetections
-
- # ...def find_matches_in_directory(dirName)
+ if detections_loaded_from_csv_file is not None:
+ location_results_file = \
+ os.path.splitext(detections_loaded_from_csv_file)[0] + \
+ '_results.json'
+ print('Writing results for location {} to {}'.format(
+ dirName,location_results_file))
+ s = jsonpickle.encode(candidateDetections,make_refs=False)
+ with open(location_results_file,'w') as f:
+ f.write(s)
+ # json.dump(candidateDetections,f,indent=1)
+ return location_results_file
+ else:
+ return candidateDetections

+ # ...def find_matches_in_directory(...)

- #%% Update the detection table based on suspicious results, write .csv output

- def update_detection_table(RepeatDetectionResults, options, outputFilename=None):
+ def update_detection_table(repeatDetectionResults, options, outputFilename=None):
+ """
+ Changes confidence values in repeatDetectionResults.detectionResults so that detections
+ deemed to be possible false positives are given negative confidence values.
+
+ repeatDetectionResults is an object of type RepeatDetectionResults, with a pandas
+ dataframe (detectionResults) containing all the detections loaded from the .json file,
+ and a list of detections for each location (suspiciousDetections) that are deemed to
+ be suspicious.
+
+ returns the modified pandas dataframe (repeatDetectionResults.detectionResults), but
+ also modifies it in place.
+ """

- detectionResults = RepeatDetectionResults.detectionResults
+ # This is the pandas dataframe that contains actual detection results.
+ #
+ # Has fields ['file', 'detections','failure'].
+ detectionResults = repeatDetectionResults.detectionResults

  # An array of length nDirs, where each element is a list of DetectionLocation
  # objects for that directory that have been flagged as suspicious
- suspiciousDetectionsByDirectory = RepeatDetectionResults.suspiciousDetections
+ suspiciousDetectionsByDirectory = repeatDetectionResults.suspiciousDetections

  nBboxChanges = 0

@@ -711,8 +776,8 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
  # if iou < options.iouThreshold:
  # print('IOU warning: {},{}'.format(iou,options.iouThreshold))

- assert instance.filename in RepeatDetectionResults.filenameToRow
- iRow = RepeatDetectionResults.filenameToRow[instance.filename]
+ assert instance.filename in repeatDetectionResults.filenameToRow
+ iRow = repeatDetectionResults.filenameToRow[instance.filename]
  row = detectionResults.iloc[iRow]
  rowDetections = row['detections']
  detectionToModify = rowDetections[instance.iDetection]
@@ -796,7 +861,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  # If we're also writing output...
  if outputFilename is not None and len(outputFilename) > 0:
- write_api_results(detectionResults, RepeatDetectionResults.otherFields,
+ write_api_results(detectionResults, repeatDetectionResults.otherFields,
  outputFilename)

  print(
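The hunk above writes the updated table back out via write_api_results(); per the docstring added earlier, repeat detections are kept but given negative confidence values. A hedged sketch of how a consumer of that output file might drop them afterwards (the file name is invented; the structure is the standard MD batch-output format):

    import json

    # Stand-in for whatever file update_detection_table() / write_api_results() produced
    with open('filtered_output.json', 'r') as f:
        results = json.load(f)

    for im in results['images']:
        if ('detections' not in im) or (im['detections'] is None):
            continue
        # Detections flagged as repeats carry negative confidence values
        im['detections'] = [d for d in im['detections'] if d['conf'] >= 0]

    with open('filtered_output_cleaned.json', 'w') as f:
        json.dump(results, f, indent=1)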
@@ -805,7 +870,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  return detectionResults

- # ...def update_detection_table(RepeatDetectionResults,options)
+ # ...def update_detection_table(...)


  def render_sample_image_for_detection(detection,filteringDir,options):
@@ -841,12 +906,12 @@ def render_sample_image_for_detection(detection,filteringDir,options):

  try:

+ im = open_image(inputFullPath)
+
  # Should we render (typically in a very light color) detections
  # *other* than the one we're highlighting here?
  if options.bRenderOtherDetections:
-
- im = open_image(inputFullPath)
-
+
  # Optionally resize the output image
  if (options.maxOutputImageWidth is not None) and \
  (im.size[0] > options.maxOutputImageWidth):
@@ -892,6 +957,10 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  render_bounding_box(detection, inputFullPath, outputFullPath,
  lineWidth=options.lineThickness, expansion=options.boxExpansion)

+ # ...if we are/aren't rendering other bounding boxes
+
+ # If we're rendering detection tiles, we'll re-load and re-write the image we
+ # just wrote to outputFullPath
  if options.bRenderDetectionTiles:

  assert not is_sas_url(options.imageBase), "Can't render detection tiles from SAS URLs"
@@ -899,6 +968,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  if options.detectionTilesPrimaryImageWidth is not None:
  primaryImageWidth = options.detectionTilesPrimaryImageWidth
  else:
+ # "im" may be a resized version of the original image, if we've already run
+ # the code to render other bounding boxes.
  primaryImageWidth = im.size[0]

  if options.detectionTilesCroppedGridWidth <= 1.0:
@@ -922,7 +993,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  secondaryImageFilenameList[0:options.detectionTilesMaxCrops]
  secondaryImageBoundingBoxList = \
  secondaryImageBoundingBoxList[0:options.detectionTilesMaxCrops]
-
+
+ # This will over-write the image we've already written to outputFullPath
  render_images_with_thumbnails.render_images_with_thumbnails(
  primary_image_filename=outputFullPath,
  primary_image_width=primaryImageWidth,
@@ -936,16 +1008,20 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  # bDetectionTilesCroppedGridWidth = 0.6
  # bDetectionTilesPrimaryImageLocation='right'

- # ...if we are/aren't rendering other bounding boxes
+ # ...if we are/aren't rendering detection tiles

  except Exception as e:
- print('Warning: error rendering bounding box from {} to {}: {}'.format(
- inputFullPath,outputFullPath,e))
+
+ stack_trace = traceback.format_exc()
+ print('Warning: error rendering bounding box from {} to {}: {} ({})'.format(
+ inputFullPath,outputFullPath,e,stack_trace))
  if options.bFailOnRenderError:
  raise
-

- #%% Main function
+ # ...def render_sample_image_for_detection(...)
+
+
+ #%% Main entry point

  def find_repeat_detections(inputFilename, outputFilename=None, options=None):

@@ -998,9 +1074,9 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # Load file to a pandas dataframe. Also populates 'max_detection_conf', even if it's
  # not present in the .json file.
-
  detectionResults, otherFields = load_api_results(inputFilename, normalize_paths=True,
- filename_replacements=options.filenameReplacements)
+ filename_replacements=options.filenameReplacements,
+ force_forward_slashes=True)
  toReturn.detectionResults = detectionResults
  toReturn.otherFields = otherFields

@@ -1024,7 +1100,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)


- ##%% Separate files into directories
+ ##%% Separate files into locations

  # This will be a map from a directory name to smaller data frames
  rowsByDirectory = {}
@@ -1032,12 +1108,12 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  # This is a mapping back into the rows of the original table
  filenameToRow = {}

- print('Separating files into directories...')
+ print('Separating images into locations...')

  nCustomDirReplacements = 0

  # iRow = 0; row = detectionResults.iloc[0]
- for iRow, row in detectionResults.iterrows():
+ for iRow, row in tqdm(detectionResults.iterrows(),total=len(detectionResults)):

  relativePath = row['file']

@@ -1075,7 +1151,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  if options.customDirNameFunction is not None:
  print('Custom dir name function made {} replacements (of {} images)'.format(
  nCustomDirReplacements,len(detectionResults)))
-
+
  # Convert lists of rows to proper DataFrames
  dirs = list(rowsByDirectory.keys())
  for d in dirs:
@@ -1084,11 +1160,10 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  toReturn.rowsByDirectory = rowsByDirectory
  toReturn.filenameToRow = filenameToRow

- print('Finished separating {} files into {} directories'.format(len(detectionResults),
- len(rowsByDirectory)))
-
+ print('Finished separating {} files into {} locations'.format(len(detectionResults),
+ len(rowsByDirectory)))

- ##% Look for matches (or load them from file)
+ ##% Look for repeat detections (or load them from file)

  dirsToSearch = list(rowsByDirectory.keys())
  if options.debugMaxDir > 0:
@@ -1115,6 +1190,11 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  allCandidateDetections = [None] * len(dirsToSearch)

+ # If we serialize results to intermediate files, we need to remove slashes from
+ # location names; we store mappings here.
+ normalized_location_name_to_location_name = None
+ location_name_to_normalized_location_name = None
+
  if not options.bParallelizeComparisons:

  options.pbar = None
@@ -1132,7 +1212,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Pool of {} requested, but only {} folders available, reducing pool to {}'.\
  format(n_workers,len(dirNameAndRows),len(dirNameAndRows)))
  n_workers = len(dirNameAndRows)
-
+
  if options.parallelizationUsesThreads:
  pool = ThreadPool(n_workers); poolstring = 'threads'
  else:
@@ -1140,24 +1220,96 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Starting comparison pool with {} {}'.format(n_workers,poolstring))

- # We get slightly nicer progress bar behavior using threads, by passing a pbar
- # object and letting it get updated. We can't serialize this object across
- # processes.
- if options.parallelizationUsesThreads:
- options.pbar = tqdm(total=len(dirNameAndRows))
- allCandidateDetections = list(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows))
- else:
+ assert options.pass_detections_to_processes_method in ('file','memory'), \
+ 'Unrecognized IPC mechanism: {}'.format(options.pass_detections_to_processes_method)
+
+ # ** Experimental **
+ #
+ # Rather than passing detections and results around in memory, write detections and
+ # results for each worker to intermediate files. May improve performance for very large
+ # results sets that exceed working memory.
+ if options.pass_detections_to_processes_method == 'file':
+
+ ##%% Convert location names to normalized names we can write to files
+
+ normalized_location_name_to_location_name = {}
+ for dir_name in dirsToSearch:
+ normalized_location_name = flatten_path(dir_name)
+ assert normalized_location_name not in normalized_location_name_to_location_name, \
+ 'Redundant location name {}, can\'t serialize to intermediate files'.format(
+ dir_name)
+ normalized_location_name_to_location_name[normalized_location_name] = dir_name
+
+ location_name_to_normalized_location_name = \
+ invert_dictionary(normalized_location_name_to_location_name)
+
+
+ ##%% Write results to files for each location
+
+ print('Writing results to intermediate files')
+
+ intermediate_json_file_folder = os.path.join(options.outputBase,'intermediate_results')
+ os.makedirs(intermediate_json_file_folder,exist_ok=True)
+
+ # i_location = 0; location_info = dirNameAndRows[0]
+ dirNameAndIntermediateFile = []
+
+ # i_location = 0; location_info = dirNameAndRows[i_location]
+ for i_location, location_info in tqdm(enumerate(dirNameAndRows)):
+
+ location_name = location_info[0]
+ assert location_name in location_name_to_normalized_location_name
+ normalized_location_name = location_name_to_normalized_location_name[location_name]
+ intermediate_results_file = os.path.join(intermediate_json_file_folder,
+ normalized_location_name + '.csv')
+ detections_table_this_location = location_info[1]
+ detections_table_this_location.to_csv(intermediate_results_file,header=True,index=False)
+ dirNameAndIntermediateFile.append((location_name,intermediate_results_file))
+
+
+ ##%% Find detections in each directory
+
  options.pbar = None
- allCandidateDetections = list(tqdm(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows)))
+ allCandidateDetectionFiles = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndIntermediateFile))
+
+
+ ##%% Load into a combined list of candidate detections
+
+ allCandidateDetections = []
+
+ # candidate_detection_file = allCandidateDetectionFiles[0]
+ for candidate_detection_file in allCandidateDetectionFiles:
+ s = open(candidate_detection_file, 'r').read()
+ candidate_detections_this_file = jsonpickle.decode(s)
+ allCandidateDetections.append(candidate_detections_this_file)
+
+
+ ##%% Clean up intermediate files
+
+ shutil.rmtree(intermediate_json_file_folder)
+
+ # If we're passing things around in memory, rather than via intermediate files
+ else:
+
+ # We get slightly nicer progress bar behavior using threads, by passing a pbar
+ # object and letting it get updated. We can't serialize this object across
+ # processes.
+ if options.parallelizationUsesThreads:
+ options.pbar = tqdm(total=len(dirNameAndRows))
+ allCandidateDetections = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows))
+ else:
+ options.pbar = None
+ allCandidateDetections = list(tqdm(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows)))

  print('\nFinished looking for similar detections')


- ##%% Find suspicious locations based on match results
+ ##%% Mark suspicious locations based on match results

- print('Searching for repeat detections...')
+ print('Marking repeat detections...')
  nImagesWithSuspiciousDetections = 0
  nSuspiciousDetections = 0
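The experimental 'file' path above round-trips location names through flatten_path() and invert_dictionary() so that each location's detection table can be written to a flat intermediate file name. A sketch of that mapping step, assuming flatten_path() simply replaces path separators and invert_dictionary() swaps keys and values (the real helpers in md_utils may differ):

    import os

    def flatten_path(p, separator_replacement='~'):
        # Assumed stand-in for md_utils.path_utils.flatten_path
        return p.replace('\\', separator_replacement).replace('/', separator_replacement)

    def invert_dictionary(d):
        # Assumed stand-in for md_utils.ct_utils.invert_dictionary
        return {v: k for k, v in d.items()}

    dirsToSearch = ['camera01/2023-06', 'camera02/2023-06']

    normalized_to_original = {}
    for dir_name in dirsToSearch:
        normalized = flatten_path(dir_name)
        assert normalized not in normalized_to_original, 'Redundant location name'
        normalized_to_original[normalized] = dir_name

    original_to_normalized = invert_dictionary(normalized_to_original)

    # Each location's detections would then go to e.g. intermediate_results/camera01~2023-06.csv
    print(os.path.join('intermediate_results',
                       original_to_normalized['camera01/2023-06'] + '.csv'))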
@@ -1198,7 +1350,8 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # ...for each directory

- print('Finished searching for repeat detections')
+ print('Finished marking repeat detections')
+
  print('Found {} unique detections on {} images that are suspicious'.format(
  nSuspiciousDetections, nImagesWithSuspiciousDetections))

@@ -1367,8 +1520,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  detection.sampleImageDetections = None

  # Write out the detection index
- detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
- jsonpickle.set_encoder_options('json', sort_keys=True, indent=2)
+ detectionIndexFileName = os.path.join(filteringDir, detection_index_file_name_base)

  # Prepare the data we're going to write to the detection index file
  detectionInfo = {}
@@ -1392,4 +1544,4 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  return toReturn

- # ...find_repeat_detections()
+ # ...def find_repeat_detections()