megadetector 5.0.6__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py
@@ -14,7 +14,10 @@ import warnings
  import sklearn.cluster
  import numpy as np
  import jsonpickle
+ import traceback
  import pandas as pd
+ import json
+ import shutil

  from tqdm import tqdm
  from operator import attrgetter
@@ -35,6 +38,8 @@ from api.batch_processing.postprocessing.postprocess_batch_results import relati
  from md_visualization.visualization_utils import open_image, render_detection_bounding_boxes
  from md_visualization import render_images_with_thumbnails
  from md_visualization import visualization_utils as vis_utils
+ from md_utils.path_utils import flatten_path
+ from md_utils.ct_utils import invert_dictionary

  # "PIL cannot read EXIF metainfo for the images"
  warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
@@ -42,10 +47,12 @@ warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
  # "Metadata Warning, tag 256 had too many entries: 42, expected 1"
  warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)

+ jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+

  #%% Constants

- DETECTION_INDEX_FILE_NAME = 'detectionIndex.json'
+ detection_index_file_name_base = 'detectionIndex.json'


  #%% Classes
@@ -75,25 +82,31 @@ class RepeatDetectionOptions:
  # are required before we declare it suspicious?
  occurrenceThreshold = 20

+ # Ignore "suspicious" detections smaller than some size
+ minSuspiciousDetectionSize = 0.0
+
  # Ignore "suspicious" detections larger than some size; these are often animals
  # taking up the whole image. This is expressed as a fraction of the image size.
  maxSuspiciousDetectionSize = 0.2

- # Ignore "suspicious" detections smaller than some size
- minSuspiciousDetectionSize = 0.0
-
  # Ignore folders with more than this many images in them
  maxImagesPerFolder = None

  # A list of classes we don't want to treat as suspicious. Each element is an int.
  excludeClasses = [] # [annotation_constants.detector_bbox_category_name_to_id['person']]

+ # For very large sets of results, passing chunks of results to and from workers as
+ # parameters ('memory') can be memory-intensive, so we can serialize to intermediate
+ # files instead ('file').
+ #
+ # The use of 'file' here is still experimental.
+ pass_detections_to_processes_method = 'memory'
+
  nWorkers = 10

+ # Should we use threads or processes for parallelization?
  parallelizationUsesThreads = True

- viz_target_width = 800
-
  # Load detections from a filter file rather than finding them from the detector output

  # .json file containing detections, generally this is the detectionIndex.json file in
@@ -213,7 +226,7 @@ class RepeatDetectionResults:
  """

  # The data table (Pandas DataFrame), as loaded from the input json file via
- # load_api_results()
+ # load_api_results(). Has columns ['file', 'detections','failure'].
  detectionResults = None

  # The other fields in the input json file, loaded via load_api_results()
@@ -313,7 +326,7 @@ class DetectionLocation:
  return detection


- #%% Helper functions
+ #%% Support functions

  def enumerate_images(dirName,outputFileName=None):
  """
@@ -347,7 +360,7 @@ def render_bounding_box(detection, inputFileName, outputFileName, lineWidth=5,


  def detection_rect_to_rtree_rect(detection_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  l = detection_rect[0]
  b = detection_rect[1]
  r = detection_rect[0] + detection_rect[2]
@@ -356,7 +369,7 @@ def detection_rect_to_rtree_rect(detection_rect):


  def rtree_rect_to_detection_rect(rtree_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  x = rtree_rect[0]
  y = rtree_rect[1]
  w = rtree_rect[2] - rtree_rect[0]
@@ -364,12 +377,11 @@ def rtree_rect_to_detection_rect(rtree_rect):
  return (x,y,w,h)


- #%% Sort a list of candidate detections to make them visually easier to review
-
  def sort_detections_for_directory(candidateDetections,options):
  """
  candidateDetections is a list of DetectionLocation objects. Sorts them to
- put nearby detections next to each other, for easier visual review.
+ put nearby detections next to each other, for easier visual review. Returns
+ a sorted copy of candidateDetections, does not sort in-place.
  """

  if len(candidateDetections) <= 1 or options.smartSort is None:
@@ -462,13 +474,24 @@ def sort_detections_for_directory(candidateDetections,options):
  raise ValueError('Unrecognized sort method {}'.format(
  options.smartSort))

-
- #%% Look for matches (one directory)
+ # ...def sort_detections_for_directory(...)
+

  def find_matches_in_directory(dirNameAndRows, options):
  """
  dirNameAndRows is a tuple of (name,rows).

+ "name" is a location name, typically a folder name.
+
+ "rows" is a Pandas dataframe with one row per image in this location, with columns:
+
+ * 'file': relative file name
+ * 'detections': a list of MD detection objects, i.e. dicts with keys ['category','conf','bbox']
+ * 'max_detection_conf': maximum confidence of any detection, in any category
+
+ "rows" can also point to a .csv file, in which case the detection table will be read from that
+ .csv file, and results will be written to a .csv file rather than being returned.
+
  Find all unique detections in this directory.

  Returns a list of DetectionLocation objects.
@@ -480,11 +503,21 @@ def find_matches_in_directory(dirNameAndRows, options):
  # Create a tree to store candidate detections
  candidateDetectionsIndex = pyqtree.Index(bbox=(-0.1,-0.1,1.1,1.1))

- assert len(dirNameAndRows) == 2
- assert isinstance(dirNameAndRows[0],str)
- dirName = dirNameAndRows[0]
+ assert len(dirNameAndRows) == 2, 'find_matches_in_directory: invalid input'
+ assert isinstance(dirNameAndRows[0],str), 'find_matches_in_directory: invalid location name'
+ dirName = dirNameAndRows[0]
  rows = dirNameAndRows[1]
-
+
+ detections_loaded_from_csv_file = None
+
+ if isinstance(rows,str):
+ detections_loaded_from_csv_file = rows
+ print('Loading results for location {} from {}'.format(
+ dirName,detections_loaded_from_csv_file))
+ rows = pd.read_csv(detections_loaded_from_csv_file)
+ # Pandas writes out detections out as strings, convert them back to lists
+ rows['detections'] = rows['detections'].apply(lambda s: json.loads(s.replace('\'','"')))
+
  if options.maxImagesPerFolder is not None and len(rows) > options.maxImagesPerFolder:
  print('Ignoring directory {} because it has {} images (limit set to {})'.format(
  dirName,len(rows),options.maxImagesPerFolder))
@@ -539,7 +572,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  # }
  detections = row['detections']
  if isinstance(detections,float):
- assert isinstance(row['failure'],str)
+ assert isinstance(row['failure'],str), 'Expected failure indicator'
  print('Skipping failed image {} ({})'.format(filename,row['failure']))
  continue

@@ -554,8 +587,9 @@ def find_matches_in_directory(dirNameAndRows, options):
  print('Skipping detection {}'.format(iDetection))
  continue

- assert 'category' in detection and 'conf' in detection and \
- 'bbox' in detection
+ assert 'category' in detection and \
+ 'conf' in detection and \
+ 'bbox' in detection, 'Illegal detection'

  confidence = detection['conf']

@@ -572,7 +606,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  continue

  # Optionally exclude some classes from consideration as suspicious
- if len(options.excludeClasses) > 0:
+ if (options.excludeClasses is not None) and (len(options.excludeClasses) > 0):
  iClass = int(detection['category'])
  if iClass in options.excludeClasses:
  continue
@@ -588,8 +622,12 @@ def find_matches_in_directory(dirNameAndRows, options):

  area = h * w

+ if area < 0:
+ print('Warning: negative-area bounding box for file {}'.format(filename))
+ area = abs(area); h = abs(h); w = abs(w)
+
  assert area >= 0.0 and area <= 1.0, \
- 'Illegal bounding box area {}'.format(area)
+ 'Illegal bounding box area {} in image {}'.format(area,filename)

  if area < options.minSuspiciousDetectionSize:
  continue
@@ -653,9 +691,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidate = DetectionLocation(instance=instance,
  detection=detection, relativeDir=dirName,
  category=category, id=i_iteration)
-
- # candidateDetections.append(candidate)
-
+
  # pyqtree
  candidateDetectionsIndex.insert(item=candidate,bbox=rtree_rect)

@@ -673,20 +709,45 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidateDetections.sort(
  key=lambda x: x.id, reverse=False)

- return candidateDetections
-
- # ...def find_matches_in_directory(dirName)
+ if detections_loaded_from_csv_file is not None:
+ location_results_file = \
+ os.path.splitext(detections_loaded_from_csv_file)[0] + \
+ '_results.json'
+ print('Writing results for location {} to {}'.format(
+ dirName,location_results_file))
+ s = jsonpickle.encode(candidateDetections,make_refs=False)
+ with open(location_results_file,'w') as f:
+ f.write(s)
+ # json.dump(candidateDetections,f,indent=1)
+ return location_results_file
+ else:
+ return candidateDetections

+ # ...def find_matches_in_directory(...)

- #%% Update the detection table based on suspicious results, write .csv output

- def update_detection_table(RepeatDetectionResults, options, outputFilename=None):
+ def update_detection_table(repeatDetectionResults, options, outputFilename=None):
+ """
+ Changes confidence values in repeatDetectionResults.detectionResults so that detections
+ deemed to be possible false positives are given negative confidence values.
+
+ repeatDetectionResults is an object of type RepeatDetectionResults, with a pandas
+ dataframe (detectionResults) containing all the detections loaded from the .json file,
+ and a list of detections for each location (suspiciousDetections) that are deemed to
+ be suspicious.
+
+ returns the modified pandas dataframe (repeatDetectionResults.detectionResults), but
+ also modifies it in place.
+ """

- detectionResults = RepeatDetectionResults.detectionResults
+ # This is the pandas dataframe that contains actual detection results.
+ #
+ # Has fields ['file', 'detections','failure'].
+ detectionResults = repeatDetectionResults.detectionResults

  # An array of length nDirs, where each element is a list of DetectionLocation
  # objects for that directory that have been flagged as suspicious
- suspiciousDetectionsByDirectory = RepeatDetectionResults.suspiciousDetections
+ suspiciousDetectionsByDirectory = repeatDetectionResults.suspiciousDetections

  nBboxChanges = 0

@@ -715,8 +776,8 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
  # if iou < options.iouThreshold:
  # print('IOU warning: {},{}'.format(iou,options.iouThreshold))

- assert instance.filename in RepeatDetectionResults.filenameToRow
- iRow = RepeatDetectionResults.filenameToRow[instance.filename]
+ assert instance.filename in repeatDetectionResults.filenameToRow
+ iRow = repeatDetectionResults.filenameToRow[instance.filename]
  row = detectionResults.iloc[iRow]
  rowDetections = row['detections']
  detectionToModify = rowDetections[instance.iDetection]
@@ -800,7 +861,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  # If we're also writing output...
  if outputFilename is not None and len(outputFilename) > 0:
- write_api_results(detectionResults, RepeatDetectionResults.otherFields,
+ write_api_results(detectionResults, repeatDetectionResults.otherFields,
  outputFilename)

  print(
@@ -809,7 +870,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  return detectionResults

- # ...def update_detection_table(RepeatDetectionResults,options)
+ # ...def update_detection_table(...)


  def render_sample_image_for_detection(detection,filteringDir,options):
@@ -845,12 +906,12 @@ def render_sample_image_for_detection(detection,filteringDir,options):

  try:

+ im = open_image(inputFullPath)
+
  # Should we render (typically in a very light color) detections
  # *other* than the one we're highlighting here?
  if options.bRenderOtherDetections:
-
- im = open_image(inputFullPath)
-
+
  # Optionally resize the output image
  if (options.maxOutputImageWidth is not None) and \
  (im.size[0] > options.maxOutputImageWidth):
@@ -896,6 +957,10 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  render_bounding_box(detection, inputFullPath, outputFullPath,
  lineWidth=options.lineThickness, expansion=options.boxExpansion)

+ # ...if we are/aren't rendering other bounding boxes
+
+ # If we're rendering detection tiles, we'll re-load and re-write the image we
+ # just wrote to outputFullPath
  if options.bRenderDetectionTiles:

  assert not is_sas_url(options.imageBase), "Can't render detection tiles from SAS URLs"
@@ -903,6 +968,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  if options.detectionTilesPrimaryImageWidth is not None:
  primaryImageWidth = options.detectionTilesPrimaryImageWidth
  else:
+ # "im" may be a resized version of the original image, if we've already run
+ # the code to render other bounding boxes.
  primaryImageWidth = im.size[0]

  if options.detectionTilesCroppedGridWidth <= 1.0:
@@ -926,7 +993,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  secondaryImageFilenameList[0:options.detectionTilesMaxCrops]
  secondaryImageBoundingBoxList = \
  secondaryImageBoundingBoxList[0:options.detectionTilesMaxCrops]
-
+
+ # This will over-write the image we've already written to outputFullPath
  render_images_with_thumbnails.render_images_with_thumbnails(
  primary_image_filename=outputFullPath,
  primary_image_width=primaryImageWidth,
@@ -940,16 +1008,20 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  # bDetectionTilesCroppedGridWidth = 0.6
  # bDetectionTilesPrimaryImageLocation='right'

- # ...if we are/aren't rendering other bounding boxes
+ # ...if we are/aren't rendering detection tiles

  except Exception as e:
- print('Warning: error rendering bounding box from {} to {}: {}'.format(
- inputFullPath,outputFullPath,e))
+
+ stack_trace = traceback.format_exc()
+ print('Warning: error rendering bounding box from {} to {}: {} ({})'.format(
+ inputFullPath,outputFullPath,e,stack_trace))
  if options.bFailOnRenderError:
  raise
-

- #%% Main function
+ # ...def render_sample_image_for_detection(...)
+
+
+ #%% Main entry point

  def find_repeat_detections(inputFilename, outputFilename=None, options=None):

@@ -1002,9 +1074,9 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # Load file to a pandas dataframe. Also populates 'max_detection_conf', even if it's
  # not present in the .json file.
-
  detectionResults, otherFields = load_api_results(inputFilename, normalize_paths=True,
- filename_replacements=options.filenameReplacements)
+ filename_replacements=options.filenameReplacements,
+ force_forward_slashes=True)
  toReturn.detectionResults = detectionResults
  toReturn.otherFields = otherFields

@@ -1028,7 +1100,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)


- ##%% Separate files into directories
+ ##%% Separate files into locations

  # This will be a map from a directory name to smaller data frames
  rowsByDirectory = {}
@@ -1036,12 +1108,12 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  # This is a mapping back into the rows of the original table
  filenameToRow = {}

- print('Separating files into directories...')
+ print('Separating images into locations...')

  nCustomDirReplacements = 0

  # iRow = 0; row = detectionResults.iloc[0]
- for iRow, row in detectionResults.iterrows():
+ for iRow, row in tqdm(detectionResults.iterrows(),total=len(detectionResults)):

  relativePath = row['file']

@@ -1079,7 +1151,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  if options.customDirNameFunction is not None:
  print('Custom dir name function made {} replacements (of {} images)'.format(
  nCustomDirReplacements,len(detectionResults)))
-
+
  # Convert lists of rows to proper DataFrames
  dirs = list(rowsByDirectory.keys())
  for d in dirs:
@@ -1088,11 +1160,10 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  toReturn.rowsByDirectory = rowsByDirectory
  toReturn.filenameToRow = filenameToRow

- print('Finished separating {} files into {} directories'.format(len(detectionResults),
- len(rowsByDirectory)))
-
+ print('Finished separating {} files into {} locations'.format(len(detectionResults),
+ len(rowsByDirectory)))

- ##% Look for matches (or load them from file)
+ ##% Look for repeat detections (or load them from file)

  dirsToSearch = list(rowsByDirectory.keys())
  if options.debugMaxDir > 0:
@@ -1119,6 +1190,11 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  allCandidateDetections = [None] * len(dirsToSearch)

+ # If we serialize results to intermediate files, we need to remove slashes from
+ # location names; we store mappings here.
+ normalized_location_name_to_location_name = None
+ location_name_to_normalized_location_name = None
+
  if not options.bParallelizeComparisons:

  options.pbar = None
  options.pbar = None
@@ -1136,7 +1212,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Pool of {} requested, but only {} folders available, reducing pool to {}'.\
  format(n_workers,len(dirNameAndRows),len(dirNameAndRows)))
  n_workers = len(dirNameAndRows)
-
+
  if options.parallelizationUsesThreads:
  pool = ThreadPool(n_workers); poolstring = 'threads'
  else:
@@ -1144,24 +1220,96 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Starting comparison pool with {} {}'.format(n_workers,poolstring))


- # We get slightly nicer progress bar behavior using threads, by passing a pbar
- # object and letting it get updated. We can't serialize this object across
- # processes.
- if options.parallelizationUsesThreads:
- options.pbar = tqdm(total=len(dirNameAndRows))
- allCandidateDetections = list(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows))
- else:
+ assert options.pass_detections_to_processes_method in ('file','memory'), \
+ 'Unrecognized IPC mechanism: {}'.format(options.pass_detections_to_processes_method)
+
+ # ** Experimental **
+ #
+ # Rather than passing detections and results around in memory, write detections and
+ # results for each worker to intermediate files. May improve performance for very large
+ # results sets that exceed working memory.
+ if options.pass_detections_to_processes_method == 'file':
+
+ ##%% Convert location names to normalized names we can write to files
+
+ normalized_location_name_to_location_name = {}
+ for dir_name in dirsToSearch:
+ normalized_location_name = flatten_path(dir_name)
+ assert normalized_location_name not in normalized_location_name_to_location_name, \
+ 'Redundant location name {}, can\'t serialize to intermediate files'.format(
+ dir_name)
+ normalized_location_name_to_location_name[normalized_location_name] = dir_name
+
+ location_name_to_normalized_location_name = \
+ invert_dictionary(normalized_location_name_to_location_name)
+
+
+ ##%% Write results to files for each location
+
+ print('Writing results to intermediate files')
+
+ intermediate_json_file_folder = os.path.join(options.outputBase,'intermediate_results')
+ os.makedirs(intermediate_json_file_folder,exist_ok=True)
+
+ # i_location = 0; location_info = dirNameAndRows[0]
+ dirNameAndIntermediateFile = []
+
+ # i_location = 0; location_info = dirNameAndRows[i_location]
+ for i_location, location_info in tqdm(enumerate(dirNameAndRows)):
+
+ location_name = location_info[0]
+ assert location_name in location_name_to_normalized_location_name
+ normalized_location_name = location_name_to_normalized_location_name[location_name]
+ intermediate_results_file = os.path.join(intermediate_json_file_folder,
+ normalized_location_name + '.csv')
+ detections_table_this_location = location_info[1]
+ detections_table_this_location.to_csv(intermediate_results_file,header=True,index=False)
+ dirNameAndIntermediateFile.append((location_name,intermediate_results_file))
+
+
+ ##%% Find detections in each directory
+
  options.pbar = None
- allCandidateDetections = list(tqdm(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows)))
+ allCandidateDetectionFiles = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndIntermediateFile))
+
+
+ ##%% Load into a combined list of candidate detections
+
+ allCandidateDetections = []
+
+ # candidate_detection_file = allCandidateDetectionFiles[0]
+ for candidate_detection_file in allCandidateDetectionFiles:
+ s = open(candidate_detection_file, 'r').read()
+ candidate_detections_this_file = jsonpickle.decode(s)
+ allCandidateDetections.append(candidate_detections_this_file)
+
+
+ ##%% Clean up intermediate files
+
+ shutil.rmtree(intermediate_json_file_folder)
+
+ # If we're passing things around in memory, rather than via intermediate files
+ else:
+
+ # We get slightly nicer progress bar behavior using threads, by passing a pbar
+ # object and letting it get updated. We can't serialize this object across
+ # processes.
+ if options.parallelizationUsesThreads:
+ options.pbar = tqdm(total=len(dirNameAndRows))
+ allCandidateDetections = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows))
+ else:
+ options.pbar = None
+ allCandidateDetections = list(tqdm(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows)))

  print('\nFinished looking for similar detections')


- ##%% Find suspicious locations based on match results
+ ##%% Mark suspicious locations based on match results

- print('Searching for repeat detections...')
+ print('Marking repeat detections...')

  nImagesWithSuspiciousDetections = 0
  nSuspiciousDetections = 0
@@ -1202,7 +1350,8 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # ...for each directory

- print('Finished searching for repeat detections')
+ print('Finished marking repeat detections')
+
  print('Found {} unique detections on {} images that are suspicious'.format(
  nSuspiciousDetections, nImagesWithSuspiciousDetections))

@@ -1371,8 +1520,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  detection.sampleImageDetections = None

  # Write out the detection index
- detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
- jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+ detectionIndexFileName = os.path.join(filteringDir, detection_index_file_name_base)

  # Prepare the data we're going to write to the detection index file
  detectionInfo = {}
@@ -1396,4 +1544,4 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  return toReturn

- # ...find_repeat_detections()
+ # ...def find_repeat_detections()
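
The largest functional change in this file is the experimental 'file' IPC mode for repeat-detection elimination. As a rough illustration only, the sketch below is inferred from the option and function names visible in the diff above, not taken from the package documentation; the file paths are placeholders.

from api.batch_processing.postprocessing.repeat_detection_elimination import repeat_detections_core

options = repeat_detections_core.RepeatDetectionOptions()
options.outputBase = '/tmp/rde_scratch'        # intermediate per-location files go under outputBase
options.bParallelizeComparisons = True
options.parallelizationUsesThreads = False     # 'file' mode targets process-based worker pools
options.nWorkers = 10

# New (and experimental) in 5.0.7: serialize per-location detection tables to intermediate
# .csv files and read worker results back from .json files, instead of passing everything
# through worker parameters ('memory', which remains the default).
options.pass_detections_to_processes_method = 'file'

results = repeat_detections_core.find_repeat_detections(
    inputFilename='megadetector_output.json',          # placeholder path to MD batch results
    outputFilename='megadetector_output_filtered.json',
    options=options)

Per the comment added alongside the new option, 'file' is still experimental, so 'memory' remains the default behavior.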