megadetector 5.0.5-py3-none-any.whl → 5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (132)
  1. api/batch_processing/data_preparation/manage_local_batch.py +302 -263
  2. api/batch_processing/data_preparation/manage_video_batch.py +81 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
  5. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  6. api/batch_processing/postprocessing/load_api_results.py +56 -70
  7. api/batch_processing/postprocessing/md_to_coco.py +1 -1
  8. api/batch_processing/postprocessing/md_to_labelme.py +2 -1
  9. api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  11. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  12. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  13. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
  14. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  15. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  16. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
  17. classification/prepare_classification_script.py +191 -191
  18. data_management/coco_to_yolo.py +68 -45
  19. data_management/databases/integrity_check_json_db.py +7 -5
  20. data_management/generate_crops_from_cct.py +3 -3
  21. data_management/get_image_sizes.py +8 -6
  22. data_management/importers/add_timestamps_to_icct.py +79 -0
  23. data_management/importers/animl_results_to_md_results.py +160 -0
  24. data_management/importers/auckland_doc_test_to_json.py +4 -4
  25. data_management/importers/auckland_doc_to_json.py +1 -1
  26. data_management/importers/awc_to_json.py +5 -5
  27. data_management/importers/bellevue_to_json.py +5 -5
  28. data_management/importers/carrizo_shrubfree_2018.py +5 -5
  29. data_management/importers/carrizo_trail_cam_2017.py +5 -5
  30. data_management/importers/cct_field_adjustments.py +2 -3
  31. data_management/importers/channel_islands_to_cct.py +4 -4
  32. data_management/importers/ena24_to_json.py +5 -5
  33. data_management/importers/helena_to_cct.py +10 -10
  34. data_management/importers/idaho-camera-traps.py +12 -12
  35. data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
  36. data_management/importers/jb_csv_to_json.py +4 -4
  37. data_management/importers/missouri_to_json.py +1 -1
  38. data_management/importers/noaa_seals_2019.py +1 -1
  39. data_management/importers/pc_to_json.py +5 -5
  40. data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
  41. data_management/importers/prepare_zsl_imerit.py +5 -5
  42. data_management/importers/rspb_to_json.py +4 -4
  43. data_management/importers/save_the_elephants_survey_A.py +5 -5
  44. data_management/importers/save_the_elephants_survey_B.py +6 -6
  45. data_management/importers/snapshot_safari_importer.py +9 -9
  46. data_management/importers/snapshot_serengeti_lila.py +9 -9
  47. data_management/importers/timelapse_csv_set_to_json.py +5 -7
  48. data_management/importers/ubc_to_json.py +4 -4
  49. data_management/importers/umn_to_json.py +4 -4
  50. data_management/importers/wellington_to_json.py +1 -1
  51. data_management/importers/wi_to_json.py +2 -2
  52. data_management/importers/zamba_results_to_md_results.py +181 -0
  53. data_management/labelme_to_coco.py +35 -7
  54. data_management/labelme_to_yolo.py +229 -0
  55. data_management/lila/add_locations_to_island_camera_traps.py +1 -1
  56. data_management/lila/add_locations_to_nacti.py +147 -0
  57. data_management/lila/create_lila_blank_set.py +474 -0
  58. data_management/lila/create_lila_test_set.py +2 -1
  59. data_management/lila/create_links_to_md_results_files.py +106 -0
  60. data_management/lila/download_lila_subset.py +46 -21
  61. data_management/lila/generate_lila_per_image_labels.py +23 -14
  62. data_management/lila/get_lila_annotation_counts.py +17 -11
  63. data_management/lila/lila_common.py +14 -11
  64. data_management/lila/test_lila_metadata_urls.py +116 -0
  65. data_management/ocr_tools.py +829 -0
  66. data_management/resize_coco_dataset.py +13 -11
  67. data_management/yolo_output_to_md_output.py +84 -12
  68. data_management/yolo_to_coco.py +38 -20
  69. detection/process_video.py +36 -14
  70. detection/pytorch_detector.py +23 -8
  71. detection/run_detector.py +76 -19
  72. detection/run_detector_batch.py +178 -63
  73. detection/run_inference_with_yolov5_val.py +326 -57
  74. detection/run_tiled_inference.py +153 -43
  75. detection/video_utils.py +34 -8
  76. md_utils/ct_utils.py +172 -1
  77. md_utils/md_tests.py +372 -51
  78. md_utils/path_utils.py +167 -39
  79. md_utils/process_utils.py +26 -7
  80. md_utils/split_locations_into_train_val.py +215 -0
  81. md_utils/string_utils.py +10 -0
  82. md_utils/url_utils.py +0 -2
  83. md_utils/write_html_image_list.py +9 -26
  84. md_visualization/plot_utils.py +12 -8
  85. md_visualization/visualization_utils.py +106 -7
  86. md_visualization/visualize_db.py +16 -8
  87. md_visualization/visualize_detector_output.py +208 -97
  88. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
  89. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
  90. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  91. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  92. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  93. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  94. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  95. taxonomy_mapping/species_lookup.py +33 -13
  96. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  97. api/synchronous/api_core/yolov5/detect.py +0 -252
  98. api/synchronous/api_core/yolov5/export.py +0 -607
  99. api/synchronous/api_core/yolov5/hubconf.py +0 -146
  100. api/synchronous/api_core/yolov5/models/__init__.py +0 -0
  101. api/synchronous/api_core/yolov5/models/common.py +0 -738
  102. api/synchronous/api_core/yolov5/models/experimental.py +0 -104
  103. api/synchronous/api_core/yolov5/models/tf.py +0 -574
  104. api/synchronous/api_core/yolov5/models/yolo.py +0 -338
  105. api/synchronous/api_core/yolov5/train.py +0 -670
  106. api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
  107. api/synchronous/api_core/yolov5/utils/activations.py +0 -103
  108. api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
  109. api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
  110. api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
  111. api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
  112. api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
  113. api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
  114. api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
  115. api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
  116. api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
  117. api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
  118. api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
  119. api/synchronous/api_core/yolov5/utils/general.py +0 -1018
  120. api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
  121. api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
  122. api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
  123. api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
  124. api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
  125. api/synchronous/api_core/yolov5/utils/loss.py +0 -234
  126. api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
  127. api/synchronous/api_core/yolov5/utils/plots.py +0 -489
  128. api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
  129. api/synchronous/api_core/yolov5/val.py +0 -394
  130. md_utils/matlab_porting_tools.py +0 -97
  131. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  132. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
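The line-by-line diff reproduced below corresponds to item 13 above, api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py (+227 -75); the per-file diffs for the remaining files are not included in this excerpt.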
@@ -14,7 +14,10 @@ import warnings
  import sklearn.cluster
  import numpy as np
  import jsonpickle
+ import traceback
  import pandas as pd
+ import json
+ import shutil

  from tqdm import tqdm
  from operator import attrgetter
@@ -35,6 +38,8 @@ from api.batch_processing.postprocessing.postprocess_batch_results import relati
  from md_visualization.visualization_utils import open_image, render_detection_bounding_boxes
  from md_visualization import render_images_with_thumbnails
  from md_visualization import visualization_utils as vis_utils
+ from md_utils.path_utils import flatten_path
+ from md_utils.ct_utils import invert_dictionary

  # "PIL cannot read EXIF metainfo for the images"
  warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
@@ -42,10 +47,12 @@ warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
  # "Metadata Warning, tag 256 had too many entries: 42, expected 1"
  warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)

+ jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+

  #%% Constants

- DETECTION_INDEX_FILE_NAME = 'detectionIndex.json'
+ detection_index_file_name_base = 'detectionIndex.json'


  #%% Classes
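The hunk above moves the jsonpickle encoder configuration to module scope (it was previously set just before writing detectionIndex.json; see the hunk near the end of this diff). A minimal sketch of what those encoder options do, using only jsonpickle calls that appear in this diff; the Dummy class is invented for illustration:

    import jsonpickle

    # Same settings as the module-level call added in this diff
    jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)

    class Dummy:
        def __init__(self):
            self.b = 2
            self.a = 1

    # Keys come out sorted and indented by one space; make_refs=False (used later in
    # this file when writing per-location results) inlines repeated objects rather
    # than emitting py/id references.
    print(jsonpickle.encode(Dummy(), make_refs=False))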
@@ -74,26 +81,32 @@ class RepeatDetectionOptions:
  # How many occurrences of a single location (as defined by the IOU threshold)
  # are required before we declare it suspicious?
  occurrenceThreshold = 20
+
+ # Ignore "suspicious" detections smaller than some size
+ minSuspiciousDetectionSize = 0.0

  # Ignore "suspicious" detections larger than some size; these are often animals
  # taking up the whole image. This is expressed as a fraction of the image size.
  maxSuspiciousDetectionSize = 0.2

- # Ignore "suspicious" detections smaller than some size
- minSuspiciousDetectionSize = 0.0
-
  # Ignore folders with more than this many images in them
  maxImagesPerFolder = None

  # A list of classes we don't want to treat as suspicious. Each element is an int.
  excludeClasses = [] # [annotation_constants.detector_bbox_category_name_to_id['person']]

+ # For very large sets of results, passing chunks of results to and from workers as
+ # parameters ('memory') can be memory-intensive, so we can serialize to intermediate
+ # files instead ('file').
+ #
+ # The use of 'file' here is still experimental.
+ pass_detections_to_processes_method = 'memory'
+
  nWorkers = 10

+ # Should we use threads or processes for parallelization?
  parallelizationUsesThreads = True

- viz_target_width = 800
-
  # Load detections from a filter file rather than finding them from the detector output

  # .json file containing detections, generally this is the detectionIndex.json file in
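For context, a hedged sketch of how a caller might set the reordered and newly added options above. The attribute names are taken from this diff; the values are arbitrary examples, and the import path is an assumption based on the file list at the top of this page:

    # Illustrative only; the import path is assumed from the package layout shown above.
    from api.batch_processing.postprocessing.repeat_detection_elimination.repeat_detections_core \
        import RepeatDetectionOptions

    options = RepeatDetectionOptions()
    options.minSuspiciousDetectionSize = 0.0   # fraction of image area
    options.maxSuspiciousDetectionSize = 0.2   # boxes larger than this are ignored
    options.excludeClasses = []                # e.g. [2] to never flag person boxes (MD category 2)
    options.nWorkers = 10
    options.parallelizationUsesThreads = True
    # New (experimental) in this release: pass per-location work through intermediate
    # files instead of in memory.
    options.pass_detections_to_processes_method = 'memory'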
@@ -121,6 +134,10 @@ class RepeatDetectionOptions:
  bParallelizeComparisons = True
  bParallelizeRendering = True

+ # If this is False (default), a detection from class A is not considered to be "the same"
+ # as a detection from class B, even if they're at the same location.
+ categoryAgnosticComparisons = False
+
  # Determines whether bounding-box rendering errors (typically network errors) should
  # be treated as failures
  bFailOnRenderError = False
@@ -209,7 +226,7 @@ class RepeatDetectionResults:
  """

  # The data table (Pandas DataFrame), as loaded from the input json file via
- # load_api_results()
+ # load_api_results(). Has columns ['file', 'detections','failure'].
  detectionResults = None

  # The other fields in the input json file, loaded via load_api_results()
@@ -309,7 +326,7 @@ class DetectionLocation:
  return detection


- #%% Helper functions
+ #%% Support functions

  def enumerate_images(dirName,outputFileName=None):
  """
@@ -343,7 +360,7 @@ def render_bounding_box(detection, inputFileName, outputFileName, lineWidth=5,


  def detection_rect_to_rtree_rect(detection_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  l = detection_rect[0]
  b = detection_rect[1]
  r = detection_rect[0] + detection_rect[2]
@@ -352,7 +369,7 @@ def detection_rect_to_rtree_rect(detection_rect):


  def rtree_rect_to_detection_rect(rtree_rect):
- # We store detetions as x/y/w/h, rtree and pyqtree use l/b/r/t
+ # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
  x = rtree_rect[0]
  y = rtree_rect[1]
  w = rtree_rect[2] - rtree_rect[0]
@@ -360,12 +377,11 @@ def rtree_rect_to_detection_rect(rtree_rect):
  return (x,y,w,h)


- #%% Sort a list of candidate detections to make them visually easier to review
-
  def sort_detections_for_directory(candidateDetections,options):
  """
  candidateDetections is a list of DetectionLocation objects. Sorts them to
- put nearby detections next to each other, for easier visual review.
+ put nearby detections next to each other, for easier visual review. Returns
+ a sorted copy of candidateDetections, does not sort in-place.
  """

  if len(candidateDetections) <= 1 or options.smartSort is None:
@@ -458,13 +474,24 @@ def sort_detections_for_directory(candidateDetections,options):
  raise ValueError('Unrecognized sort method {}'.format(
  options.smartSort))

-
- #%% Look for matches (one directory)
+ # ...def sort_detections_for_directory(...)
+

  def find_matches_in_directory(dirNameAndRows, options):
  """
  dirNameAndRows is a tuple of (name,rows).

+ "name" is a location name, typically a folder name.
+
+ "rows" is a Pandas dataframe with one row per image in this location, with columns:
+
+ * 'file': relative file name
+ * 'detections': a list of MD detection objects, i.e. dicts with keys ['category','conf','bbox']
+ * 'max_detection_conf': maximum confidence of any detection, in any category
+
+ "rows" can also point to a .csv file, in which case the detection table will be read from that
+ .csv file, and results will be written to a .csv file rather than being returned.
+
  Find all unique detections in this directory.

  Returns a list of DetectionLocation objects.
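The expanded docstring above describes the expected shape of the 'rows' argument; a small illustrative dataframe in that shape (file names and values invented):

    import pandas as pd

    # One row per image; 'detections' holds MD-style dicts with 'category', 'conf',
    # and 'bbox' ([x, y, width, height], normalized to the image size).
    rows = pd.DataFrame([
        {
            'file': 'loc01/IMG_0001.JPG',
            'detections': [{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}],
            'max_detection_conf': 0.92
        },
        {
            'file': 'loc01/IMG_0002.JPG',
            'detections': [],
            'max_detection_conf': 0.0
        }
    ])

    dirNameAndRows = ('loc01', rows)
    # candidateDetections = find_matches_in_directory(dirNameAndRows, options)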
@@ -476,11 +503,21 @@ def find_matches_in_directory(dirNameAndRows, options):
  # Create a tree to store candidate detections
  candidateDetectionsIndex = pyqtree.Index(bbox=(-0.1,-0.1,1.1,1.1))

- assert len(dirNameAndRows) == 2
- assert isinstance(dirNameAndRows[0],str)
- dirName = dirNameAndRows[0]
+ assert len(dirNameAndRows) == 2, 'find_matches_in_directory: invalid input'
+ assert isinstance(dirNameAndRows[0],str), 'find_matches_in_directory: invalid location name'
+ dirName = dirNameAndRows[0]
  rows = dirNameAndRows[1]
-
+
+ detections_loaded_from_csv_file = None
+
+ if isinstance(rows,str):
+ detections_loaded_from_csv_file = rows
+ print('Loading results for location {} from {}'.format(
+ dirName,detections_loaded_from_csv_file))
+ rows = pd.read_csv(detections_loaded_from_csv_file)
+ # Pandas writes out detections out as strings, convert them back to lists
+ rows['detections'] = rows['detections'].apply(lambda s: json.loads(s.replace('\'','"')))
+
  if options.maxImagesPerFolder is not None and len(rows) > options.maxImagesPerFolder:
  print('Ignoring directory {} because it has {} images (limit set to {})'.format(
  dirName,len(rows),options.maxImagesPerFolder))
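The new CSV branch above relies on pandas writing the 'detections' column as a Python-repr string, then converting it back with a quote swap and json.loads. A standalone sketch of that round trip (illustrative; the quote swap assumes no apostrophes inside the serialized values):

    import json
    import pandas as pd

    df = pd.DataFrame([{
        'file': 'loc01/IMG_0001.JPG',
        'detections': [{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}]
    }])

    df.to_csv('loc01.csv', header=True, index=False)

    # After reading back, 'detections' is a string like
    # "[{'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}]"
    df2 = pd.read_csv('loc01.csv')
    df2['detections'] = df2['detections'].apply(lambda s: json.loads(s.replace('\'', '"')))

    assert df2['detections'].iloc[0][0]['conf'] == 0.92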
@@ -535,7 +572,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  # }
  detections = row['detections']
  if isinstance(detections,float):
- assert isinstance(row['failure'],str)
+ assert isinstance(row['failure'],str), 'Expected failure indicator'
  print('Skipping failed image {} ({})'.format(filename,row['failure']))
  continue

@@ -550,8 +587,9 @@ def find_matches_in_directory(dirNameAndRows, options):
  print('Skipping detection {}'.format(iDetection))
  continue

- assert 'category' in detection and 'conf' in detection and \
- 'bbox' in detection
+ assert 'category' in detection and \
+ 'conf' in detection and \
+ 'bbox' in detection, 'Illegal detection'

  confidence = detection['conf']

@@ -568,7 +606,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  continue

  # Optionally exclude some classes from consideration as suspicious
- if len(options.excludeClasses) > 0:
+ if (options.excludeClasses is not None) and (len(options.excludeClasses) > 0):
  iClass = int(detection['category'])
  if iClass in options.excludeClasses:
  continue
@@ -584,8 +622,12 @@ def find_matches_in_directory(dirNameAndRows, options):

  area = h * w

+ if area < 0:
+ print('Warning: negative-area bounding box for file {}'.format(filename))
+ area = abs(area); h = abs(h); w = abs(w)
+
  assert area >= 0.0 and area <= 1.0, \
- 'Illegal bounding box area {}'.format(area)
+ 'Illegal bounding box area {} in image {}'.format(area,filename)

  if area < options.minSuspiciousDetectionSize:
  continue
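The size checks above operate on normalized box areas. A small worked example of how a detection's area relates to the min/max thresholds (values invented; the defaults match the options shown earlier in this diff):

    # MD bboxes are [x, y, width, height], normalized to image dimensions, so
    # w * h is the fraction of the image covered by the box.
    detection = {'category': '1', 'conf': 0.85, 'bbox': [0.05, 0.10, 0.40, 0.60]}

    x, y, w, h = detection['bbox']
    area = h * w   # 0.24, i.e. 24% of the image

    minSuspiciousDetectionSize = 0.0
    maxSuspiciousDetectionSize = 0.2

    # With these defaults, this box is too large to be treated as suspicious.
    is_candidate = (area >= minSuspiciousDetectionSize) and (area <= maxSuspiciousDetectionSize)
    print(is_candidate)  # False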
@@ -615,7 +657,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  overlappingCandidateDetections):

  # Don't match across categories
- if candidate.category != category:
+ if (candidate.category != category) and (not (options.categoryAgnosticComparisons)):
  continue

  # Is this a match?
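The category check above runs inside a loop over candidates returned by a pyqtree intersection query; candidate locations are stored in the quadtree keyed by their l/b/r/t rectangle (see detection_rect_to_rtree_rect earlier in this diff), so each new detection is only compared against candidates it overlaps. A self-contained sketch of that insert/query pattern, using the pyqtree calls visible in this diff plus intersect() for the lookup:

    import pyqtree

    # Detections are x/y/w/h; rtree/pyqtree rectangles are l/b/r/t.
    def detection_rect_to_rtree_rect(rect):
        x, y, w, h = rect
        return (x, y, x + w, y + h)

    index = pyqtree.Index(bbox=(-0.1, -0.1, 1.1, 1.1))

    candidate_bbox = [0.10, 0.20, 0.30, 0.40]
    index.insert(item={'category': '1', 'bbox': candidate_bbox},
                 bbox=detection_rect_to_rtree_rect(candidate_bbox))

    # A later detection at almost the same location finds the candidate via an
    # intersection query; it would then be compared by IOU, and by category unless
    # categoryAgnosticComparisons is set.
    query_bbox = [0.11, 0.21, 0.30, 0.40]
    overlapping = index.intersect(detection_rect_to_rtree_rect(query_bbox))
    print(len(overlapping))  # 1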
@@ -649,9 +691,7 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidate = DetectionLocation(instance=instance,
  detection=detection, relativeDir=dirName,
  category=category, id=i_iteration)
-
- # candidateDetections.append(candidate)
-
+
  # pyqtree
  candidateDetectionsIndex.insert(item=candidate,bbox=rtree_rect)

@@ -669,20 +709,45 @@ def find_matches_in_directory(dirNameAndRows, options):
  candidateDetections.sort(
  key=lambda x: x.id, reverse=False)

- return candidateDetections
-
- # ...def find_matches_in_directory(dirName)
+ if detections_loaded_from_csv_file is not None:
+ location_results_file = \
+ os.path.splitext(detections_loaded_from_csv_file)[0] + \
+ '_results.json'
+ print('Writing results for location {} to {}'.format(
+ dirName,location_results_file))
+ s = jsonpickle.encode(candidateDetections,make_refs=False)
+ with open(location_results_file,'w') as f:
+ f.write(s)
+ # json.dump(candidateDetections,f,indent=1)
+ return location_results_file
+ else:
+ return candidateDetections

+ # ...def find_matches_in_directory(...)

- #%% Update the detection table based on suspicious results, write .csv output

- def update_detection_table(RepeatDetectionResults, options, outputFilename=None):
+ def update_detection_table(repeatDetectionResults, options, outputFilename=None):
+ """
+ Changes confidence values in repeatDetectionResults.detectionResults so that detections
+ deemed to be possible false positives are given negative confidence values.
+
+ repeatDetectionResults is an object of type RepeatDetectionResults, with a pandas
+ dataframe (detectionResults) containing all the detections loaded from the .json file,
+ and a list of detections for each location (suspiciousDetections) that are deemed to
+ be suspicious.
+
+ returns the modified pandas dataframe (repeatDetectionResults.detectionResults), but
+ also modifies it in place.
+ """

- detectionResults = RepeatDetectionResults.detectionResults
+ # This is the pandas dataframe that contains actual detection results.
+ #
+ # Has fields ['file', 'detections','failure'].
+ detectionResults = repeatDetectionResults.detectionResults

  # An array of length nDirs, where each element is a list of DetectionLocation
  # objects for that directory that have been flagged as suspicious
- suspiciousDetectionsByDirectory = RepeatDetectionResults.suspiciousDetections
+ suspiciousDetectionsByDirectory = repeatDetectionResults.suspiciousDetections

  nBboxChanges = 0

@@ -711,8 +776,8 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
  # if iou < options.iouThreshold:
  # print('IOU warning: {},{}'.format(iou,options.iouThreshold))

- assert instance.filename in RepeatDetectionResults.filenameToRow
- iRow = RepeatDetectionResults.filenameToRow[instance.filename]
+ assert instance.filename in repeatDetectionResults.filenameToRow
+ iRow = repeatDetectionResults.filenameToRow[instance.filename]
  row = detectionResults.iloc[iRow]
  rowDetections = row['detections']
  detectionToModify = rowDetections[instance.iDetection]
@@ -796,7 +861,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  # If we're also writing output...
  if outputFilename is not None and len(outputFilename) > 0:
- write_api_results(detectionResults, RepeatDetectionResults.otherFields,
+ write_api_results(detectionResults, repeatDetectionResults.otherFields,
  outputFilename)

  print(
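The hunk above writes the updated table back out via write_api_results(); per the docstring added earlier, repeat detections are kept but given negative confidence values. A hedged sketch of how a consumer of that output file might drop them afterwards (the file name is invented; the structure is the standard MD batch-output format):

    import json

    # Stand-in for whatever file update_detection_table() / write_api_results() produced
    with open('filtered_output.json', 'r') as f:
        results = json.load(f)

    for im in results['images']:
        if ('detections' not in im) or (im['detections'] is None):
            continue
        # Detections flagged as repeats carry negative confidence values
        im['detections'] = [d for d in im['detections'] if d['conf'] >= 0]

    with open('filtered_output_cleaned.json', 'w') as f:
        json.dump(results, f, indent=1)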
@@ -805,7 +870,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)

  return detectionResults

- # ...def update_detection_table(RepeatDetectionResults,options)
+ # ...def update_detection_table(...)


  def render_sample_image_for_detection(detection,filteringDir,options):
@@ -841,12 +906,12 @@ def render_sample_image_for_detection(detection,filteringDir,options):

  try:

+ im = open_image(inputFullPath)
+
  # Should we render (typically in a very light color) detections
  # *other* than the one we're highlighting here?
  if options.bRenderOtherDetections:
-
- im = open_image(inputFullPath)
-
+
  # Optionally resize the output image
  if (options.maxOutputImageWidth is not None) and \
  (im.size[0] > options.maxOutputImageWidth):
@@ -892,6 +957,10 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  render_bounding_box(detection, inputFullPath, outputFullPath,
  lineWidth=options.lineThickness, expansion=options.boxExpansion)

+ # ...if we are/aren't rendering other bounding boxes
+
+ # If we're rendering detection tiles, we'll re-load and re-write the image we
+ # just wrote to outputFullPath
  if options.bRenderDetectionTiles:

  assert not is_sas_url(options.imageBase), "Can't render detection tiles from SAS URLs"
@@ -899,6 +968,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  if options.detectionTilesPrimaryImageWidth is not None:
  primaryImageWidth = options.detectionTilesPrimaryImageWidth
  else:
+ # "im" may be a resized version of the original image, if we've already run
+ # the code to render other bounding boxes.
  primaryImageWidth = im.size[0]

  if options.detectionTilesCroppedGridWidth <= 1.0:
@@ -922,7 +993,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  secondaryImageFilenameList[0:options.detectionTilesMaxCrops]
  secondaryImageBoundingBoxList = \
  secondaryImageBoundingBoxList[0:options.detectionTilesMaxCrops]
-
+
+ # This will over-write the image we've already written to outputFullPath
  render_images_with_thumbnails.render_images_with_thumbnails(
  primary_image_filename=outputFullPath,
  primary_image_width=primaryImageWidth,
@@ -936,16 +1008,20 @@ def render_sample_image_for_detection(detection,filteringDir,options):
  # bDetectionTilesCroppedGridWidth = 0.6
  # bDetectionTilesPrimaryImageLocation='right'

- # ...if we are/aren't rendering other bounding boxes
+ # ...if we are/aren't rendering detection tiles

  except Exception as e:
- print('Warning: error rendering bounding box from {} to {}: {}'.format(
- inputFullPath,outputFullPath,e))
+
+ stack_trace = traceback.format_exc()
+ print('Warning: error rendering bounding box from {} to {}: {} ({})'.format(
+ inputFullPath,outputFullPath,e,stack_trace))
  if options.bFailOnRenderError:
  raise
-

- #%% Main function
+ # ...def render_sample_image_for_detection(...)
+
+
+ #%% Main entry point

  def find_repeat_detections(inputFilename, outputFilename=None, options=None):

@@ -998,9 +1074,9 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # Load file to a pandas dataframe. Also populates 'max_detection_conf', even if it's
  # not present in the .json file.
-
  detectionResults, otherFields = load_api_results(inputFilename, normalize_paths=True,
- filename_replacements=options.filenameReplacements)
+ filename_replacements=options.filenameReplacements,
+ force_forward_slashes=True)
  toReturn.detectionResults = detectionResults
  toReturn.otherFields = otherFields

@@ -1024,7 +1100,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)


- ##%% Separate files into directories
+ ##%% Separate files into locations

  # This will be a map from a directory name to smaller data frames
  rowsByDirectory = {}
@@ -1032,12 +1108,12 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  # This is a mapping back into the rows of the original table
  filenameToRow = {}

- print('Separating files into directories...')
+ print('Separating images into locations...')

  nCustomDirReplacements = 0

  # iRow = 0; row = detectionResults.iloc[0]
- for iRow, row in detectionResults.iterrows():
+ for iRow, row in tqdm(detectionResults.iterrows(),total=len(detectionResults)):

  relativePath = row['file']

@@ -1075,7 +1151,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  if options.customDirNameFunction is not None:
  print('Custom dir name function made {} replacements (of {} images)'.format(
  nCustomDirReplacements,len(detectionResults)))
-
+
  # Convert lists of rows to proper DataFrames
  dirs = list(rowsByDirectory.keys())
  for d in dirs:
@@ -1084,11 +1160,10 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  toReturn.rowsByDirectory = rowsByDirectory
  toReturn.filenameToRow = filenameToRow

- print('Finished separating {} files into {} directories'.format(len(detectionResults),
- len(rowsByDirectory)))
-
+ print('Finished separating {} files into {} locations'.format(len(detectionResults),
+ len(rowsByDirectory)))

- ##% Look for matches (or load them from file)
+ ##% Look for repeat detections (or load them from file)

  dirsToSearch = list(rowsByDirectory.keys())
  if options.debugMaxDir > 0:
@@ -1115,6 +1190,11 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  allCandidateDetections = [None] * len(dirsToSearch)

+ # If we serialize results to intermediate files, we need to remove slashes from
+ # location names; we store mappings here.
+ normalized_location_name_to_location_name = None
+ location_name_to_normalized_location_name = None
+
  if not options.bParallelizeComparisons:

  options.pbar = None
@@ -1132,7 +1212,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Pool of {} requested, but only {} folders available, reducing pool to {}'.\
  format(n_workers,len(dirNameAndRows),len(dirNameAndRows)))
  n_workers = len(dirNameAndRows)
-
+
  if options.parallelizationUsesThreads:
  pool = ThreadPool(n_workers); poolstring = 'threads'
  else:
@@ -1140,24 +1220,96 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  print('Starting comparison pool with {} {}'.format(n_workers,poolstring))

- # We get slightly nicer progress bar behavior using threads, by passing a pbar
- # object and letting it get updated. We can't serialize this object across
- # processes.
- if options.parallelizationUsesThreads:
- options.pbar = tqdm(total=len(dirNameAndRows))
- allCandidateDetections = list(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows))
- else:
+ assert options.pass_detections_to_processes_method in ('file','memory'), \
+ 'Unrecognized IPC mechanism: {}'.format(options.pass_detections_to_processes_method)
+
+ # ** Experimental **
+ #
+ # Rather than passing detections and results around in memory, write detections and
+ # results for each worker to intermediate files. May improve performance for very large
+ # results sets that exceed working memory.
+ if options.pass_detections_to_processes_method == 'file':
+
+ ##%% Convert location names to normalized names we can write to files
+
+ normalized_location_name_to_location_name = {}
+ for dir_name in dirsToSearch:
+ normalized_location_name = flatten_path(dir_name)
+ assert normalized_location_name not in normalized_location_name_to_location_name, \
+ 'Redundant location name {}, can\'t serialize to intermediate files'.format(
+ dir_name)
+ normalized_location_name_to_location_name[normalized_location_name] = dir_name
+
+ location_name_to_normalized_location_name = \
+ invert_dictionary(normalized_location_name_to_location_name)
+
+
+ ##%% Write results to files for each location
+
+ print('Writing results to intermediate files')
+
+ intermediate_json_file_folder = os.path.join(options.outputBase,'intermediate_results')
+ os.makedirs(intermediate_json_file_folder,exist_ok=True)
+
+ # i_location = 0; location_info = dirNameAndRows[0]
+ dirNameAndIntermediateFile = []
+
+ # i_location = 0; location_info = dirNameAndRows[i_location]
+ for i_location, location_info in tqdm(enumerate(dirNameAndRows)):
+
+ location_name = location_info[0]
+ assert location_name in location_name_to_normalized_location_name
+ normalized_location_name = location_name_to_normalized_location_name[location_name]
+ intermediate_results_file = os.path.join(intermediate_json_file_folder,
+ normalized_location_name + '.csv')
+ detections_table_this_location = location_info[1]
+ detections_table_this_location.to_csv(intermediate_results_file,header=True,index=False)
+ dirNameAndIntermediateFile.append((location_name,intermediate_results_file))
+
+
+ ##%% Find detections in each directory
+
  options.pbar = None
- allCandidateDetections = list(tqdm(pool.imap(
- partial(find_matches_in_directory,options=options), dirNameAndRows)))
+ allCandidateDetectionFiles = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndIntermediateFile))
+
+
+ ##%% Load into a combined list of candidate detections
+
+ allCandidateDetections = []
+
+ # candidate_detection_file = allCandidateDetectionFiles[0]
+ for candidate_detection_file in allCandidateDetectionFiles:
+ s = open(candidate_detection_file, 'r').read()
+ candidate_detections_this_file = jsonpickle.decode(s)
+ allCandidateDetections.append(candidate_detections_this_file)
+
+
+ ##%% Clean up intermediate files
+
+ shutil.rmtree(intermediate_json_file_folder)
+
+ # If we're passing things around in memory, rather than via intermediate files
+ else:
+
+ # We get slightly nicer progress bar behavior using threads, by passing a pbar
+ # object and letting it get updated. We can't serialize this object across
+ # processes.
+ if options.parallelizationUsesThreads:
+ options.pbar = tqdm(total=len(dirNameAndRows))
+ allCandidateDetections = list(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows))
+ else:
+ options.pbar = None
+ allCandidateDetections = list(tqdm(pool.imap(
+ partial(find_matches_in_directory,options=options), dirNameAndRows)))

  print('\nFinished looking for similar detections')


- ##%% Find suspicious locations based on match results
+ ##%% Mark suspicious locations based on match results

- print('Searching for repeat detections...')
+ print('Marking repeat detections...')
  nImagesWithSuspiciousDetections = 0
  nSuspiciousDetections = 0
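The experimental 'file' path above round-trips location names through flatten_path() and invert_dictionary() so that each location's detection table can be written to a flat intermediate file name. A sketch of that mapping step, assuming flatten_path() simply replaces path separators and invert_dictionary() swaps keys and values (the real helpers in md_utils may differ):

    import os

    def flatten_path(p, separator_replacement='~'):
        # Assumed stand-in for md_utils.path_utils.flatten_path
        return p.replace('\\', separator_replacement).replace('/', separator_replacement)

    def invert_dictionary(d):
        # Assumed stand-in for md_utils.ct_utils.invert_dictionary
        return {v: k for k, v in d.items()}

    dirsToSearch = ['camera01/2023-06', 'camera02/2023-06']

    normalized_to_original = {}
    for dir_name in dirsToSearch:
        normalized = flatten_path(dir_name)
        assert normalized not in normalized_to_original, 'Redundant location name'
        normalized_to_original[normalized] = dir_name

    original_to_normalized = invert_dictionary(normalized_to_original)

    # Each location's detections would then go to e.g. intermediate_results/camera01~2023-06.csv
    print(os.path.join('intermediate_results',
                       original_to_normalized['camera01/2023-06'] + '.csv'))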
@@ -1198,7 +1350,8 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  # ...for each directory

- print('Finished searching for repeat detections')
+ print('Finished marking repeat detections')
+
  print('Found {} unique detections on {} images that are suspicious'.format(
  nSuspiciousDetections, nImagesWithSuspiciousDetections))

@@ -1367,8 +1520,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
  detection.sampleImageDetections = None

  # Write out the detection index
- detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
- jsonpickle.set_encoder_options('json', sort_keys=True, indent=2)
+ detectionIndexFileName = os.path.join(filteringDir, detection_index_file_name_base)

  # Prepare the data we're going to write to the detection index file
  detectionInfo = {}
@@ -1392,4 +1544,4 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):

  return toReturn

- # ...find_repeat_detections()
+ # ...def find_repeat_detections()