megadetector-5.0.6-py3-none-any.whl → megadetector-5.0.8-py3-none-any.whl

This diff shows the changes between these two publicly released package versions as they appear in their public registries, and is provided for informational purposes only.


Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
api/batch_processing/postprocessing/load_api_results.py

@@ -2,17 +2,18 @@
 #
 # load_api_results.py
 #
-# Loads the output of the batch processing API (json) into a pandas dataframe.
+# DEPRECATED
 #
-# Also functions to group entries by seq_id.
+# As of 2023.12, this module is used in postprocessing and RDE. Not recommended
+# for new code.
 #
-# Includes the deprecated functions that worked with the old CSV API output format.
+# Loads the output of the batch processing API (json) into a Pandas dataframe.
+#
+# Includes functions to read/write the (very very old) .csv results format.
 #
 ########
 
-#%% Constants and imports
-
-from collections import defaultdict
+#%% Imports
 
 import json
 import os
@@ -23,72 +24,32 @@ import pandas as pd
 
 from md_utils import ct_utils
 
-headers = ['image_path', 'max_confidence', 'detections']
-
-
-#%% Functions for grouping by sequence_id
-
-def ss_file_to_file_name(f):
-    # example
-    # input 'file': 'SER/S1/F08/F08_R3/S1_F08_R3_PICT1150.JPG'
-    # output 'id': 'S1/F08/F08_R3/S1_F08_R3_PICT1150.JPG'
-    return f.split('SER/')[1].split('.JPG')[0]
-
-
-def caltech_file_to_file_name(f):
-    return f.split('cct_images/')[1].split('.')[0]
-
-
-def api_results_groupby(api_output_path, gt_db_indexed, file_to_image_id, field='seq_id'):
-    """
-    Given the output file of the API, groupby (currently only seq_id).
-
-    Args:
-        api_output_path: path to the API output json file
-        gt_db_indexed: an instance of IndexedJsonDb so we know the seq_id to image_id mapping
-        file_to_image_id: a function that takes in the 'file' field in 'images' in the detector
-            output file and converts it to the 'id' field in the gt DB.
-        field: which field in the 'images' array to group by
-
-    Returns:
-        A dict where the keys are of the field requested, each points to an array
-        containing entries in the 'images' section of the output file
-    """
-
-    with open(api_output_path) as f:
-        detection_results = json.load(f)
 
-    res = defaultdict(list)
-    for i in detection_results['images']:
-        image_id = file_to_image_id(i['file'])
-        field_val = gt_db_indexed.image_id_to_image[image_id][field]
-        res[field_val].append(i)
-    return res
-
-
-#%% Functions for loading the result as a Pandas DataFrame
+#%% Functions for loading .json results into a Pandas DataFrame, and writing back to .json
 
 def load_api_results(api_output_path: str, normalize_paths: bool = True,
-                     filename_replacements: Optional[Mapping[str, str]] = None
+                     filename_replacements: Optional[Mapping[str, str]] = None,
+                     force_forward_slashes: bool = True
                      ) -> Tuple[pd.DataFrame, Dict]:
     """
-    Loads the json formatted results from the batch processing API to a
-    Pandas DataFrame, mainly useful for various postprocessing functions.
+    Loads json-formatted MegaDetector results to a Pandas DataFrame.
 
     Args:
-        api_output_path: path to the API output json file
+        api_output_path: path to the output json file
         normalize_paths: whether to apply os.path.normpath to the 'file' field
            in each image entry in the output file
        filename_replacements: replace some path tokens to match local paths to
            the original blob structure
+        force_forward_slashes: whether to convert backslashes to forward slashes
+           in filenames
 
     Returns:
        detection_results: pd.DataFrame, contains at least the columns:
-            ['file', 'detections','failure']
+           ['file', 'detections','failure']
        other_fields: a dict containing fields in the results other than 'images'
    """
 
-    print('Loading API results from {}'.format(api_output_path))
+    print('Loading results from {}'.format(api_output_path))
 
    with open(api_output_path) as f:
        detection_results = json.load(f)
@@ -97,18 +58,20 @@ def load_api_results(api_output_path: str, normalize_paths: bool = True,
     for s in ['info', 'detection_categories', 'images']:
         assert s in detection_results, 'Missing field {} in detection results'.format(s)
 
-    # Fields in the API output json other than 'images'
+    # Fields in the output json other than 'images'
     other_fields = {}
     for k, v in detection_results.items():
         if k != 'images':
             other_fields[k] = v
 
-    # Normalize paths to simplify comparisons later
     if normalize_paths:
         for image in detection_results['images']:
-            image['file'] = os.path.normpath(image['file'])
-            # image['file'] = image['file'].replace('\\','/')
+            image['file'] = os.path.normpath(image['file'])
 
+    if force_forward_slashes:
+        for image in detection_results['images']:
+            image['file'] = image['file'].replace('\\','/')
+
     # Replace some path tokens to match local paths to original blob structure
     if filename_replacements is not None:
         for string_to_replace in filename_replacements.keys():
@@ -127,9 +90,7 @@ def load_api_results(api_output_path: str, normalize_paths: bool = True,
     # Pack the json output into a Pandas DataFrame
     detection_results = pd.DataFrame(detection_results['images'])
 
-
-
-    print('Finished loading API results for {} images from {}'.format(
+    print('Finished loading MegaDetector results for {} images from {}'.format(
         len(detection_results),api_output_path))
 
     return detection_results, other_fields
@@ -137,7 +98,7 @@ def load_api_results(api_output_path: str, normalize_paths: bool = True,
 
 def write_api_results(detection_results_table, other_fields, out_path):
     """
-    Writes a Pandas DataFrame back to a json that is compatible with the API output format.
+    Writes a Pandas DataFrame to the MegaDetector .json format.
     """
 
     print('Writing detection results to {}'.format(out_path))
@@ -148,6 +109,27 @@ def write_api_results(detection_results_table, other_fields, out_path):
                                             double_precision=3)
     images = json.loads(images)
     fields['images'] = images
+
+    # Convert the 'version' field back to a string as per format convention
+    try:
+        version = other_fields['info']['format_version']
+        if not isinstance(version,str):
+            other_fields['info']['format_version'] = str(version)
+    except Exception:
+        print('Warning: error determining format version')
+        pass
+
+    # Remove 'max_detection_conf' as per newer file convention (format >= v1.3)
+    try:
+        version = other_fields['info']['format_version']
+        version = float(version)
+        if version >= 1.3:
+            for im in images:
+                if 'max_detection_conf' in im:
+                    del im['max_detection_conf']
+    except Exception:
+        print('Warning: error removing max_detection_conf from output')
+        pass
 
     with open(out_path, 'w') as f:
         json.dump(fields, f, indent=1)
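As a standalone illustration of the two conventions the added block enforces (the 'info' field 'format_version' is stored as a string, and 'max_detection_conf' is dropped from each image entry for format versions >= 1.3), here is a minimal sketch on a hand-built results dict; the dict and its values are hypothetical, only the field names follow the MegaDetector output format:

# Hypothetical results dict, illustrating the output-format conventions above
results = {
    'info': {'format_version': 1.3},   # accidentally stored as a number
    'detection_categories': {'1': 'animal'},
    'images': [{'file': 'img_0001.jpg',
                'max_detection_conf': 0.92,
                'detections': []}],
}

# Store the format version as a string, per the format convention
results['info']['format_version'] = str(results['info']['format_version'])

# For format versions >= 1.3, 'max_detection_conf' is no longer written
if float(results['info']['format_version']) >= 1.3:
    for im in results['images']:
        im.pop('max_detection_conf', None)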
@@ -157,15 +139,16 @@ def write_api_results(detection_results_table, other_fields, out_path):
 
 def load_api_results_csv(filename, normalize_paths=True, filename_replacements={}, nrows=None):
     """
-    DEPRECATED
-    Loads .csv-formatted results from the batch processing API to a pandas table
+    [DEPRECATED]
+
+    Loads .csv-formatted MegaDetector results to a pandas table
     """
 
-    print('Loading API results from {}'.format(filename))
+    print('Loading MegaDetector results from {}'.format(filename))
 
     detection_results = pd.read_csv(filename,nrows=nrows)
 
-    print('De-serializing API results from {}'.format(filename))
+    print('De-serializing MegaDetector results from {}'.format(filename))
 
     # Confirm that this is really a detector output file
     for s in ['image_path','max_confidence','detections']:
@@ -191,17 +174,18 @@ def load_api_results_csv(filename, normalize_paths=True, filename_replacements={
                 fn = fn.replace(string_to_replace,replacement_string)
             detection_results.at[iRow,'image_path'] = fn
 
-    print('Finished loading and de-serializing API results for {} images from {}'.format(
+    print('Finished loading and de-serializing MD results for {} images from {}'.format(
         len(detection_results),filename))
 
     return detection_results
 
 
 def write_api_results_csv(detection_results, filename):
-    """
-    DEPRECATED
-    Writes a pandas table to csv in a way that's compatible with the .csv API output
-    format. Currently just a wrapper around to_csv that just forces output writing
+    """
+    [DEPRECATED]
+
+    Writes a Pandas table to csv in a way that's compatible with the .csv output
+    format. Currently just a wrapper around to_csv that forces output writing
     to go through a common code path.
     """
 
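For orientation, a minimal usage sketch of the updated loader/writer pair; the import path and the filenames below are assumptions, not taken from this diff, but the parameters are the ones shown above:

# Round-trip a MegaDetector results file through load_api_results/write_api_results.
# Import path and filenames are hypothetical.
from api.batch_processing.postprocessing.load_api_results import \
    load_api_results, write_api_results

# force_forward_slashes (new in this release) converts backslashes in the
# 'file' field to forward slashes after path normalization
df, other_fields = load_api_results('md_results.json',
                                    normalize_paths=True,
                                    force_forward_slashes=True)

# df has at least the columns 'file', 'detections', and 'failure';
# other_fields holds everything outside 'images' (e.g. 'info' and
# 'detection_categories')
write_api_results(df, other_fields, 'md_results_rewritten.json')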
api/batch_processing/postprocessing/md_to_labelme.py

@@ -20,6 +20,10 @@ import json
 
 from tqdm import tqdm
 
+from multiprocessing.pool import Pool
+from multiprocessing.pool import ThreadPool
+from functools import partial
+
 from md_visualization.visualization_utils import open_image
 from md_utils.ct_utils import truncate_float
 
@@ -29,15 +33,21 @@ default_confidence_threshold = 0.15
 
 #%% Functions
 
-def get_labelme_dict_for_image(im,image_base_name,category_id_to_name,info=None,confidence_threshold=None):
+def get_labelme_dict_for_image(im,image_base_name,category_id_to_name,
+                               info=None,confidence_threshold=None):
     """
     For the given image struct in MD results format, reformat the detections into
     labelme format. Returns a dict.
+
+    'height' and 'width' are required in [im].
+
+    image_base_name is written directly to the 'imagePath' field in the output; it should generally be
+    os.path.basename(your_image_file).
     """
 
     if confidence_threshold is None:
         confidence_threshold = -1.0
-
+
     output_dict = {}
     if info is not None:
         output_dict['detector_info'] = info
@@ -48,7 +58,9 @@ def get_labelme_dict_for_image(im,image_base_name,category_id_to_name,info=None,
     output_dict['imageHeight'] = im['height']
     output_dict['imageWidth'] = im['width']
     output_dict['imageData'] = None
+    output_dict['detections'] = im['detections']
 
+    # det = im['detections'][1]
     for det in im['detections']:
 
         if det['conf'] < confidence_threshold:
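A minimal sketch of calling get_labelme_dict_for_image with the fields the updated docstring requires ('height' and 'width' present in the image struct, and a basename for the output 'imagePath'); the import path and the example values below are assumptions:

from api.batch_processing.postprocessing.md_to_labelme import get_labelme_dict_for_image

# Hypothetical MD-format image entry; 'height' and 'width' must be present
im = {
    'file': 'camera01/img_0001.jpg',
    'height': 1080,
    'width': 1920,
    'detections': [{'category': '1', 'conf': 0.85, 'bbox': [0.1, 0.2, 0.3, 0.4]}],
}

labelme_dict = get_labelme_dict_for_image(
    im,
    image_base_name='img_0001.jpg',        # written to the 'imagePath' field
    category_id_to_name={'1': 'animal'},
    confidence_threshold=0.2)              # detections below this are skipped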
@@ -78,69 +90,125 @@ def get_labelme_dict_for_image(im,image_base_name,category_id_to_name,info=None,
 # ...def get_labelme_dict_for_image()
 
 
+def _write_output_for_image(im,image_base,extension_prefix,info,
+                            confidence_threshold,category_id_to_name,overwrite,
+                            verbose=False):
+
+    if 'failure' in im and im['failure'] is not None:
+        assert 'detections' not in im or im['detections'] is None
+        if verbose:
+            print('Skipping labelme file generation for failed image {}'.format(
+                im['file']))
+        return
+
+    im_full_path = os.path.join(image_base,im['file'])
+    json_path = os.path.splitext(im_full_path)[0] + extension_prefix + '.json'
+
+    if (not overwrite) and (os.path.isfile(json_path)):
+        if verbose:
+            print('Skipping existing file {}'.format(json_path))
+        return
+
+    output_dict = get_labelme_dict_for_image(im,
+                                             image_base_name=os.path.basename(im_full_path),
+                                             category_id_to_name=category_id_to_name,
+                                             info=info,
+                                             confidence_threshold=confidence_threshold)
+
+    with open(json_path,'w') as f:
+        json.dump(output_dict,f,indent=1)
+
+# ...def write_output_for_image(...)
+
+
+
 def md_to_labelme(results_file,image_base,confidence_threshold=None,
-                  overwrite=False):
+                  overwrite=False,extension_prefix='',n_workers=1,
+                  use_threads=False,bypass_image_size_read=False,
+                  verbose=False):
     """
     For all the images in [results_file], write a .json file in labelme format alongside the
     corresponding relative path within image_base.
+
+    If non-empty, "extension_prefix" will be inserted before the .json extension.
     """
 
-    # Load MD results
-    with open(results_file,'r') as f:
-        md_results = json.load(f)
+    if extension_prefix is None:
+        extension_prefix = ''
 
-    # Read image sizes
-    #
-    # TODO: parallelize this loop
-    #
-    # im = md_results['images'][0]
-    for im in tqdm(md_results['images']):
+    # Load MD results if necessary
+    if isinstance(results_file,dict):
+        md_results = results_file
+    else:
+        print('Loading MD results...')
+        with open(results_file,'r') as f:
+            md_results = json.load(f)
 
-        # Make sure this file exists
-        im_full_path = os.path.join(image_base,im['file'])
-        assert os.path.isfile(im_full_path), 'Image file {} does not exist'.format(im_full_path)
+    # Read image sizes if necessary
+    if bypass_image_size_read:
 
-        # Load w/h information if necessary
-        if 'height' not in im or 'width' not in im:
-
-            try:
-                pil_im = open_image(im_full_path)
-                im['width'] = pil_im.width
-                im['height'] = pil_im.height
-            except Exception:
-                print('Warning: cannot open image {}, treating as a failure during inference'.format(
-                    im_full_path))
-                if 'failure' not in im:
-                    im['failure'] = 'Failure image access'
-
-        # ...if we need to read w/h information
+        print('Bypassing image size read')
 
-    # ...for each image
+    else:
 
-    # Write output
-    for im in tqdm(md_results['images']):
-
-        if 'failure' in im and im['failure'] is not None:
-            assert 'detections' not in im
-            print('Warning: skipping labelme file generation for failed image {}'.format(
-                im['file']))
-            continue
+        # TODO: parallelize this loop
+
+        print('Reading image sizes...')
+
+        # im = md_results['images'][0]
+        for im in tqdm(md_results['images']):
+
+            # Make sure this file exists
+            im_full_path = os.path.join(image_base,im['file'])
+            assert os.path.isfile(im_full_path), 'Image file {} does not exist'.format(im_full_path)
+
+            json_path = os.path.splitext(im_full_path)[0] + extension_prefix + '.json'
+
+            # Don't even bother reading sizes for files we're not going to generate
+            if (not overwrite) and (os.path.isfile(json_path)):
+                continue
 
-        im_full_path = os.path.join(image_base,im['file'])
-        json_path = os.path.splitext(im_full_path)[0] + '.json'
+            # Load w/h information if necessary
+            if 'height' not in im or 'width' not in im:
+
+                try:
+                    pil_im = open_image(im_full_path)
+                    im['width'] = pil_im.width
+                    im['height'] = pil_im.height
+                except Exception:
+                    print('Warning: cannot open image {}, treating as a failure during inference'.format(
+                        im_full_path))
+                    if 'failure' not in im:
+                        im['failure'] = 'Failure image access'
+
+            # ...if we need to read w/h information
+
+        # ...for each image
 
-        if (not overwrite) and (os.path.isfile(json_path)):
-            print('Skipping existing file {}'.format(json_path))
-            continue
+    # ...if we're not bypassing image size read
 
-        output_dict = get_labelme_dict_for_image(im,
-                                                 image_base_name=os.path.basename(im_full_path),
-                                                 category_id_to_name=md_results['detection_categories'],
-                                                 info=md_results['info'],
-                                                 confidence_threshold=confidence_threshold)
-
-        with open(json_path,'w') as f:
-            json.dump(output_dict,f,indent=1)
+    print('\nGenerating labelme files...')
+
+    # Write output
+    if n_workers <= 1:
+        for im in tqdm(md_results['images']):
+            _write_output_for_image(im,image_base,extension_prefix,md_results['info'],confidence_threshold,
+                                    md_results['detection_categories'],overwrite,verbose)
+    else:
+        if use_threads:
+            print('Starting parallel thread pool with {} workers'.format(n_workers))
+            pool = ThreadPool(n_workers)
+        else:
+            print('Starting parallel process pool with {} workers'.format(n_workers))
+            pool = Pool(n_workers)
+        _ = list(tqdm(pool.imap(
+            partial(_write_output_for_image,
+                    image_base=image_base,extension_prefix=extension_prefix,
+                    info=md_results['info'],confidence_threshold=confidence_threshold,
+                    category_id_to_name=md_results['detection_categories'],
+                    overwrite=overwrite,verbose=verbose),
+            md_results['images']),
+            total=len(md_results['images'])))
 
 # ...for each image
 
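A hedged usage sketch of the reworked md_to_labelme; the import path and the paths below are assumptions, while the keyword arguments are the ones added in this release:

from api.batch_processing.postprocessing.md_to_labelme import md_to_labelme

# Write one labelme .json next to each image, parallelizing the per-image
# writes over a thread pool. results_file can now also be an already-loaded
# results dict rather than a filename.
md_to_labelme(results_file='md_results.json',         # hypothetical path
              image_base='/data/camera_trap_images',  # hypothetical path
              confidence_threshold=0.2,
              extension_prefix='',            # inserted before '.json' if non-empty
              n_workers=8,                    # >1 enables the worker pool
              use_threads=True,               # False would use a process pool
              bypass_image_size_read=False,
              overwrite=False)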
api/batch_processing/postprocessing/merge_detections.py

@@ -3,9 +3,12 @@
 # merge_detections.py
 #
 # Merge high-confidence detections from one or more results files into another
-# file. Typically used to combine results from MDv5b and/or MDv4 into a "primary"
+# file. Typically used to combine results from MDv5b and/or MDv4 into a "primary"
 # results file from MDv5a.
 #
+# Detection categories must be the same in both files; if you want to first remap
+# one file's category mapping to be the same as another's, see remap_detection_categories.
+#
 # If you want to literally merge two .json files, see combine_api_outputs.py.
 #
 ########
@@ -30,7 +33,7 @@ class MergeDetectionsOptions:
 
         self.max_detection_size = 1.01
         self.min_detection_size = 0
-        self.source_confidence_thresholds = [0.2]
+        self.source_confidence_thresholds = [0.05]
 
         # Don't bother merging into target images if there is a similar detection
         # above this threshold (or if there is *any* detection above this threshold,
@@ -38,7 +41,7 @@
         self.target_confidence_threshold = 0.2
 
         # If you want to merge only certain categories, specify one
-        # (but not both) of these.
+        # (but not both) of these. These are category IDs, not names.
         self.categories_to_include = None
         self.categories_to_exclude = None
 
@@ -47,11 +50,28 @@
         self.merge_empty_only = False
 
         self.iou_threshold = 0.65
+
+        self.overwrite = False
 
 
 #%% Main function
 
 def merge_detections(source_files,target_file,output_file,options=None):
+    """
+    Merge high-confidence detections from one or more results files into another
+    file. Typically used to combine results from MDv5b and/or MDv4 into a "primary"
+    results file from MDv5a.
+
+    [source_files] (a list of files or a single filename) specifies the set of
+    results files that will be merged into [target_file]. The difference between a
+    "source file" and the "target file" is that if no merging is necessary, either because
+    two boxes are nearly identical or because merge_only_empty is True and the target
+    file already has above-threshold detection for an image+category, the output file gets
+    the results of the "target" file. I.e., the "target" file wins all ties.
+
+    The results are written to [output_file].
+
+    """
 
     if isinstance(source_files,str):
         source_files = [source_files]
@@ -59,6 +79,10 @@ def merge_detections(source_files,target_file,output_file,options=None):
     if options is None:
         options = MergeDetectionsOptions()
 
+    if (not options.overwrite) and (os.path.isfile(output_file)):
+        print('File {} exists, bypassing merge'.format(output_file))
+        return
+
     assert not ((options.categories_to_exclude is not None) and \
                 (options.categories_to_include is not None)), \
                 'categories_to_include and categories_to_exclude are mutually exclusive'
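A hedged usage sketch of merge_detections with the options touched in this release (the lower default source confidence threshold and the new overwrite flag); the import path and filenames are assumptions:

from api.batch_processing.postprocessing.merge_detections import \
    merge_detections, MergeDetectionsOptions

options = MergeDetectionsOptions()
options.source_confidence_thresholds = [0.05]  # one threshold per source file
options.target_confidence_threshold = 0.2
options.overwrite = False   # new in this release: skip the merge if output_file exists

# High-confidence MDv5b detections are merged into the MDv5a results; on ties,
# the target file wins. Detection category maps must match across files.
merge_detections(source_files=['mdv5b_results.json'],
                 target_file='mdv5a_results.json',
                 output_file='merged_results.json',
                 options=options)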
@@ -133,7 +157,8 @@
         output_data['info']['detections_transferred_from'].append(os.path.basename(source_file))
         output_data['info']['detector'] = output_data['info']['detector'] + ' + ' + source_detector_name
 
-        assert source_data['detection_categories'] == output_data['detection_categories']
+        assert source_data['detection_categories'] == output_data['detection_categories'], \
+            'Cannot merge files with different detection category maps'
 
         source_confidence_threshold = options.source_confidence_thresholds[i_source_file]
 
@@ -246,7 +271,7 @@
     # ...for each source file
 
     with open(output_file,'w') as f:
-        json.dump(output_data,f,indent=2)
+        json.dump(output_data,f,indent=1)
 
     print('Saved merged results to {}'.format(output_file))
 