megadetector 10.0.7__py3-none-any.whl → 10.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -136,7 +136,7 @@ class BatchComparisonOptions:
         #: Colormap to use for detections in file B (maps detection categories to colors)
         self.colormap_b = ['RoyalBlue']
 
-        #: Process-based parallelization isn't supported yet; this must be "True"
+        #: Whether to render images with threads (True) or processes (False)
         self.parallelize_rendering_with_threads = True
 
         #: List of filenames to include in the comparison, or None to use all files
@@ -152,7 +152,7 @@ class BatchComparisonOptions:
         self.target_width = 800
 
         #: Number of workers to use for rendering, or <=1 to disable parallelization
-        self.n_rendering_workers = 20
+        self.n_rendering_workers = 10
 
         #: Random seed for image sampling (not used if max_images_per_category is None)
         self.random_seed = 0
@@ -183,7 +183,7 @@ class BatchComparisonOptions:
         #: Should we show category names (instead of numbers) on detected boxes?
         self.show_category_names_on_detected_boxes = True
 
-        #: List of PairwiseBatchComparisonOptions that defines the comparisons we'll render.
+        #: List of PairwiseBatchComparisonOptions that defines the comparisons we'll render
         self.pairwise_options = []
 
         #: Only process images whose file names contain this token
@@ -197,7 +197,7 @@ class BatchComparisonOptions:
         self.verbose = False
 
         #: Separate out the "clean TP" and "clean TN" categories, only relevant when GT is
-        #: available.
+        #: available
         self.include_clean_categories = True
 
         #: When rendering to the output table, optionally write alternative strings
@@ -211,6 +211,10 @@ class BatchComparisonOptions:
         #: Should we include a TOC? TOC is always omitted if <=2 comparisons are performed.
         self.include_toc = True
 
+        #: Should we return the mapping from categories (e.g. "common detections") to image
+        #: pairs? Makes the return dict much larger, but allows post-hoc exploration.
+        self.return_images_by_category = False
+
 # ...class BatchComparisonOptions
 
 
@@ -224,7 +228,7 @@ class PairwiseBatchComparisonResults:
         #: String of HTML content suitable for rendering to an HTML file
         self.html_content = None
 
-        #: Possibly-modified version of the PairwiseBatchComparisonOptions supplied as input.
+        #: Possibly-modified version of the PairwiseBatchComparisonOptions supplied as input
         self.pairwise_options = None
 
         #: A dictionary with keys representing category names; in the no-ground-truth case, for example,
@@ -295,7 +299,8 @@ def _render_image_pair(fn,image_pairs,category_folder,options,pairwise_options):
     """
 
     input_image_path = os.path.join(options.image_folder,fn)
-    assert os.path.isfile(input_image_path), 'Image {} does not exist'.format(input_image_path)
+    assert os.path.isfile(input_image_path), \
+        'Image {} does not exist'.format(input_image_path)
 
     im = visualization_utils.open_image(input_image_path)
     image_pair = image_pairs[fn]
@@ -628,11 +633,21 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
         os.makedirs(options.output_folder,exist_ok=True)
 
 
+    # Just in case the user provided a single category instead of a list
+    # for category_names_to_include
+    if options.category_names_to_include is not None:
+        if isinstance(options.category_names_to_include,str):
+            options.category_names_to_include = [options.category_names_to_include]
+
     ##%% Load both result sets
 
+    if options.verbose:
+        print('Loading {}'.format(pairwise_options.results_filename_a))
     with open(pairwise_options.results_filename_a,'r') as f:
         results_a = json.load(f)
 
+    if options.verbose:
+        print('Loading {}'.format(pairwise_options.results_filename_b))
     with open(pairwise_options.results_filename_b,'r') as f:
         results_b = json.load(f)
 
@@ -654,6 +669,17 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
     detection_category_name_to_id = invert_dictionary(detection_categories_a)
     options.detection_category_id_to_name = detection_category_id_to_name
 
+    category_name_to_id_a = invert_dictionary(detection_categories_a)
+    category_name_to_id_b = invert_dictionary(detection_categories_b)
+    category_ids_to_include_a = []
+    category_ids_to_include_b = []
+
+    for category_name in options.category_names_to_include:
+        if category_name in category_name_to_id_a:
+            category_ids_to_include_a.append(category_name_to_id_a[category_name])
+        if category_name in category_name_to_id_b:
+            category_ids_to_include_b.append(category_name_to_id_b[category_name])
+
     if pairwise_options.results_description_a is None:
         if 'detector' not in results_a['info']:
             print('No model metadata supplied for results-A, assuming MDv4')
@@ -679,7 +705,7 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
     filename_to_image_b = {im['file']:im for im in images_b}
 
 
-    ##%% Make sure they represent the same set of images
+    ##%% Make sure the two result sets represent the same set of images
 
     filenames_a = [im['file'] for im in images_a]
     filenames_b_set = set([im['file'] for im in images_b])
@@ -914,7 +940,8 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
             pairwise_options.detection_thresholds_b['default']
 
     # fn = filenames_to_compare[0]
-    for i_file,fn in tqdm(enumerate(filenames_to_compare),total=len(filenames_to_compare)):
+    for i_file,fn in tqdm(enumerate(filenames_to_compare),
+                          total=len(filenames_to_compare)):
 
         if fn not in filename_to_image_b:
 
@@ -1000,27 +1027,11 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
                     categories_above_threshold_b.add(category_id)
 
         if invalid_category_error:
-
             continue
 
         # Should we be restricting the comparison to only certain categories?
        if options.category_names_to_include is not None:
 
-            # Just in case the user provided a single category instead of a list
-            if isinstance(options.category_names_to_include,str):
-                options.category_names_to_include = [options.category_names_to_include]
-
-            category_name_to_id_a = invert_dictionary(detection_categories_a)
-            category_name_to_id_b = invert_dictionary(detection_categories_b)
-            category_ids_to_include_a = []
-            category_ids_to_include_b = []
-
-            for category_name in options.category_names_to_include:
-                if category_name in category_name_to_id_a:
-                    category_ids_to_include_a.append(category_name_to_id_a[category_name])
-                if category_name in category_name_to_id_b:
-                    category_ids_to_include_b.append(category_name_to_id_b[category_name])
-
             # Restrict the categories we treat as above-threshold to the set we're supposed
             # to be using
             categories_above_threshold_a = [category_id for category_id in categories_above_threshold_a if \
@@ -1287,7 +1298,7 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
             max_conf_b = _maxempty([det['conf'] for det in im_b['detections']])
             sort_conf = max(max_conf_a,max_conf_b)
 
-        # ...what kind of ground truth (if any) do we have?
+        # ...what kind of ground truth (if any) do we have?
 
         assert comparison_category is not None
         categories_to_image_pairs[comparison_category][fn] = im_pair
@@ -1313,7 +1324,11 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
     local_output_folder = os.path.join(options.output_folder,'cmp_' + \
                                        str(output_index).zfill(3))
 
-    def render_detection_comparisons(category,image_pairs,image_filenames):
+    def _render_detection_comparisons(category,image_pairs,image_filenames):
+        """
+        Render all the detection results pairs for the sampled images in a
+        particular category (e.g. all the "common detections").
+        """
 
         print('Rendering detections for category {}'.format(category))
 
@@ -1336,7 +1351,7 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
 
         return output_image_paths
 
-    # ...def render_detection_comparisons()
+    # ...def _render_detection_comparisons()
 
     if len(options.colormap_a) > 1:
         color_string_a = str(options.colormap_a)
@@ -1371,7 +1386,7 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
 
         input_image_absolute_paths = [os.path.join(options.image_folder,fn) for fn in image_filenames]
 
-        category_image_output_paths = render_detection_comparisons(category,
+        category_image_output_paths = _render_detection_comparisons(category,
                                                                     image_pairs,image_filenames)
 
         category_html_filename = os.path.join(local_output_folder,
@@ -1469,6 +1484,8 @@ def _pairwise_compare_batch_results(options,output_index,pairwise_options):
                 print("Pool closed and joined for comparison rendering")
             except Exception:
                 pass
+
+
     ##%% Write the top-level HTML file content
 
     html_output_string = ''
@@ -1591,8 +1608,11 @@ def compare_batch_results(options):
     for i_comparison,pairwise_options in enumerate(pairwise_options_list):
 
         print('Running comparison {} of {}'.format(i_comparison,n_comparisons))
+        pairwise_options.verbose = options.verbose
         pairwise_results = \
             _pairwise_compare_batch_results(options,i_comparison,pairwise_options)
+        if not options.return_images_by_category:
+            pairwise_results.categories_to_image_pairs = None
         html_content += pairwise_results.html_content
         all_pairwise_results.append(pairwise_results)
 
@@ -2,12 +2,8 @@
 
 convert_output_format.py
 
-Converts between file formats output by our batch processing API. Currently
-supports json <--> csv conversion, but this should be the landing place for any
-conversion - including between hypothetical alternative .json versions - that we support
-in the future.
-
-The .csv format is largely obsolete, don't use it unless you're super-duper sure you need it.
+Converts between file .json and .csv representations of MD output. The .csv format is
+largely obsolete, don't use it unless you're super-duper sure you need it.
 
 """
 
@@ -15,13 +11,16 @@ The .csv format is largely obsolete, don't use it unless you're super-duper sure
 
 import argparse
 import json
-import csv
 import sys
 import os
 
 from tqdm import tqdm
+from collections import defaultdict
+
+import pandas as pd
 
 from megadetector.postprocessing.load_api_results import load_api_results_csv
+from megadetector.utils.wi_taxonomy_utils import load_md_or_speciesnet_file
 from megadetector.data_management.annotations import annotation_constants
 from megadetector.utils import ct_utils
 
@@ -35,16 +34,13 @@ def convert_json_to_csv(input_path,
                         min_confidence=None,
                         omit_bounding_boxes=False,
                         output_encoding=None,
-                        overwrite=True):
+                        overwrite=True,
+                        verbose=False):
     """
     Converts a MD results .json file to a totally non-standard .csv format.
 
     If [output_path] is None, will convert x.json to x.csv.
 
-    TODO: this function should obviously be using Pandas or some other sensible structured
-    representation of tabular data. Even a list of dicts. This implementation is quite
-    brittle and depends on adding fields to every row in exactly the right order.
-
     Args:
         input_path (str): the input .json file to convert
         output_path (str, optional): the output .csv file to generate; if this is None, uses
@@ -57,7 +53,7 @@ def convert_json_to_csv(input_path,
         output_encoding (str, optional): encoding to use for the .csv file
         overwrite (bool, optional): whether to overwrite an existing .csv file; if this is False and
             the output file exists, no-ops and returns
-
+        verbose (bool, optional): enable additional debug output
     """
 
     if output_path is None:
@@ -68,36 +64,28 @@ def convert_json_to_csv(input_path,
         return
 
     print('Loading json results from {}...'.format(input_path))
-    json_output = json.load(open(input_path))
-
-    rows = []
+    json_output = load_md_or_speciesnet_file(input_path,
+                                             verbose=verbose)
 
-    fixed_columns = ['image_path', 'max_confidence', 'detections']
+    def clean_category_name(s):
+        return s.replace(',','_').replace(' ','_').lower()
 
-    # We add an output column for each class other than 'empty',
-    # containing the maximum probability of that class for each image
-    # n_non_empty_detection_categories = len(annotation_constants.annotation_bbox_categories) - 1
-    n_non_empty_detection_categories = annotation_constants.NUM_DETECTOR_CATEGORIES
-    detection_category_column_names = []
-    assert annotation_constants.detector_bbox_category_id_to_name[0] == 'empty'
-    for cat_id in range(1,n_non_empty_detection_categories+1):
-        cat_name = annotation_constants.detector_bbox_category_id_to_name[cat_id]
-        detection_category_column_names.append('max_conf_' + cat_name)
+    # Create column names for max detection confidences
+    detection_category_id_to_max_conf_column_name = {}
+    for category_id in json_output['detection_categories'].keys():
+        category_name = clean_category_name(json_output['detection_categories'][category_id])
+        detection_category_id_to_max_conf_column_name[category_id] = \
+            'max_conf_' + category_name
 
-    n_classification_categories = 0
+    classification_category_id_to_max_conf_column_name = {}
 
+    # Create column names for max classification confidences (if necessary)
     if 'classification_categories' in json_output.keys():
-        classification_category_id_to_name = json_output['classification_categories']
-        classification_category_ids = list(classification_category_id_to_name.keys())
-        classification_category_id_to_column_number = {}
-        classification_category_column_names = []
-        for i_category,category_id in enumerate(classification_category_ids):
-            category_name = classification_category_id_to_name[category_id].\
-                replace(' ','_').replace(',','')
-            classification_category_column_names.append('max_classification_conf_' + category_name)
-            classification_category_id_to_column_number[category_id] = i_category
-
-        n_classification_categories = len(classification_category_ids)
+
+        for category_id in json_output['classification_categories'].keys():
+            category_name = clean_category_name(json_output['classification_categories'][category_id])
+            classification_category_id_to_max_conf_column_name[category_id] = \
+                'max_classification_conf_' + category_name
 
     # There are several .json fields for which we add .csv columns; other random bespoke fields
     # will be ignored.
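For reference, the new clean_category_name() helper above feeds the per-category column names (max_conf_*, max_classification_conf_*). A quick illustration with hypothetical category names:

def clean_category_name(s):
    # Same transformation as the helper added above
    return s.replace(',','_').replace(' ','_').lower()

print(clean_category_name('Mountain Lion'))   # mountain_lion -> column 'max_conf_mountain_lion'
print(clean_category_name('vehicle, other'))  # vehicle__other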
@@ -117,26 +105,43 @@ def convert_json_to_csv(input_path,
     if len(optional_fields_present) > 0:
         print('Found {} optional fields'.format(len(optional_fields_present)))
 
-    expected_row_length = len(fixed_columns) + len(detection_category_column_names) + \
-        n_classification_categories + len(optional_fields_present)
-
     print('Formatting results...')
 
+    output_records = []
+
     # i_image = 0; im = json_output['images'][i_image]
     for im in tqdm(json_output['images']):
 
-        image_id = im['file']
+        output_record = {}
+        output_records.append(output_record)
+
+        output_record['image_path'] = im['file']
+        output_record['max_confidence'] = ''
+        output_record['detections'] = ''
+
+        for field_name in optional_fields_present:
+            output_record[field_name] = ''
+            if field_name in im:
+                output_record[field_name] = im[field_name]
+
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = 0
+
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = 0
 
         if 'failure' in im and im['failure'] is not None:
-            row = [image_id, 'failure', im['failure']]
-            rows.append(row)
+            output_record['max_confidence'] = 'failure'
+            output_record['detections'] = im['failure']
             # print('Skipping failed image {} ({})'.format(im['file'],im['failure']))
             continue
 
         max_conf = ct_utils.get_max_conf(im)
+        detection_category_id_to_max_conf = defaultdict(float)
+        classification_category_id_to_max_conf = defaultdict(float)
         detections = []
-        max_detection_category_probabilities = [None] * n_non_empty_detection_categories
-        max_classification_category_probabilities = [0] * n_classification_categories
 
         # d = im['detections'][0]
         for d in im['detections']:
@@ -155,31 +160,24 @@ def convert_json_to_csv(input_path,
             xmax = input_bbox[0] + input_bbox[2]
             ymax = input_bbox[1] + input_bbox[3]
             output_detection = [ymin, xmin, ymax, xmax]
-
             output_detection.append(d['conf'])
-
-            # Category 0 is empty, for which we don't have a column, so the max
-            # confidence for category N goes in column N-1
-            detection_category_id = int(d['category'])
-            assert detection_category_id > 0 and detection_category_id <= \
-                n_non_empty_detection_categories
-            detection_category_column = detection_category_id - 1
-            detection_category_max = max_detection_category_probabilities[detection_category_column]
-            if detection_category_max is None or d['conf'] > detection_category_max:
-                max_detection_category_probabilities[detection_category_column] = d['conf']
-
-            output_detection.append(detection_category_id)
+            output_detection.append(int(d['category']))
             detections.append(output_detection)
 
+            detection_category_id = d['category']
+            detection_category_max = detection_category_id_to_max_conf[detection_category_id]
+            if d['conf'] > detection_category_max:
+                detection_category_id_to_max_conf[detection_category_id] = d['conf']
+
             if 'classifications' in d:
-                assert n_classification_categories > 0,\
-                    'Oops, I have classification results, but no classification metadata'
+
                 for c in d['classifications']:
-                    category_id = c[0]
-                    p = c[1]
-                    category_index = classification_category_id_to_column_number[category_id]
-                    if (max_classification_category_probabilities[category_index] < p):
-                        max_classification_category_probabilities[category_index] = p
+                    classification_category_id = c[0]
+                    classification_conf = c[1]
+                    classification_category_max = \
+                        classification_category_id_to_max_conf[classification_category_id]
+                    if classification_conf > classification_category_max:
+                        classification_category_id_to_max_conf[classification_category_id] = d['conf']
 
                 # ...for each classification
 
@@ -191,40 +189,36 @@ def convert_json_to_csv(input_path,
         if not omit_bounding_boxes:
             detection_string = json.dumps(detections)
 
-        row = [image_id, max_conf, detection_string]
-        row.extend(max_detection_category_probabilities)
-        row.extend(max_classification_category_probabilities)
+        output_record['detections'] = detection_string
+        output_record['max_confidence'] = max_conf
 
-        for field_name in optional_fields_present:
-            if field_name not in im:
-                row.append('')
-            else:
-                row.append(str(im[field_name]))
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = \
+                detection_category_id_to_max_conf[detection_category_id]
 
-        assert len(row) == expected_row_length
-        rows.append(row)
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = \
+                classification_category_id_to_max_conf[classification_category_id]
 
     # ...for each image
 
     print('Writing to csv...')
 
-    with open(output_path, 'w', newline='', encoding=output_encoding) as f:
-        writer = csv.writer(f, delimiter=',')
-        header = fixed_columns
-        header.extend(detection_category_column_names)
-        if n_classification_categories > 0:
-            header.extend(classification_category_column_names)
-        for field_name in optional_fields_present:
-            header.append(field_name)
-        writer.writerow(header)
-        writer.writerows(rows)
+    df = pd.DataFrame(output_records)
+
+    if omit_bounding_boxes:
+        df = df.drop('detections',axis=1)
+    df.to_csv(output_path,index=False,header=True)
 
 # ...def convert_json_to_csv(...)
 
 
 def convert_csv_to_json(input_path,output_path=None,overwrite=True):
     """
-    Convert .csv to .json. If output_path is None, will convert x.csv to x.json.
+    Convert .csv to .json. If output_path is None, will convert x.csv to x.json. This
+    supports a largely obsolete .csv format, there's almost no reason you want to do this.
 
     Args:
         input_path (str): .csv filename to convert to .json
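The net effect of the convert_output_format.py changes above is to replace the order-sensitive csv.writer rows with a list of dicts handed to pandas. A standalone sketch of that pattern (the record contents are made up; the real function derives its columns from the file's category maps):

import pandas as pd

# Build one dict per image; keys become CSV columns
records = []
for image in [{'file': 'a.jpg', 'max_conf': 0.9},
              {'file': 'b.jpg', 'max_conf': 0.2}]:
    record = {'image_path': image['file'],
              'max_confidence': image['max_conf'],
              'max_conf_animal': 0}  # one column per detection category
    records.append(record)

# Missing keys just become NaN, so rows no longer have to agree on field
# order the way the old csv.writer implementation did
df = pd.DataFrame(records)
df.to_csv('results.csv', index=False, header=True)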
@@ -1145,7 +1145,7 @@ def process_batch_results(options):
 
     images_to_visualize = detections_df
 
-    if options.num_images_to_sample is not None and options.num_images_to_sample > 0:
+    if (options.num_images_to_sample is not None) and (options.num_images_to_sample > 0):
         images_to_visualize = images_to_visualize.sample(
             n=min(options.num_images_to_sample, len(images_to_visualize)),
             random_state=options.sample_seed)
@@ -83,6 +83,9 @@ class SubsetJsonDetectorOutputOptions:
     def __init__(self):
 
         #: Only process files containing the token 'query'
+        #:
+        #: Does not support general regexes, but supports ^ as a special case
+        #: regex-like notation for "starts with"
         self.query = None
 
         #: Replace 'query' with 'replacement' if 'replacement' is not None. If 'query' is None,
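Per the new comment, 'query' is a plain substring match, with '^' supported as a regex-like "starts with" special case. A tiny sketch (the import path is assumed from the package layout):

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions

options = SubsetJsonDetectorOutputOptions()
options.query = '^camera_01/'  # keep only files whose relative path starts with camera_01/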
@@ -153,6 +156,12 @@ class SubsetJsonDetectorOutputOptions:
         #: to be contiguous. Set to 1 to remove empty categories only.
         self.remove_classification_categories_below_count = None
 
+        #: Remove detections above a threshold size (as a fraction of the image size)
+        self.maximum_detection_size = None
+
+        #: Remove detections below a threshold size (as a fraction of the image size)
+        self.minimum_detection_size = None
+
 # ...class SubsetJsonDetectorOutputOptions
 
 
@@ -271,6 +280,71 @@ def remove_classification_categories_below_count(data, options):
 # ...def remove_classification_categories_below_count(...)
 
 
+def subset_json_detector_output_by_size(data, options):
+    """
+    Remove detections above or below threshold sizes (as a fraction
+    of the image size).
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if (options.maximum_detection_size is None) and \
+       (options.minimum_detection_size is None):
+        return data
+
+    if options.maximum_detection_size is None:
+        options.maximum_detection_size = 1000
+
+    if options.minimum_detection_size is None:
+        options.minimum_detection_size = -1000
+
+    print('Subsetting by size ({} <--> {})'.format(
+        options.minimum_detection_size,
+        options.maximum_detection_size))
+
+    images_in = data['images']
+    images_out = []
+
+    # im = images_in[0]
+    for i_image, im in tqdm(enumerate(images_in), total=len(images_in)):
+
+        # Always keep failed images; if the caller wants to remove these, they
+        # will use remove_failed_images
+        if ('detections' not in im) or (im['detections'] is None):
+            images_out.append(im)
+            continue
+
+        detections_to_keep = []
+
+        for det in im['detections']:
+
+            # [x_min, y_min, width_of_box, height_of_box]
+            detection_size = det['bbox'][2] * det['bbox'][3]
+
+            if (detection_size >= options.minimum_detection_size) and \
+               (detection_size <= options.maximum_detection_size):
+                detections_to_keep.append(det)
+
+        im['detections'] = detections_to_keep
+
+        images_out.append(im)
+
+    # ...for each image
+
+    data['images'] = images_out
+    print('done, found {} matches (of {})'.format(
+        len(data['images']),len(images_in)))
+
+    return data
+
+# ...def subset_json_detector_output_by_size(...)
+
+
 def subset_json_detector_output_by_confidence(data, options):
     """
     Removes all detections below options.confidence_threshold.
@@ -671,6 +745,11 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
 
     data = subset_json_detector_output_by_list(data, options)
 
+    if (options.maximum_detection_size is not None) or \
+       (options.minimum_detection_size is not None):
+
+        data = subset_json_detector_output_by_size(data, options)
+
     if not options.split_folders:
 
         _write_detection_results(data, output_filename, options)
@@ -834,6 +913,10 @@ def main(): # noqa
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--maximum_detection_size', type=float, default=None,
+                        help='Remove detections above this size (as a fraction of the image size)')
+    parser.add_argument('--minimum_detection_size', type=float, default=None,
+                        help='Remove detections below this size (as a fraction of the image size)')
     parser.add_argument('--keep_files_in_list', type=str, default=None,
                         help='Keep only files in this list, which can be a .json results file or a folder.' + \
                              ' Assumes that the input .json file contains relative paths when comparing to a folder.')
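Finally, a sketch of the new size-based filtering in subset_json_detector_output.py; the import path and the CLI invocation are assumptions, but the option names, flag names, and the function signature all appear in the hunks above. Detection size is box area as a fraction of image area (bbox width * height, in MD's normalized [x_min, y_min, width, height] box format):

from megadetector.postprocessing.subset_json_detector_output import (
    SubsetJsonDetectorOutputOptions, subset_json_detector_output)

options = SubsetJsonDetectorOutputOptions()
options.minimum_detection_size = 0.001  # drop boxes under 0.1% of the image area
options.maximum_detection_size = 0.8    # drop boxes covering more than 80%

# Placeholder filenames; note that failed images are always kept by the size filter
subset_json_detector_output('md_results.json', 'md_results_filtered.json', options)

# Approximate CLI equivalent (flag names are from the argparse hunk above):
#   python subset_json_detector_output.py md_results.json md_results_filtered.json \
#       --minimum_detection_size 0.001 --maximum_detection_size 0.8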