megadetector 5.0.6-py3-none-any.whl → 5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
md_utils/process_utils.py CHANGED
@@ -17,14 +17,28 @@ import subprocess
 
 os.environ["PYTHONUNBUFFERED"] = "1"
 
-def execute(cmd):
+def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
-
+
+    if verbose:
+        if encoding is not None:
+            print('Launching child process with non-default encoding {}'.format(encoding))
+        if errors is not None:
+            print('Launching child process with non-default text error handling {}'.format(errors))
+        if env is not None:
+            print('Launching child process with non-default environment {}'.format(str(env)))
+
     # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
     popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                             shell=True, universal_newlines=True)
+                             shell=True, universal_newlines=True, encoding=encoding,
+                             errors=errors, env=env)
     for stdout_line in iter(popen.stdout.readline, ""):
         yield stdout_line
     popen.stdout.close()
@@ -33,22 +47,27 @@ def execute(cmd):
         raise subprocess.CalledProcessError(return_code, cmd)
 
 
-def execute_and_print(cmd,print_output=True):
+def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
     a dictionary with fields "status" and "output".
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
 
     to_return = {'status':'unknown','output':''}
-    output=[]
+    output = []
     try:
-        for s in execute(cmd):
+        for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
             output.append(s)
             if print_output:
                 print(s,end='',flush=True)
         to_return['status'] = 0
     except subprocess.CalledProcessError as cpe:
-        print('execute_and_print caught error: {}'.format(cpe.output))
+        print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
         to_return['status'] = cpe.returncode
         to_return['output'] = output
 
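A minimal usage sketch of the new parameters (the command string and the encoding choices here are placeholders, not values suggested by the diff):

from md_utils.process_utils import execute_and_print

# Run a command whose output may not be valid UTF-8; errors='replace'
# substitutes undecodable bytes instead of raising. 'ls -l' is just a
# placeholder command.
result = execute_and_print('ls -l',
                           print_output=True,
                           encoding='utf-8',
                           errors='replace',
                           verbose=True)

# execute_and_print returns {'status': ..., 'output': ...}, where 'output'
# is the list of captured lines
if result['status'] == 0:
    print('Command succeeded with {} lines of output'.format(len(result['output'])))
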
md_utils/split_locations_into_train_val.py ADDED
@@ -0,0 +1,215 @@
+########
+#
+# split_locations_into_train_val.py
+#
+# Split a list of location IDs into training and validation, targeting a specific
+# train/val split for each category, but allowing some categories to be tighter or looser
+# than others. Does nothing particularly clever; it just randomly splits locations into
+# train/val lots of times using the target val fraction, and picks the one that meets the
+# specified constraints and minimizes weighted error, where "error" is defined as the
+# sum of each class's absolute divergence from the target val fraction.
+#
+########
+
+#%% Imports/constants
+
+import random
+import numpy as np
+
+from collections import defaultdict
+from md_utils.ct_utils import sort_dictionary_by_value
+from tqdm import tqdm
+
+
+#%% Main function
+
+def split_locations_into_train_val(location_to_category_counts,
+                                   n_random_seeds=10000,
+                                   target_val_fraction=0.15,
+                                   category_to_max_allowable_error=None,
+                                   category_to_error_weight=None,
+                                   default_max_allowable_error=0.1):
+    """
+    Split a list of location IDs into training and validation, targeting a specific
+    train/val split for each category, but allowing some categories to be tighter or looser
+    than others. Does nothing particularly clever; it just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
+    specified constraints and minimizes weighted error, where "error" is defined as the
+    sum of each class's absolute divergence from the target val fraction.
+
+    location_to_category_counts should be a dict mapping location IDs to dicts,
+    with each dict mapping a category name to a count. Any categories not present in a
+    particular dict are assumed to have a count of zero for that location.
+
+    If not None, category_to_max_allowable_error should be a dict mapping category names
+    to maximum allowable errors. These are hard constraints, but you can specify a subset
+    of categories. Categories not included here have a maximum error of Inf.
+
+    If not None, category_to_error_weight should be a dict mapping category names to
+    error weights. You can specify a subset of categories. Categories not included here
+    have a weight of 1.0.
+
+    default_max_allowable_error is the maximum allowable error for categories not present in
+    category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
+    categories not present in category_to_max_allowable_error.
+
+    returns val_locations,category_to_val_fraction
+    """
+
+    location_ids = list(location_to_category_counts.keys())
+
+    n_val_locations = int(target_val_fraction*len(location_ids))
+
+    if category_to_max_allowable_error is None:
+        category_to_max_allowable_error = {}
+
+    if category_to_error_weight is None:
+        category_to_error_weight = {}
+
+    # Category ID to total count; the total count is used only for printouts
+    category_id_to_count = {}
+    for location_id in location_to_category_counts:
+        for category_id in location_to_category_counts[location_id].keys():
+            if category_id not in category_id_to_count:
+                category_id_to_count[category_id] = 0
+            category_id_to_count[category_id] += \
+                location_to_category_counts[location_id][category_id]
+
+    category_ids = set(category_id_to_count.keys())
+
+    print('Splitting {} categories over {} locations'.format(
+        len(category_ids),len(location_ids)))
+
+    # random_seed = 0
+    def compute_seed_errors(random_seed):
+        """
+        Compute the per-category error for a specific random seed.
+
+        returns weighted_average_error,weighted_category_errors,category_to_val_fraction
+        """
+
+        # Randomly split into train/val
+        random.seed(random_seed)
+        val_locations = random.sample(location_ids,k=n_val_locations)
+        val_locations_set = set(val_locations)
+
+        # For each category, measure the % of images that went into the val set
+        category_to_val_fraction = defaultdict(float)
+
+        for category_id in category_ids:
+            category_val_count = 0
+            category_train_count = 0
+            for location_id in location_to_category_counts:
+                if category_id not in location_to_category_counts[location_id]:
+                    location_category_count = 0
+                else:
+                    location_category_count = location_to_category_counts[location_id][category_id]
+                if location_id in val_locations_set:
+                    category_val_count += location_category_count
+                else:
+                    category_train_count += location_category_count
+            category_val_fraction = category_val_count / (category_val_count + category_train_count)
+            category_to_val_fraction[category_id] = category_val_fraction
+
+        # Absolute deviation from the target val fraction for each category
+        category_errors = {}
+        weighted_category_errors = {}
+
+        # category = next(iter(category_to_val_fraction))
+        for category in category_to_val_fraction:
+
+            category_val_fraction = category_to_val_fraction[category]
+
+            category_error = abs(category_val_fraction-target_val_fraction)
+            category_errors[category] = category_error
+
+            category_weight = 1.0
+            if category in category_to_error_weight:
+                category_weight = category_to_error_weight[category]
+            weighted_category_error = category_error * category_weight
+            weighted_category_errors[category] = weighted_category_error
+
+        weighted_average_error = np.mean(list(weighted_category_errors.values()))
+
+        return weighted_average_error,weighted_category_errors,category_to_val_fraction
+
+    # ...def compute_seed_errors(...)
+
+    # This will only include random seeds that satisfy the hard constraints
+    random_seed_to_weighted_average_error = {}
+
+    # random_seed = 0
+    for random_seed in tqdm(range(0,n_random_seeds)):
+
+        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+            compute_seed_errors(random_seed)
+
+        seed_satisfies_hard_constraints = True
+
+        for category in category_to_val_fraction:
+            if category in category_to_max_allowable_error:
+                max_allowable_error = category_to_max_allowable_error[category]
+            else:
+                if default_max_allowable_error is None:
+                    continue
+                max_allowable_error = default_max_allowable_error
+            val_fraction = category_to_val_fraction[category]
+            category_error = abs(val_fraction - target_val_fraction)
+            if category_error > max_allowable_error:
+                seed_satisfies_hard_constraints = False
+                break
+
+        if seed_satisfies_hard_constraints:
+            random_seed_to_weighted_average_error[random_seed] = weighted_average_error
+
+    # ...for each random seed
+
+    assert len(random_seed_to_weighted_average_error) > 0, \
+        'No random seed met all the hard constraints'
+
+    print('\n{} of {} random seeds satisfied hard constraints'.format(
+        len(random_seed_to_weighted_average_error),n_random_seeds))
+
+    min_error = None
+    min_error_seed = None
+
+    for random_seed in random_seed_to_weighted_average_error.keys():
+        error_metric = random_seed_to_weighted_average_error[random_seed]
+        if min_error is None or error_metric < min_error:
+            min_error = error_metric
+            min_error_seed = random_seed
+
+    random.seed(min_error_seed)
+    val_locations = random.sample(location_ids,k=n_val_locations)
+    train_locations = []
+    for location_id in location_ids:
+        if location_id not in val_locations:
+            train_locations.append(location_id)
+
+    print('\nVal locations:\n')
+    for loc in val_locations:
+        print('{}'.format(loc))
+    print('')
+
+    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+        compute_seed_errors(min_error_seed)
+
+    random_seed = min_error_seed
+
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
+                                                        sort_values=category_id_to_count,
+                                                        reverse=True)
+
+
+    print('Val fractions by category:\n')
+
+    for category in category_to_val_fraction:
+        print('{} ({}) {:.2f}'.format(
+            category,category_id_to_count[category],
+            category_to_val_fraction[category]))
+
+    return val_locations,category_to_val_fraction
+
+# ...def split_locations_into_train_val(...)
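A usage sketch for the new function; the location IDs, category names, and counts below are invented for illustration:

from md_utils.split_locations_into_train_val import split_locations_into_train_val

# Hypothetical per-location image counts for two categories; categories
# missing from a location's dict are treated as zero
location_to_category_counts = {
    'loc_00': {'deer': 100, 'boar': 10},
    'loc_01': {'deer': 50},
    'loc_02': {'deer': 40, 'boar': 5},
    'loc_03': {'boar': 30},
    'loc_04': {'deer': 60, 'boar': 20},
    'loc_05': {'deer': 25},
    'loc_06': {'deer': 80, 'boar': 15}
}

val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=1000,
    target_val_fraction=0.15,
    category_to_max_allowable_error={'deer': 0.05},  # hard constraint for 'deer'
    category_to_error_weight={'boar': 2.0},          # 'boar' errors count double
    default_max_allowable_error=0.2)

Note that the per-category constraints are a hard filter: if no seed satisfies them, the function asserts, so small datasets may need a looser default_max_allowable_error or a larger n_random_seeds.
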
md_utils/string_utils.py CHANGED
@@ -57,3 +57,13 @@ def human_readable_to_bytes(size):
         bytes = 0
 
     return bytes
+
+
+def remove_ansi_codes(s):
+    """
+    Remove ANSI escape codes from a string.
+
+    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
+    """
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    return ansi_escape.sub('', s)
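A quick sketch of the new helper (this assumes string_utils.py imports re at the top of the file, which is outside this hunk):

from md_utils.string_utils import remove_ansi_codes

# Strip color codes from captured console output, e.g. output captured
# from a child process via process_utils.execute_and_print
colored = '\x1b[31mError:\x1b[0m something went wrong'
print(remove_ansi_codes(colored))  # prints 'Error: something went wrong'
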
md_utils/url_utils.py CHANGED
@@ -140,5 +140,3 @@ def test_urls(urls, error_on_failure=True):
 
     return status_codes
 
-
-
md_utils/write_html_image_list.py CHANGED
@@ -177,6 +177,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         filename = filename.encode('ascii','ignore').decode('ascii')
 
     if options['urlEncodeFilenames']:
+        filename = filename.replace('\\','/')
         filename = urllib.parse.quote(filename)
 
     if len(title) > 0:
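A hedged usage sketch of the behavior this hunk changes; the image paths are hypothetical, and the no-argument call returning a default options dict is the same idiom DbVizOptions uses below:

from md_utils.write_html_image_list import write_html_image_list

options = write_html_image_list()  # no-arg call returns the default options dict
options['urlEncodeFilenames'] = True

# Windows-style paths now get their backslashes normalized to forward
# slashes before URL-encoding
images = ['images\\site_01\\IMG_0001.JPG', 'images\\site_01\\IMG_0002.JPG']
write_html_image_list(filename='preview.html', images=images, options=options)
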
md_visualization/visualization_utils.py CHANGED
@@ -172,12 +172,20 @@ def resize_image(image, target_width, target_height=-1, output_file=None):
     in place. If either width or height is -1, resizes with aspect ratio preservation.
     If both are -1, returns the original image (does not copy in this case).
 
+    None is equivalent to -1 for target_width and target_height.
+
     [image] can be a PIL image or a filename.
     """
 
    if isinstance(image,str):
        image = load_image(image)
 
+    if target_width is None:
+        target_width = -1
+
+    if target_height is None:
+        target_height = -1
+
     # Null operation
     if target_width == -1 and target_height == -1:
         return image
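A minimal sketch of the None-handling change (the filenames are hypothetical):

from md_visualization.visualization_utils import resize_image

# target_height=None now behaves like -1: resize to width 800 and
# preserve the aspect ratio
img = resize_image('example.jpg', target_width=800, target_height=None)
img.save('example_800w.jpg')
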
@@ -371,7 +379,8 @@ def render_detection_bounding_boxes(detections, image,
         The type of the numerical label (default string) needs to be consistent with the keys in
         label_map; no casting is carried out. If this is None, no classification labels are shown.
 
-        confidence_threshold: optional, threshold above which the bounding box is rendered.
+        confidence_threshold: optional, threshold above which boxes are rendered. Can also be a dictionary
+        mapping category IDs to thresholds.
 
         thickness: line thickness in pixels. Default value is 4.
 
@@ -405,9 +414,15 @@
 
         score = detection['conf']
 
+        if isinstance(confidence_threshold,dict):
+            rendering_threshold = confidence_threshold[detection['category']]
+        else:
+            rendering_threshold = confidence_threshold
+
+
         # Always render objects with a confidence of "None"; this is typically used
         # for ground truth data.
-        if score is None or score >= confidence_threshold:
+        if score is None or score >= rendering_threshold:
 
             x1, y1, w_box, h_box = detection['bbox']
             display_boxes.append([y1, x1, y1 + h_box, x1 + w_box])
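A sketch of the new per-category threshold support, using MegaDetector-style category IDs ('1' = animal, '2' = person); the image path and detections are hypothetical. Note that when a dict is passed, the lookup in the hunk above raises a KeyError for any category ID not present in the dict, so it should cover every category that appears in the detections:

from md_visualization import visualization_utils as vis_utils

image = vis_utils.load_image('example.jpg')

# MD-formatted detections: normalized [x, y, width, height] boxes
detections = [
    {'category': '1', 'conf': 0.90, 'bbox': [0.10, 0.10, 0.30, 0.30]},
    {'category': '2', 'conf': 0.25, 'bbox': [0.55, 0.50, 0.20, 0.25]}
]

# Render animals only above 0.8, but people above 0.2
per_category_thresholds = {'1': 0.8, '2': 0.2, '3': 0.5}

vis_utils.render_detection_bounding_boxes(detections, image,
                                          confidence_threshold=per_category_thresholds)
image.save('example_annotated.jpg')
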
md_visualization/visualize_db.py CHANGED
@@ -41,7 +41,15 @@ class DbVizOptions:
     #
     # If viz_size is None or (-1,-1), the original image size is used.
     viz_size = (675, -1)
+
+    # The most relevant option one might want to set here is:
+    #
+    # htmlOptions['maxFiguresPerHtmlFile']
+    #
+    # ...which can be used to paginate previews to a number of images that will load well
+    # in a browser (5000 is a reasonable limit).
     htmlOptions = write_html_image_list()
+
     sort_by_filename = True
     trim_to_images_with_bboxes = False
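To illustrate the pagination option called out in the new comment, a minimal sketch; how the options object is consumed downstream is outside this hunk:

from md_visualization.visualize_db import DbVizOptions

options = DbVizOptions()

# Paginate the HTML preview so each page stays browser-friendly
options.htmlOptions['maxFiguresPerHtmlFile'] = 5000

# Render images at width 675, preserving aspect ratio
options.viz_size = (675, -1)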