megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic.

Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
md_utils/split_locations_into_train_val.py ADDED
@@ -0,0 +1,215 @@
+########
+#
+# split_locations_into_train_val.py
+#
+# Split a list of location IDs into training and validation, targeting a specific
+# train/val split for each category, but allowing some categories to be tighter or looser
+# than others. Does nothing particularly clever, just randomly splits locations into
+# train/val lots of times using the target val fraction, and picks the one that meets the
+# specified constraints and minimizes weighted error, where "error" is defined as the
+# sum of each class's absolute divergence from the target val fraction.
+#
+########
+
+#%% Imports/constants
+
+import random
+import numpy as np
+
+from collections import defaultdict
+from md_utils.ct_utils import sort_dictionary_by_value
+from tqdm import tqdm
+
+
+#%% Main function
+
+def split_locations_into_train_val(location_to_category_counts,
+                                   n_random_seeds=10000,
+                                   target_val_fraction=0.15,
+                                   category_to_max_allowable_error=None,
+                                   category_to_error_weight=None,
+                                   default_max_allowable_error=0.1):
+    """
+    Split a list of location IDs into training and validation, targeting a specific
+    train/val split for each category, but allowing some categories to be tighter or looser
+    than others. Does nothing particularly clever, just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
+    specified constraints and minimizes weighted error, where "error" is defined as the
+    sum of each class's absolute divergence from the target val fraction.
+
+    location_to_category_counts should be a dict mapping location IDs to dicts,
+    with each dict mapping a category name to a count. Any categories not present in a
+    particular dict are assumed to have a count of zero for that location.
+
+    If not None, category_to_max_allowable_error should be a dict mapping category names
+    to maximum allowable errors. These are hard constraints, but you can specify a subset
+    of categories. Categories not included here have a maximum error of Inf.
+
+    If not None, category_to_error_weight should be a dict mapping category names to
+    error weights. You can specify a subset of categories. Categories not included here
+    have a weight of 1.0.
+
+    default_max_allowable_error is the maximum allowable error for categories not present in
+    category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
+    categories not present in category_to_max_allowable_error.
+
+    returns val_locations,category_to_val_fraction
+
+    """
+
+    location_ids = list(location_to_category_counts.keys())
+
+    n_val_locations = int(target_val_fraction*len(location_ids))
+
+    if category_to_max_allowable_error is None:
+        category_to_max_allowable_error = {}
+
+    if category_to_error_weight is None:
+        category_to_error_weight = {}
+
+    # category ID to total count; the total count is used only for printouts
+    category_id_to_count = {}
+    for location_id in location_to_category_counts:
+        for category_id in location_to_category_counts[location_id].keys():
+            if category_id not in category_id_to_count:
+                category_id_to_count[category_id] = 0
+            category_id_to_count[category_id] += \
+                location_to_category_counts[location_id][category_id]
+
+    category_ids = set(category_id_to_count.keys())
+
+    print('Splitting {} categories over {} locations'.format(
+        len(category_ids),len(location_ids)))
+
+    # random_seed = 0
+    def compute_seed_errors(random_seed):
+        """
+        Compute the per-category error for a specific random seed.
+
+        returns weighted_average_error,weighted_category_errors,category_to_val_fraction
+        """
+
+        # Randomly split into train/val
+        random.seed(random_seed)
+        val_locations = random.sample(location_ids,k=n_val_locations)
+        val_locations_set = set(val_locations)
+
+        # For each category, measure the % of images that went into the val set
+        category_to_val_fraction = defaultdict(float)
+
+        for category_id in category_ids:
+            category_val_count = 0
+            category_train_count = 0
+            for location_id in location_to_category_counts:
+                if category_id not in location_to_category_counts[location_id]:
+                    location_category_count = 0
+                else:
+                    location_category_count = location_to_category_counts[location_id][category_id]
+                if location_id in val_locations_set:
+                    category_val_count += location_category_count
+                else:
+                    category_train_count += location_category_count
+            category_val_fraction = category_val_count / (category_val_count + category_train_count)
+            category_to_val_fraction[category_id] = category_val_fraction
+
+        # Absolute deviation from the target val fraction for each category
+        category_errors = {}
+        weighted_category_errors = {}
+
+        # category = next(iter(category_to_val_fraction))
+        for category in category_to_val_fraction:
+
+            category_val_fraction = category_to_val_fraction[category]
+
+            category_error = abs(category_val_fraction-target_val_fraction)
+            category_errors[category] = category_error
+
+            category_weight = 1.0
+            if category in category_to_error_weight:
+                category_weight = category_to_error_weight[category]
+            weighted_category_error = category_error * category_weight
+            weighted_category_errors[category] = weighted_category_error
+
+        weighted_average_error = np.mean(list(weighted_category_errors.values()))
+
+        return weighted_average_error,weighted_category_errors,category_to_val_fraction
+
+    # ... def compute_seed_errors(...)
+
+    # This will only include random seeds that satisfy the hard constraints
+    random_seed_to_weighted_average_error = {}
+
+    # random_seed = 0
+    for random_seed in tqdm(range(0,n_random_seeds)):
+
+        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+            compute_seed_errors(random_seed)
+
+        seed_satisfies_hard_constraints = True
+
+        for category in category_to_val_fraction:
+            if category in category_to_max_allowable_error:
+                max_allowable_error = category_to_max_allowable_error[category]
+            else:
+                if default_max_allowable_error is None:
+                    continue
+                max_allowable_error = default_max_allowable_error
+            val_fraction = category_to_val_fraction[category]
+            category_error = abs(val_fraction - target_val_fraction)
+            if category_error > max_allowable_error:
+                seed_satisfies_hard_constraints = False
+                break
+
+        if seed_satisfies_hard_constraints:
+            random_seed_to_weighted_average_error[random_seed] = weighted_average_error
+
+    # ...for each random seed
+
+    assert len(random_seed_to_weighted_average_error) > 0, \
+        'No random seed met all the hard constraints'
+
+    print('\n{} of {} random seeds satisfied hard constraints'.format(
+        len(random_seed_to_weighted_average_error),n_random_seeds))
+
+    min_error = None
+    min_error_seed = None
+
+    for random_seed in random_seed_to_weighted_average_error.keys():
+        error_metric = random_seed_to_weighted_average_error[random_seed]
+        if min_error is None or error_metric < min_error:
+            min_error = error_metric
+            min_error_seed = random_seed
+
+    random.seed(min_error_seed)
+    val_locations = random.sample(location_ids,k=n_val_locations)
+    train_locations = []
+    for location_id in location_ids:
+        if location_id not in val_locations:
+            train_locations.append(location_id)
+
+    print('\nVal locations:\n')
+    for loc in val_locations:
+        print('{}'.format(loc))
+    print('')
+
+    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+        compute_seed_errors(min_error_seed)
+
+    random_seed = min_error_seed
+
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
+                                                        sort_values=category_id_to_count,
+                                                        reverse=True)
+
+
+    print('Val fractions by category:\n')
+
+    for category in category_to_val_fraction:
+        print('{} ({}) {:.2f}'.format(
+            category,category_id_to_count[category],
+            category_to_val_fraction[category]))
+
+    return val_locations,category_to_val_fraction
+
+# ...def split_locations_into_train_val(...)
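
For reference, a minimal usage sketch of the new splitter. The location IDs, category names, counts, constraints, and weights below are all invented for illustration; only the function itself comes from the diff above.

import random
from md_utils.split_locations_into_train_val import split_locations_into_train_val

# Build a hypothetical dataset: 100 locations with random per-category image counts
random.seed(0)
location_to_category_counts = {}
for i_location in range(100):
    counts = {'deer': random.randint(0, 200), 'empty': random.randint(0, 500)}
    if random.random() < 0.2:
        # 'fox' is deliberately rare, so it benefits from a hard constraint
        counts['fox'] = random.randint(1, 20)
    location_to_category_counts['loc_{:03d}'.format(i_location)] = counts

val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=2000,
    target_val_fraction=0.15,
    # Hard constraint: the rare 'fox' class must land within +/- 0.05 of the target
    category_to_max_allowable_error={'fox': 0.05},
    # Soft preference: weight 'deer' errors twice as heavily when scoring candidate splits
    category_to_error_weight={'deer': 2.0})

Because candidate splits are generated from a fixed range of seeds and the best seed is chosen deterministically, re-running with the same inputs reproduces the same split.
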
md_utils/string_utils.py CHANGED
@@ -57,3 +57,13 @@ def human_readable_to_bytes(size):
         bytes = 0
 
     return bytes
+
+
+def remove_ansi_codes(s):
+    """
+    Remove ANSI escape codes from a string.
+
+    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
+    """
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    return ansi_escape.sub('', s)
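
A quick sanity check for the new helper (the example string is made up; the function is the one added above):

from md_utils.string_utils import remove_ansi_codes

# '\x1b[31m' switches terminal output to red, '\x1b[0m' resets formatting
colored = '\x1b[31mProcessing failed\x1b[0m'
print(remove_ansi_codes(colored))  # prints: Processing failed
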
md_utils/url_utils.py CHANGED
@@ -16,6 +16,7 @@ import requests
 
 from tqdm import tqdm
 from urllib.parse import urlparse
+from multiprocessing.pool import ThreadPool
 
 url_utils_temp_dir = None
 max_path_len = 255
@@ -109,7 +110,14 @@ def download_url(url, destination_filename=None, progress_updater=None,
 
 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path
+    Download a URL to output_base, preserving relative path. Path is relative to
+    the site, so:
+
+    https://abc.com/xyz/123.txt
+
+    ...will get downloaded to:
+
+    output_base/xyz/123.txt
     """
 
     p = urlparse(url)
@@ -119,6 +127,63 @@
     download_url(url, destination_filename, verbose=verbose)
 
 
+def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
+                           n_workers=20):
+    """
+    Download a list of URLs to local files. url_to_target_file should
+    be a dict mapping URLs to output files. Catches exceptions and reports
+    them in the returned "results" array.
+    """
+
+    def _do_parallelized_download(download_info,overwrite=False):
+        url = download_info['url']
+        target_file = download_info['target_file']
+        result = {'status':'unknown','url':url,'target_file':target_file}
+
+        if ((os.path.isfile(target_file)) and (not overwrite)):
+            result['status'] = 'skipped'
+            return result
+        try:
+            download_url(url=url,
+                         destination_filename=target_file,
+                         verbose=verbose, force_download=overwrite)
+        except Exception as e:
+            print('Warning: error downloading URL {}: {}'.format(
+                url,str(e)))
+            result['status'] = 'error: {}'.format(str(e))
+            return result
+
+        result['status'] = 'success'
+        return result
+
+    all_download_info = []
+    for url in url_to_target_file:
+        download_info = {}
+        download_info['url'] = url
+        download_info['target_file'] = url_to_target_file[url]
+        all_download_info.append(download_info)
+
+    print('Downloading {} images on {} workers'.format(
+        len(all_download_info),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+
+        for download_info in tqdm(all_download_info):
+            result = _do_parallelized_download(download_info,overwrite=overwrite)
+            results.append(result)
+
+    else:
+
+        pool = ThreadPool(n_workers)
+        results = list(tqdm(pool.imap(lambda download_info: _do_parallelized_download(
+            download_info,overwrite=overwrite),all_download_info),
+            total=len(all_download_info)))
+
+    return results
+
+
 def test_urls(urls, error_on_failure=True):
     """
     Verify that a list of URLs is available (returns status 200). By default,
@@ -140,5 +205,3 @@
 
     return status_codes
 
-
-
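
A minimal usage sketch for the new parallel downloader; the URLs and local paths below are placeholders, and only parallel_download_urls itself comes from the diff above:

from md_utils.url_utils import parallel_download_urls

# Placeholder URL -> local-path mapping; any {url: target_file} dict works
url_to_target_file = {
    'https://example.com/images/0001.jpg': '/tmp/downloads/images/0001.jpg',
    'https://example.com/images/0002.jpg': '/tmp/downloads/images/0002.jpg'
}

results = parallel_download_urls(url_to_target_file, n_workers=10, overwrite=False)

errors = [r for r in results if r['status'].startswith('error')]
print('{} of {} downloads failed'.format(len(errors), len(results)))

Existing files are skipped unless overwrite=True, and per-URL failures are recorded in the returned result dicts rather than raised.
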
md_utils/write_html_image_list.py CHANGED
@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         defaultImageStyle
         maxFiguresPerHtmlFile
         urlEncodeFilenames (default True, e.g. '#' will be replaced by '%23')
+        urlEncodeLinkTargets (default True, e.g. '#' will be replaced by '%23')
 
     """
 
@@ -68,7 +69,10 @@
 
    if 'urlEncodeFilenames' not in options or options['urlEncodeFilenames'] is None:
        options['urlEncodeFilenames'] = True
-
+
+    if 'urlEncodeLinkTargets' not in options or options['urlEncodeLinkTargets'] is None:
+        options['urlEncodeLinkTargets'] = True
+
     # Possibly split the html output for figures into multiple files; Chrome gets sad with
     # thousands of images in a single tab.
     if 'maxFiguresPerHtmlFile' not in options or options['maxFiguresPerHtmlFile'] is None:
@@ -176,7 +180,8 @@
            title = title.encode('ascii','ignore').decode('ascii')
            filename = filename.encode('ascii','ignore').decode('ascii')
 
-        if options['urlEncodeFilenames']:
+        filename = filename.replace('\\','/')
+        if options['urlEncodeFilenames']:
             filename = urllib.parse.quote(filename)
 
         if len(title) > 0:
@@ -184,6 +189,11 @@
                '<p style="{}">{}</p>\n'\
                .format(textStyle,title))
 
+        linkTarget = linkTarget.replace('\\','/')
+        if options['urlEncodeLinkTargets']:
+            # These are typically absolute paths, so we only want to mess with certain characters
+            linkTarget = urllib.parse.quote(linkTarget,safe=':/')
+
         if len(linkTarget) > 0:
             fHtml.write('<a href="{}">'.format(linkTarget))
             # imageStyle.append(';border:0px;')
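
For context, the effect of the new link-target handling can be reproduced directly with urllib; the path below is made up:

import urllib.parse

# Backslashes are normalized to forward slashes first, then everything except
# ':' and '/' is percent-encoded, so drive letters and path separators survive
# while spaces and '#' are escaped.
link_target = 'C:\\images\\img #1.jpg'.replace('\\', '/')
print(urllib.parse.quote(link_target, safe=':/'))
# prints: C:/images/img%20%231.jpg
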