megadetector-5.0.11-py3-none-any.whl → megadetector-5.0.12-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of megadetector has been flagged as a potentially problematic release.
Files changed (201)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +98 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +152 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +92 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +126 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +610 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +239 -0
  58. megadetector/data_management/cct_json_utils.py +395 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +272 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +477 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +796 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +874 -0
  129. megadetector/data_management/read_exif.py +681 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/resize_coco_dataset.py +189 -0
  133. megadetector/data_management/wi_download_csv_to_coco.py +246 -0
  134. megadetector/data_management/yolo_output_to_md_output.py +441 -0
  135. megadetector/data_management/yolo_to_coco.py +676 -0
  136. megadetector/detection/__init__.py +0 -0
  137. megadetector/detection/detector_training/__init__.py +0 -0
  138. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  139. megadetector/detection/process_video.py +702 -0
  140. megadetector/detection/pytorch_detector.py +341 -0
  141. megadetector/detection/run_detector.py +779 -0
  142. megadetector/detection/run_detector_batch.py +1219 -0
  143. megadetector/detection/run_inference_with_yolov5_val.py +917 -0
  144. megadetector/detection/run_tiled_inference.py +934 -0
  145. megadetector/detection/tf_detector.py +189 -0
  146. megadetector/detection/video_utils.py +606 -0
  147. megadetector/postprocessing/__init__.py +0 -0
  148. megadetector/postprocessing/add_max_conf.py +64 -0
  149. megadetector/postprocessing/categorize_detections_by_size.py +163 -0
  150. megadetector/postprocessing/combine_api_outputs.py +249 -0
  151. megadetector/postprocessing/compare_batch_results.py +958 -0
  152. megadetector/postprocessing/convert_output_format.py +396 -0
  153. megadetector/postprocessing/load_api_results.py +195 -0
  154. megadetector/postprocessing/md_to_coco.py +310 -0
  155. megadetector/postprocessing/md_to_labelme.py +330 -0
  156. megadetector/postprocessing/merge_detections.py +401 -0
  157. megadetector/postprocessing/postprocess_batch_results.py +1902 -0
  158. megadetector/postprocessing/remap_detection_categories.py +170 -0
  159. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  160. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  161. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  162. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1631 -0
  163. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  164. megadetector/postprocessing/subset_json_detector_output.py +696 -0
  165. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  166. megadetector/taxonomy_mapping/__init__.py +0 -0
  167. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  168. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  169. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  170. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +590 -0
  171. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  172. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  173. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  174. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  175. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  176. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  177. megadetector/utils/__init__.py +0 -0
  178. megadetector/utils/azure_utils.py +178 -0
  179. megadetector/utils/ct_utils.py +612 -0
  180. megadetector/utils/directory_listing.py +246 -0
  181. megadetector/utils/md_tests.py +968 -0
  182. megadetector/utils/path_utils.py +1044 -0
  183. megadetector/utils/process_utils.py +157 -0
  184. megadetector/utils/sas_blob_utils.py +509 -0
  185. megadetector/utils/split_locations_into_train_val.py +228 -0
  186. megadetector/utils/string_utils.py +92 -0
  187. megadetector/utils/url_utils.py +323 -0
  188. megadetector/utils/write_html_image_list.py +225 -0
  189. megadetector/visualization/__init__.py +0 -0
  190. megadetector/visualization/plot_utils.py +293 -0
  191. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  192. megadetector/visualization/visualization_utils.py +1536 -0
  193. megadetector/visualization/visualize_db.py +550 -0
  194. megadetector/visualization/visualize_detector_output.py +405 -0
  195. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/METADATA +1 -1
  196. megadetector-5.0.12.dist-info/RECORD +199 -0
  197. megadetector-5.0.12.dist-info/top_level.txt +1 -0
  198. megadetector-5.0.11.dist-info/RECORD +0 -5
  199. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  200. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/LICENSE +0 -0
  201. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/WHEEL +0 -0
megadetector/utils/split_locations_into_train_val.py
@@ -0,0 +1,228 @@

"""

split_locations_into_train_val.py

Splits a list of location IDs into training and validation, targeting a specific
train/val split for each category, but allowing some categories to be tighter or looser
than others. Does nothing particularly clever, just randomly splits locations into
train/val lots of times using the target val fraction, and picks the one that meets the
specified constraints and minimizes weighted error, where "error" is defined as the
sum of each class's absolute divergence from the target val fraction.

"""

#%% Imports/constants

import random
import numpy as np

from collections import defaultdict
from megadetector.utils.ct_utils import sort_dictionary_by_value
from tqdm import tqdm


#%% Main function

def split_locations_into_train_val(location_to_category_counts,
                                   n_random_seeds=10000,
                                   target_val_fraction=0.15,
                                   category_to_max_allowable_error=None,
                                   category_to_error_weight=None,
                                   default_max_allowable_error=0.1):
    """
    Splits a list of location IDs into training and validation, targeting a specific
    train/val split for each category, but allowing some categories to be tighter or
    looser than others. Does nothing particularly clever, just randomly splits locations
    into train/val lots of times using the target val fraction, and picks the one that
    meets the specified constraints and minimizes weighted error, where "error" is
    defined as the sum of each class's absolute divergence from the target val fraction.

    Args:
        location_to_category_counts (dict): a dict mapping location IDs to dicts,
            with each dict mapping a category name to a count. Any categories not
            present in a particular dict are assumed to have a count of zero for
            that location.

            For example:

            .. code-block:: none

                {'location-000': {'bear':4,'wolf':10},
                 'location-001': {'bear':12,'elk':20}}

        n_random_seeds (int, optional): number of random seeds to try, always
            starting from zero
        target_val_fraction (float, optional): fraction of images containing each
            species we'd like to put in the val split
        category_to_max_allowable_error (dict, optional): a dict mapping category names
            to maximum allowable errors. These are hard constraints (i.e., we will error
            if we can't meet them). Does not need to include all categories; categories
            not included will be assigned a maximum error according to
            [default_max_allowable_error]. If this is None, no hard constraints are
            applied.
        category_to_error_weight (dict, optional): a dict mapping category names to
            error weights. You can specify a subset of categories; categories not
            included here have a weight of 1.0. If None, all categories have the
            same weight.
        default_max_allowable_error (float, optional): the maximum allowable error for
            categories not present in [category_to_max_allowable_error]. Set to None
            (or >= 1.0) to disable hard constraints for categories not present in
            [category_to_max_allowable_error].

    Returns:
        tuple: a two-element tuple:
            - list of location IDs in the val split
            - a dict mapping category names to the fraction of images in the val split
    """

    location_ids = list(location_to_category_counts.keys())

    n_val_locations = int(target_val_fraction*len(location_ids))

    if category_to_max_allowable_error is None:
        category_to_max_allowable_error = {}

    if category_to_error_weight is None:
        category_to_error_weight = {}

    # Category ID to total count; the total count is used only for printouts
    category_id_to_count = {}
    for location_id in location_to_category_counts:
        for category_id in location_to_category_counts[location_id].keys():
            if category_id not in category_id_to_count:
                category_id_to_count[category_id] = 0
            category_id_to_count[category_id] += \
                location_to_category_counts[location_id][category_id]

    category_ids = set(category_id_to_count.keys())

    print('Splitting {} categories over {} locations'.format(
        len(category_ids),len(location_ids)))

    # random_seed = 0
    def compute_seed_errors(random_seed):
        """
        Computes the per-category error for a specific random seed.

        Returns weighted_average_error, weighted_category_errors, category_to_val_fraction.
        """

        # Randomly split into train/val
        random.seed(random_seed)
        val_locations = random.sample(location_ids,k=n_val_locations)
        val_locations_set = set(val_locations)

        # For each category, measure the fraction of images that went into the val set
        category_to_val_fraction = defaultdict(float)

        for category_id in category_ids:
            category_val_count = 0
            category_train_count = 0
            for location_id in location_to_category_counts:
                if category_id not in location_to_category_counts[location_id]:
                    location_category_count = 0
                else:
                    location_category_count = location_to_category_counts[location_id][category_id]
                if location_id in val_locations_set:
                    category_val_count += location_category_count
                else:
                    category_train_count += location_category_count
            category_val_fraction = category_val_count / (category_val_count + category_train_count)
            category_to_val_fraction[category_id] = category_val_fraction

        # Absolute deviation from the target val fraction for each category
        category_errors = {}
        weighted_category_errors = {}

        # category = next(iter(category_to_val_fraction))
        for category in category_to_val_fraction:

            category_val_fraction = category_to_val_fraction[category]

            category_error = abs(category_val_fraction-target_val_fraction)
            category_errors[category] = category_error

            category_weight = 1.0
            if category in category_to_error_weight:
                category_weight = category_to_error_weight[category]
            weighted_category_error = category_error * category_weight
            weighted_category_errors[category] = weighted_category_error

        weighted_average_error = np.mean(list(weighted_category_errors.values()))

        return weighted_average_error,weighted_category_errors,category_to_val_fraction

    # ...def compute_seed_errors(...)

    # This will only include random seeds that satisfy the hard constraints
    random_seed_to_weighted_average_error = {}

    # random_seed = 0
    for random_seed in tqdm(range(0,n_random_seeds)):

        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
            compute_seed_errors(random_seed)

        seed_satisfies_hard_constraints = True

        for category in category_to_val_fraction:
            if category in category_to_max_allowable_error:
                max_allowable_error = category_to_max_allowable_error[category]
            else:
                if default_max_allowable_error is None:
                    continue
                max_allowable_error = default_max_allowable_error
            val_fraction = category_to_val_fraction[category]
            category_error = abs(val_fraction - target_val_fraction)
            if category_error > max_allowable_error:
                seed_satisfies_hard_constraints = False
                break

        if seed_satisfies_hard_constraints:
            random_seed_to_weighted_average_error[random_seed] = weighted_average_error

    # ...for each random seed

    assert len(random_seed_to_weighted_average_error) > 0, \
        'No random seed met all the hard constraints'

    print('\n{} of {} random seeds satisfied hard constraints'.format(
        len(random_seed_to_weighted_average_error),n_random_seeds))

    min_error = None
    min_error_seed = None

    for random_seed in random_seed_to_weighted_average_error.keys():
        error_metric = random_seed_to_weighted_average_error[random_seed]
        if min_error is None or error_metric < min_error:
            min_error = error_metric
            min_error_seed = random_seed

    random.seed(min_error_seed)
    val_locations = random.sample(location_ids,k=n_val_locations)
    train_locations = []
    for location_id in location_ids:
        if location_id not in val_locations:
            train_locations.append(location_id)

    print('\nVal locations:\n')
    for loc in val_locations:
        print('{}'.format(loc))
    print('')

    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
        compute_seed_errors(min_error_seed)

    random_seed = min_error_seed

    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
                                                        sort_values=category_id_to_count,
                                                        reverse=True)

    print('Val fractions by category:\n')

    for category in category_to_val_fraction:
        print('{} ({}) {:.2f}'.format(
            category,category_id_to_count[category],
            category_to_val_fraction[category]))

    return val_locations,category_to_val_fraction

# ...def split_locations_into_train_val(...)
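
For reference, here is a minimal usage sketch of the function above; the location IDs, category counts, and parameter choices are made up for illustration. With only a handful of locations, per-category hard constraints are unlikely to be satisfiable, so this sketch disables them via default_max_allowable_error=None:

    from megadetector.utils.split_locations_into_train_val import \
        split_locations_into_train_val

    # Hypothetical per-location category counts
    location_to_category_counts = {
        'location-000': {'bear': 4, 'wolf': 10},
        'location-001': {'bear': 12, 'elk': 20},
        'location-002': {'wolf': 7, 'elk': 3},
        'location-003': {'bear': 5, 'wolf': 2, 'elk': 9}
    }

    # Weight errors on 'wolf' twice as heavily as on other categories, and
    # disable the hard constraints (int(0.25 * 4) = 1 location goes to val)
    val_locations, category_to_val_fraction = split_locations_into_train_val(
        location_to_category_counts,
        n_random_seeds=100,
        target_val_fraction=0.25,
        category_to_error_weight={'wolf': 2.0},
        default_max_allowable_error=None)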
megadetector/utils/string_utils.py
@@ -0,0 +1,92 @@

"""

string_utils.py

Miscellaneous string utilities.

"""

#%% Imports

import re


#%% Functions

def is_float(s):
    """
    Checks whether [s] is an object (typically a string) that can be cast to a float.

    Args:
        s (object): object to evaluate

    Returns:
        bool: True if s successfully casts to a float, otherwise False
    """

    try:
        _ = float(s)
    except ValueError:
        return False
    return True


def human_readable_to_bytes(size):
    """
    Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
    returns the number of bytes. Will return 0 if the argument has
    unexpected form.

    https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0

    Args:
        size (str): string representing a size

    Returns:
        int or float: the corresponding size in bytes
    """

    size = re.sub(r'\s+', '', size)

    if (size[-1] == 'B'):
        size = size[:-1]

    if (size.isdigit()):
        bytes = int(size)
    elif (is_float(size)):
        bytes = float(size)
    else:
        bytes = size[:-1]
        unit = size[-1]
        try:
            bytes = float(bytes)
            if (unit == 'T'):
                bytes *= 1024*1024*1024*1024
            elif (unit == 'G'):
                bytes *= 1024*1024*1024
            elif (unit == 'M'):
                bytes *= 1024*1024
            elif (unit == 'K'):
                bytes *= 1024
            else:
                bytes = 0
        except ValueError:
            bytes = 0

    return bytes


def remove_ansi_codes(s):
    """
    Removes ANSI escape codes from a string.

    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789

    Args:
        s (str): the string to de-ANSI-i-fy

    Returns:
        str: a copy of [s] without ANSI codes
    """

    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape.sub('', s)
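
A few illustrative calls follow; the values are arbitrary, chosen to show the unit handling and the fallback-to-zero behavior:

    from megadetector.utils.string_utils import \
        is_float, human_readable_to_bytes, remove_ansi_codes

    assert is_float('3.14')
    assert not is_float('3.14.15')

    # A trailing 'B' is optional, units are powers of 1024, and
    # unrecognized strings fall back to 0
    assert human_readable_to_bytes('2GB') == 2 * 1024 * 1024 * 1024
    assert human_readable_to_bytes('20KB') == 20 * 1024
    assert human_readable_to_bytes('banana') == 0

    # Strip terminal color codes from captured console output
    assert remove_ansi_codes('\x1b[31mred\x1b[0m') == 'red'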
megadetector/utils/url_utils.py
@@ -0,0 +1,323 @@

"""

url_utils.py

Frequently-used functions for downloading or manipulating URLs.

"""

#%% Imports and constants

import os
import re
import urllib.request
import tempfile
import requests

from functools import partial
from tqdm import tqdm
from urllib.parse import urlparse
from multiprocessing.pool import ThreadPool
from multiprocessing.pool import Pool

url_utils_temp_dir = None
max_path_len = 255


#%% Download functions

class DownloadProgressBar():
    """
    Progress updater based on the progressbar2 package.

    https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
    """

    def __init__(self):
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        if not self.pbar:
            # This is a pretty random import I'd rather not depend on outside of the
            # rare case where it's used, so importing locally.
            #
            # pip install progressbar2
            import progressbar
            self.pbar = progressbar.ProgressBar(max_value=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()


def get_temp_folder(preferred_name='url_utils'):
    """
    Gets a temporary folder for use within this module.

    Args:
        preferred_name (str, optional): subfolder to use within the system temp folder

    Returns:
        str: the full path to the temporary subfolder
    """

    global url_utils_temp_dir

    if url_utils_temp_dir is None:
        url_utils_temp_dir = os.path.join(tempfile.gettempdir(),preferred_name)
        os.makedirs(url_utils_temp_dir,exist_ok=True)

    return url_utils_temp_dir


def download_url(url,
                 destination_filename=None,
                 progress_updater=None,
                 force_download=False,
                 verbose=True):
    """
    Downloads a URL to a file. If no file is specified, creates a temporary file,
    making a best effort to avoid filename collisions.

    Prints some diagnostic information and makes sure to omit SAS tokens from printouts.

    Args:
        url (str): the URL to download
        destination_filename (str, optional): the target filename; if None, will create
            a file in system temp space
        progress_updater (object or bool, optional): can be None, False, True, or a
            specific callable object. If None or False, no progress updates will be
            displayed. If True, a default progress bar will be created.
        force_download (bool, optional): download this file even if [destination_filename]
            exists
        verbose (bool, optional): enable additional debug console output

    Returns:
        str: the filename to which [url] was downloaded; the same as
        [destination_filename] if [destination_filename] was not None
    """

    if progress_updater is not None and isinstance(progress_updater,bool):
        if not progress_updater:
            progress_updater = None
        else:
            progress_updater = DownloadProgressBar()

    url_no_sas = url.split('?')[0]

    if destination_filename is None:
        target_folder = get_temp_folder()
        url_without_sas = url.split('?', 1)[0]

        # This does not guarantee uniqueness, hence "best effort" rather than a guarantee
        url_as_filename = re.sub(r'\W+', '', url_without_sas)
        n_folder_chars = len(url_utils_temp_dir)
        if len(url_as_filename) + n_folder_chars > max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
            url_as_filename = url_as_filename[-1*(max_path_len-n_folder_chars):]
        destination_filename = \
            os.path.join(target_folder,url_as_filename)

    if (not force_download) and (os.path.isfile(destination_filename)):
        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(
                os.path.basename(url_no_sas)))
    else:
        if verbose:
            print('Downloading file {} to {}'.format(
                os.path.basename(url_no_sas),destination_filename),end='')
        target_dir = os.path.dirname(destination_filename)
        os.makedirs(target_dir,exist_ok=True)
        urllib.request.urlretrieve(url, destination_filename, progress_updater)
        assert(os.path.isfile(destination_filename))
        nBytes = os.path.getsize(destination_filename)
        if verbose:
            print('...done, {} bytes.'.format(nBytes))

    return destination_filename


def download_relative_filename(url, output_base, verbose=False):
    """
    Downloads a URL to output_base, preserving the relative path. The path is
    relative to the site, so:

    https://abc.com/xyz/123.txt

    ...will get downloaded to:

    output_base/xyz/123.txt

    Args:
        url (str): the URL to download
        output_base (str): the base folder to which we should download this file
        verbose (bool, optional): enable additional debug console output

    Returns:
        str: the local destination filename
    """

    p = urlparse(url)

    # Remove the leading '/'
    assert p.path.startswith('/')
    relative_filename = p.path[1:]

    destination_filename = os.path.join(output_base,relative_filename)
    return download_url(url, destination_filename, verbose=verbose)


def _do_parallelized_download(download_info,overwrite=False,verbose=False):
    """
    Internal function for download parallelization.
    """

    url = download_info['url']
    target_file = download_info['target_file']
    result = {'status':'unknown','url':url,'target_file':target_file}

    if ((os.path.isfile(target_file)) and (not overwrite)):
        if verbose:
            print('Skipping existing file {}'.format(target_file))
        result['status'] = 'skipped'
        return result

    try:
        download_url(url=url,
                     destination_filename=target_file,
                     verbose=verbose,
                     force_download=overwrite)
    except Exception as e:
        print('Warning: error downloading URL {}: {}'.format(
            url,str(e)))
        result['status'] = 'error: {}'.format(str(e))
        return result

    result['status'] = 'success'
    return result


def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
                           n_workers=20,pool_type='thread'):
    """
    Downloads a list of URLs to local files.

    Catches exceptions and reports them in the returned "results" array.

    Args:
        url_to_target_file (dict): a dict mapping URLs to local filenames
        verbose (bool, optional): enable additional debug console output
        overwrite (bool, optional): whether to overwrite existing local files
        n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'

    Returns:
        list: list of dicts with keys:
            - 'url': the URL this item refers to
            - 'status': 'skipped', 'success', or a string starting with 'error'
            - 'target_file': the local filename to which we downloaded (or tried to
              download) this URL
    """

    all_download_info = []

    print('Preparing download list')
    for url in tqdm(url_to_target_file):
        download_info = {}
        download_info['url'] = url
        download_info['target_file'] = url_to_target_file[url]
        all_download_info.append(download_info)

    print('Downloading {} files on {} workers'.format(
        len(all_download_info),n_workers))

    if n_workers <= 1:

        results = []

        for download_info in tqdm(all_download_info):
            result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
            results.append(result)

    else:

        if pool_type == 'thread':
            pool = ThreadPool(n_workers)
        else:
            assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
            pool = Pool(n_workers)

        print('Starting a {} pool with {} workers'.format(pool_type,n_workers))

        results = list(tqdm(pool.imap(
            partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
            all_download_info), total=len(all_download_info)))

    return results


def test_url(url, error_on_failure=True, timeout=None):
    """
    Tests the availability of [url], returning an HTTP status code.

    Args:
        url (str): URL to test
        error_on_failure (bool, optional): whether to error (vs. just returning an
            error code) if accessing this URL fails
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation

    Returns:
        int: HTTP status code (200 for success)
    """

    # r = requests.get(url, stream=True, verify=True, timeout=timeout)
    r = requests.head(url, stream=True, verify=True, timeout=timeout)

    if error_on_failure and r.status_code != 200:
        raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
    return r.status_code


def test_urls(urls, error_on_failure=True, n_workers=1, pool_type='thread', timeout=None):
    """
    Verifies that URLs are available (i.e., return status 200). By default,
    errors if any URL is unavailable.

    Args:
        urls (list): list of URLs to test
        error_on_failure (bool, optional): whether to error (vs. just returning an
            error code) if accessing a URL fails
        n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation

    Returns:
        list: a list of HTTP status codes, the same length and order as [urls]
    """

    if n_workers <= 1:

        status_codes = []

        for url in tqdm(urls):
            # Use the same HEAD-based check as the parallel path
            status_code = test_url(url,error_on_failure=error_on_failure,timeout=timeout)
            status_codes.append(status_code)

    else:

        if pool_type == 'thread':
            pool = ThreadPool(n_workers)
        else:
            assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
            pool = Pool(n_workers)

        print('Starting a {} pool with {} workers'.format(pool_type,n_workers))

        status_codes = list(tqdm(pool.imap(
            partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
            urls), total=len(urls)))

    return status_codes
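
And a sketch of typical usage for the download helpers above; the URLs and target paths are placeholders, not real endpoints:

    from megadetector.utils.url_utils import \
        download_url, parallel_download_urls, test_urls

    # Download a single file to a name derived from the URL in system temp space
    local_filename = download_url('https://example.com/images/img0001.jpg')

    # Download several URLs on a thread pool; per-URL failures are recorded
    # in the 'status' field of each result rather than raised
    url_to_target_file = {
        'https://example.com/images/img0001.jpg': '/tmp/downloads/img0001.jpg',
        'https://example.com/images/img0002.jpg': '/tmp/downloads/img0002.jpg'
    }
    results = parallel_download_urls(url_to_target_file, n_workers=4)
    errors = [r for r in results if r['status'].startswith('error')]

    # Check availability (HEAD requests) without raising on failure
    status_codes = test_urls(list(url_to_target_file.keys()), error_on_failure=False)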