megadetector 5.0.5__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector has been flagged as potentially problematic; see the registry listing for more details.

Files changed (132)
  1. api/batch_processing/data_preparation/manage_local_batch.py +302 -263
  2. api/batch_processing/data_preparation/manage_video_batch.py +81 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
  5. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  6. api/batch_processing/postprocessing/load_api_results.py +56 -70
  7. api/batch_processing/postprocessing/md_to_coco.py +1 -1
  8. api/batch_processing/postprocessing/md_to_labelme.py +2 -1
  9. api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  11. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  12. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  13. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
  14. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  15. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  16. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
  17. classification/prepare_classification_script.py +191 -191
  18. data_management/coco_to_yolo.py +68 -45
  19. data_management/databases/integrity_check_json_db.py +7 -5
  20. data_management/generate_crops_from_cct.py +3 -3
  21. data_management/get_image_sizes.py +8 -6
  22. data_management/importers/add_timestamps_to_icct.py +79 -0
  23. data_management/importers/animl_results_to_md_results.py +160 -0
  24. data_management/importers/auckland_doc_test_to_json.py +4 -4
  25. data_management/importers/auckland_doc_to_json.py +1 -1
  26. data_management/importers/awc_to_json.py +5 -5
  27. data_management/importers/bellevue_to_json.py +5 -5
  28. data_management/importers/carrizo_shrubfree_2018.py +5 -5
  29. data_management/importers/carrizo_trail_cam_2017.py +5 -5
  30. data_management/importers/cct_field_adjustments.py +2 -3
  31. data_management/importers/channel_islands_to_cct.py +4 -4
  32. data_management/importers/ena24_to_json.py +5 -5
  33. data_management/importers/helena_to_cct.py +10 -10
  34. data_management/importers/idaho-camera-traps.py +12 -12
  35. data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
  36. data_management/importers/jb_csv_to_json.py +4 -4
  37. data_management/importers/missouri_to_json.py +1 -1
  38. data_management/importers/noaa_seals_2019.py +1 -1
  39. data_management/importers/pc_to_json.py +5 -5
  40. data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
  41. data_management/importers/prepare_zsl_imerit.py +5 -5
  42. data_management/importers/rspb_to_json.py +4 -4
  43. data_management/importers/save_the_elephants_survey_A.py +5 -5
  44. data_management/importers/save_the_elephants_survey_B.py +6 -6
  45. data_management/importers/snapshot_safari_importer.py +9 -9
  46. data_management/importers/snapshot_serengeti_lila.py +9 -9
  47. data_management/importers/timelapse_csv_set_to_json.py +5 -7
  48. data_management/importers/ubc_to_json.py +4 -4
  49. data_management/importers/umn_to_json.py +4 -4
  50. data_management/importers/wellington_to_json.py +1 -1
  51. data_management/importers/wi_to_json.py +2 -2
  52. data_management/importers/zamba_results_to_md_results.py +181 -0
  53. data_management/labelme_to_coco.py +35 -7
  54. data_management/labelme_to_yolo.py +229 -0
  55. data_management/lila/add_locations_to_island_camera_traps.py +1 -1
  56. data_management/lila/add_locations_to_nacti.py +147 -0
  57. data_management/lila/create_lila_blank_set.py +474 -0
  58. data_management/lila/create_lila_test_set.py +2 -1
  59. data_management/lila/create_links_to_md_results_files.py +106 -0
  60. data_management/lila/download_lila_subset.py +46 -21
  61. data_management/lila/generate_lila_per_image_labels.py +23 -14
  62. data_management/lila/get_lila_annotation_counts.py +17 -11
  63. data_management/lila/lila_common.py +14 -11
  64. data_management/lila/test_lila_metadata_urls.py +116 -0
  65. data_management/ocr_tools.py +829 -0
  66. data_management/resize_coco_dataset.py +13 -11
  67. data_management/yolo_output_to_md_output.py +84 -12
  68. data_management/yolo_to_coco.py +38 -20
  69. detection/process_video.py +36 -14
  70. detection/pytorch_detector.py +23 -8
  71. detection/run_detector.py +76 -19
  72. detection/run_detector_batch.py +178 -63
  73. detection/run_inference_with_yolov5_val.py +326 -57
  74. detection/run_tiled_inference.py +153 -43
  75. detection/video_utils.py +34 -8
  76. md_utils/ct_utils.py +172 -1
  77. md_utils/md_tests.py +372 -51
  78. md_utils/path_utils.py +167 -39
  79. md_utils/process_utils.py +26 -7
  80. md_utils/split_locations_into_train_val.py +215 -0
  81. md_utils/string_utils.py +10 -0
  82. md_utils/url_utils.py +0 -2
  83. md_utils/write_html_image_list.py +9 -26
  84. md_visualization/plot_utils.py +12 -8
  85. md_visualization/visualization_utils.py +106 -7
  86. md_visualization/visualize_db.py +16 -8
  87. md_visualization/visualize_detector_output.py +208 -97
  88. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
  89. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
  90. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  91. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  92. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  93. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  94. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  95. taxonomy_mapping/species_lookup.py +33 -13
  96. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  97. api/synchronous/api_core/yolov5/detect.py +0 -252
  98. api/synchronous/api_core/yolov5/export.py +0 -607
  99. api/synchronous/api_core/yolov5/hubconf.py +0 -146
  100. api/synchronous/api_core/yolov5/models/__init__.py +0 -0
  101. api/synchronous/api_core/yolov5/models/common.py +0 -738
  102. api/synchronous/api_core/yolov5/models/experimental.py +0 -104
  103. api/synchronous/api_core/yolov5/models/tf.py +0 -574
  104. api/synchronous/api_core/yolov5/models/yolo.py +0 -338
  105. api/synchronous/api_core/yolov5/train.py +0 -670
  106. api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
  107. api/synchronous/api_core/yolov5/utils/activations.py +0 -103
  108. api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
  109. api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
  110. api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
  111. api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
  112. api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
  113. api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
  114. api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
  115. api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
  116. api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
  117. api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
  118. api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
  119. api/synchronous/api_core/yolov5/utils/general.py +0 -1018
  120. api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
  121. api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
  122. api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
  123. api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
  124. api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
  125. api/synchronous/api_core/yolov5/utils/loss.py +0 -234
  126. api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
  127. api/synchronous/api_core/yolov5/utils/plots.py +0 -489
  128. api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
  129. api/synchronous/api_core/yolov5/val.py +0 -394
  130. md_utils/matlab_porting_tools.py +0 -97
  131. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  132. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
md_utils/path_utils.py CHANGED
@@ -21,7 +21,8 @@ import zipfile
 from zipfile import ZipFile
 from datetime import datetime
 from typing import Container, Iterable, List, Optional, Tuple, Sequence
-from multiprocessing.pool import ThreadPool
+from multiprocessing.pool import Pool, ThreadPool
+from functools import partial
 from tqdm import tqdm
 
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
@@ -34,30 +35,51 @@ CHAR_LIMIT = 255
 
 #%% General path functions
 
-def recursive_file_list(base_dir, convert_slashes=True, return_relative_paths=False):
-    """
+def recursive_file_list(base_dir, convert_slashes=True,
+                        return_relative_paths=False, sort_files=True,
+                        recursive=True):
+    r"""
     Enumerate files (not directories) in [base_dir], optionally converting
     \ to /
     """
 
     all_files = []
 
-    for root, _, filenames in os.walk(base_dir):
-        for filename in filenames:
-            full_path = os.path.join(root, filename)
-            if convert_slashes:
-                full_path = full_path.replace('\\', '/')
-            all_files.append(full_path)
-
+    if recursive:
+        for root, _, filenames in os.walk(base_dir):
+            for filename in filenames:
+                full_path = os.path.join(root, filename)
+                all_files.append(full_path)
+    else:
+        all_files_relative = os.listdir(base_dir)
+        all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
+        all_files = [fn for fn in all_files if os.path.isfile(fn)]
+
     if return_relative_paths:
         all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
+
+    if convert_slashes:
+        all_files = [fn.replace('\\', '/') for fn in all_files]
+
+    if sort_files:
+        all_files = sorted(all_files)
 
-    all_files = sorted(all_files)
     return all_files
 
 
-def split_path(path: str) -> List[str]:
+def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_files=True,
+              recursive=False):
+    """
+    Trivial wrapper for recursive_file_list, which was a poor function name choice at the
+    time; it doesn't really make sense to have a "recursive" option in a function called
+    "recursive_file_list".
     """
+
+    return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
+                               recursive=recursive)
+
+
+def split_path(path: str) -> List[str]:
+    r"""
     Splits [path] into all its constituent tokens.
 
     Non-recursive version of:
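As a rough usage sketch of the reworked enumeration functions (the directory path here is hypothetical; `md_utils.path_utils` is the module path shown in this diff):

```python
from md_utils import path_utils

# Hypothetical input folder
base_dir = '/data/camera-traps'

# Recursive enumeration (the default), sorted, with '\' converted to '/'
all_files = path_utils.recursive_file_list(base_dir)

# Top-level files only, via the new file_list wrapper
top_level_files = path_utils.file_list(base_dir, recursive=False)

# Relative paths, skipping the sort
relative_files = path_utils.recursive_file_list(base_dir,
                                                return_relative_paths=True,
                                                sort_files=False)
```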
@@ -87,7 +109,7 @@ def split_path(path: str) -> List[str]:
 
 
 def fileparts(path: str) -> Tuple[str, str, str]:
-    """
+    r"""
     Breaks down a path into the directory path, filename, and extension.
 
     Note that the '.' lives with the extension, and separators are removed.
@@ -185,8 +207,9 @@ def safe_create_link(link_exists,link_new):
     and if it has a different target than link_exists, remove and re-create
     it.
 
-    Errors of link_new already exists but it's not a link.
-    """
+    Errors if link_new already exists but it's not a link.
+    """
+
     if os.path.exists(link_new) or os.path.islink(link_new):
         assert os.path.islink(link_new)
         if not os.readlink(link_new) == link_exists:
@@ -196,6 +219,23 @@ def safe_create_link(link_exists,link_new):
         os.symlink(link_exists,link_new)
 
 
+def get_file_sizes(base_dir, convert_slashes=True):
+    """
+    Get sizes recursively for all files in base_dir, returning a dict mapping
+    relative filenames to size.
+    """
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                             return_relative_paths=True)
+
+    fn_to_size = {}
+    for fn_relative in tqdm(relative_filenames):
+        fn_abs = os.path.join(base_dir,fn_relative)
+        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+    return fn_to_size
+
+
 #%% Image-related path functions
 
 def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
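A minimal sketch of the new get_file_sizes helper (the folder name is made up):

```python
from md_utils.path_utils import get_file_sizes

# Maps relative filenames to sizes in bytes,
# e.g. {'2023/IMG_0001.JPG': 4182003, ...}
fn_to_size = get_file_sizes('/data/camera-traps')
total_gb = sum(fn_to_size.values()) / 1e9
```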
@@ -221,10 +261,14 @@ def find_image_strings(strings: Iterable[str]) -> List[str]:
     return [s for s in strings if is_image_file(s)]
 
 
-def find_images(dirname: str, recursive: bool = False, return_relative_paths: bool = False) -> List[str]:
+def find_images(dirname: str, recursive: bool = False,
+                return_relative_paths: bool = False,
+                convert_slashes: bool = False) -> List[str]:
     """
     Finds all files in a directory that look like image file names. Returns
-    absolute paths unless return_relative_paths is set.
+    absolute paths unless return_relative_paths is set. Uses the OS-native
+    path separator unless convert_slashes is set, in which case '/' is
+    always used.
     """
 
     if recursive:
@@ -238,6 +282,10 @@ def find_images(dirname: str, recursive: bool = False, return_relative_paths: bool = False) -> List[str]:
         image_files = [os.path.relpath(fn,dirname) for fn in image_files]
 
     image_files = sorted(image_files)
+
+    if convert_slashes:
+        image_files = [fn.replace('\\', '/') for fn in image_files]
+
     return image_files
 
 
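The new convert_slashes option makes it easy to get '/'-separated paths regardless of OS; a hypothetical call might look like:

```python
from md_utils.path_utils import find_images

# Recursive image enumeration with forward slashes, even on Windows
image_files = find_images('/data/camera-traps', recursive=True,
                          return_relative_paths=True, convert_slashes=True)
```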
@@ -245,11 +293,11 @@ def find_images(dirname: str, recursive: bool = False, return_relative_paths: bool = False) -> List[str]:
 
 def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
                    char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
-    """
+    r"""
     Removes non-ASCII and other invalid filename characters (on any
     reasonable OS) from a filename, then trims to a maximum length.
 
-    Does not allow :\/, use clean_path if you want to preserve those.
+    Does not allow :\/ by default, use clean_path if you want to preserve those.
 
     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
@@ -294,30 +342,72 @@ def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:
 
 #%% Platform-independent way to open files in their associated application
 
-import sys,subprocess
-
-def open_file(filename):
-    if sys.platform == "win32":
-        os.startfile(filename)
-    else:
-        opener = "open" if sys.platform == "darwin" else "xdg-open"
-        subprocess.call([opener, filename])
+import sys,subprocess,platform,re
 
+def environment_is_wsl():
+    """
+    Returns True if we're running in WSL
+    """
+
+    if sys.platform not in ('linux','posix'):
+        return False
+    platform_string = ' '.join(platform.uname()).lower()
+    return 'microsoft' in platform_string and 'wsl' in platform_string
+
 
-#%% zipfile management functions
+def wsl_path_to_windows_path(filename):
+    r"""
+    Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
+    converts:
+
+    /mnt/e/a/b/c
+
+    ...to:
+
+    e:\a\b\c
+    """
+
+    result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
+    if result.returncode != 0:
+        print('Could not convert path {} from WSL to Windows'.format(filename))
+        return None
+    return result.stdout.strip()
+
 
-def unzip_file(input_file, output_folder=None):
+def open_file(filename,attempt_to_open_in_wsl_host=False):
     """
-    Unzip a zipfile to the specified output folder, defaulting to the same location as
-    the input file
+    Opens [filename] in the native OS file handler. If attempt_to_open_in_wsl_host
+    is True, and we're in WSL, attempts to open [filename] in Windows.
     """
 
-    if output_folder is None:
-        output_folder = os.path.dirname(input_file)
+    if sys.platform == 'win32':
 
-    with zipfile.ZipFile(input_file, 'r') as zf:
-        zf.extractall(output_folder)
+        os.startfile(filename)
 
+    elif sys.platform == 'darwin':
+
+        opener = 'open'
+        subprocess.call([opener, filename])
+
+    elif attempt_to_open_in_wsl_host and environment_is_wsl():
+
+        windows_path = wsl_path_to_windows_path(filename)
+
+        # Fall back to xdg-open if we couldn't convert the path
+        if windows_path is None:
+            subprocess.call(['xdg-open', filename])
+            return
+
+        if os.path.isdir(filename):
+            subprocess.run(["explorer.exe", windows_path])
+        else:
+            os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
+
+    else:
+
+        opener = 'xdg-open'
+        subprocess.call([opener, filename])
+
 
 #%% File list functions
 
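A hedged sketch of the new WSL-aware behavior (the paths are made up, and this only does something interesting when running inside WSL):

```python
from md_utils.path_utils import (open_file, environment_is_wsl,
                                 wsl_path_to_windows_path)

if environment_is_wsl():
    # E.g. '/mnt/e/a/b/c' -> 'e:\\a\\b\\c'; returns None if conversion fails
    print(wsl_path_to_windows_path('/mnt/e/a/b/c'))

# Opens the file with the Windows host's default handler when running
# under WSL; otherwise uses os.startfile/open/xdg-open as appropriate
open_file('/mnt/e/results/index.html', attempt_to_open_in_wsl_host=True)
```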
@@ -393,7 +483,7 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
 
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
-        for input_fn_relative in relative_filenames:
+        for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
             input_fn_abs = os.path.join(input_folder,input_fn_relative)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
@@ -403,14 +493,53 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     return output_fn
 
 
-def parallel_zip_files(input_files,max_workers=16):
+def parallel_zip_files(input_files, max_workers=16, use_threads=True):
     """
     Zip one or more files to separate output files in parallel, leaving the
-    original files in place.
+    original files in place. Each file is zipped to [filename].zip.
     """
 
     n_workers = min(max_workers,len(input_files))
-    pool = ThreadPool(n_workers)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
     with tqdm(total=len(input_files)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(zip_file,input_files)):
             pbar.update()
+
+
+def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
+                         compresslevel=9, overwrite=False):
+    """
+    Zip one or more folders to separate output files in parallel, leaving the
+    original folders in place. Each folder is zipped to [folder_name].zip.
+    """
+
+    n_workers = min(max_workers,len(input_folders))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_folders)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(
+                partial(zip_folder,overwrite=overwrite,compresslevel=compresslevel),
+                input_folders)):
+            pbar.update()
+
+
+def unzip_file(input_file, output_folder=None):
+    """
+    Unzip a zipfile to the specified output folder, defaulting to the same location as
+    the input file
+    """
+
+    if output_folder is None:
+        output_folder = os.path.dirname(input_file)
+
+    with zipfile.ZipFile(input_file, 'r') as zf:
+        zf.extractall(output_folder)
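A short sketch of the parallel zip helpers (file and folder names are hypothetical):

```python
from md_utils.path_utils import parallel_zip_files, parallel_zip_folders

# Each file is zipped in place to [filename].zip
parallel_zip_files(['/data/results_a.json', '/data/results_b.json'],
                   max_workers=4)

# use_threads=False swaps the thread pool for a process pool, which can
# help when compression is CPU-bound; each folder becomes [folder_name].zip
parallel_zip_folders(['/data/folder1', '/data/folder2'],
                     use_threads=False, overwrite=True)
```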
md_utils/process_utils.py CHANGED
@@ -17,14 +17,28 @@ import subprocess
 
 os.environ["PYTHONUNBUFFERED"] = "1"
 
-def execute(cmd):
+def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
-
+
+    if verbose:
+        if encoding is not None:
+            print('Launching child process with non-default encoding {}'.format(encoding))
+        if errors is not None:
+            print('Launching child process with non-default text error handling {}'.format(errors))
+        if env is not None:
+            print('Launching child process with non-default environment {}'.format(str(env)))
+
     # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
     popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                             shell=True, universal_newlines=True)
+                             shell=True, universal_newlines=True, encoding=encoding,
+                             errors=errors, env=env)
     for stdout_line in iter(popen.stdout.readline, ""):
         yield stdout_line
     popen.stdout.close()
@@ -33,22 +47,27 @@ def execute(cmd):
         raise subprocess.CalledProcessError(return_code, cmd)
 
 
-def execute_and_print(cmd,print_output=True):
+def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
     a dictionary with fields "status" and "output".
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
 
     to_return = {'status':'unknown','output':''}
-    output=[]
+    output = []
     try:
-        for s in execute(cmd):
+        for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
             output.append(s)
             if print_output:
                 print(s,end='',flush=True)
         to_return['status'] = 0
     except subprocess.CalledProcessError as cpe:
-        print('execute_and_print caught error: {}'.format(cpe.output))
+        print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
         to_return['status'] = cpe.returncode
         to_return['output'] = output
 
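A quick sketch of the extended signatures (the command is arbitrary):

```python
from md_utils.process_utils import execute_and_print

# Prints child-process output line by line as it's produced, then returns
# {'status': ..., 'output': ...}; encoding/errors/env go straight to Popen
result = execute_and_print('ls -l /tmp', encoding='utf-8', errors='replace',
                           verbose=True)
assert result['status'] == 0
print(''.join(result['output']))
```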
md_utils/split_locations_into_train_val.py ADDED
@@ -0,0 +1,215 @@
+########
+#
+# split_locations_into_train_val.py
+#
+# Split a list of location IDs into training and validation, targeting a specific
+# train/val split for each category, but allowing some categories to be tighter or looser
+# than others. Does nothing particularly clever, just randomly splits locations into
+# train/val lots of times using the target val fraction, and picks the one that meets the
+# specified constraints and minimizes weighted error, where "error" is defined as the
+# sum of each class's absolute divergence from the target val fraction.
+#
+########
+
+#%% Imports/constants
+
+import random
+import numpy as np
+
+from collections import defaultdict
+from md_utils.ct_utils import sort_dictionary_by_value
+from tqdm import tqdm
+
+
+#%% Main function
+
+def split_locations_into_train_val(location_to_category_counts,
+                                   n_random_seeds=10000,
+                                   target_val_fraction=0.15,
+                                   category_to_max_allowable_error=None,
+                                   category_to_error_weight=None,
+                                   default_max_allowable_error=0.1):
+    """
+    Split a list of location IDs into training and validation, targeting a specific
+    train/val split for each category, but allowing some categories to be tighter or looser
+    than others. Does nothing particularly clever, just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
+    specified constraints and minimizes weighted error, where "error" is defined as the
+    sum of each class's absolute divergence from the target val fraction.
+
+    location_to_category_counts should be a dict mapping location IDs to dicts,
+    with each dict mapping a category name to a count. Any categories not present in a
+    particular dict are assumed to have a count of zero for that location.
+
+    If not None, category_to_max_allowable_error should be a dict mapping category names
+    to maximum allowable errors. These are hard constraints, but you can specify a subset
+    of categories. Categories not included here have a maximum error of Inf.
+
+    If not None, category_to_error_weight should be a dict mapping category names to
+    error weights. You can specify a subset of categories. Categories not included here
+    have a weight of 1.0.
+
+    default_max_allowable_error is the maximum allowable error for categories not present in
+    category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
+    categories not present in category_to_max_allowable_error.
+
+    Returns val_locations,category_to_val_fraction.
+    """
+
+    location_ids = list(location_to_category_counts.keys())
+
+    n_val_locations = int(target_val_fraction*len(location_ids))
+
+    if category_to_max_allowable_error is None:
+        category_to_max_allowable_error = {}
+
+    if category_to_error_weight is None:
+        category_to_error_weight = {}
+
+    # Category ID to total count; the total count is used only for printouts
+    category_id_to_count = {}
+    for location_id in location_to_category_counts:
+        for category_id in location_to_category_counts[location_id].keys():
+            if category_id not in category_id_to_count:
+                category_id_to_count[category_id] = 0
+            category_id_to_count[category_id] += \
+                location_to_category_counts[location_id][category_id]
+
+    category_ids = set(category_id_to_count.keys())
+
+    print('Splitting {} categories over {} locations'.format(
+        len(category_ids),len(location_ids)))
+
+    # random_seed = 0
+    def compute_seed_errors(random_seed):
+        """
+        Compute the per-category error for a specific random seed.
+
+        Returns weighted_average_error,category_to_val_fraction.
+        """
+
+        # Randomly split into train/val
+        random.seed(random_seed)
+        val_locations = random.sample(location_ids,k=n_val_locations)
+        val_locations_set = set(val_locations)
+
+        # For each category, measure the % of images that went into the val set
+        category_to_val_fraction = defaultdict(float)
+
+        for category_id in category_ids:
+            category_val_count = 0
+            category_train_count = 0
+            for location_id in location_to_category_counts:
+                if category_id not in location_to_category_counts[location_id]:
+                    location_category_count = 0
+                else:
+                    location_category_count = location_to_category_counts[location_id][category_id]
+                if location_id in val_locations_set:
+                    category_val_count += location_category_count
+                else:
+                    category_train_count += location_category_count
+            category_val_fraction = category_val_count / (category_val_count + category_train_count)
+            category_to_val_fraction[category_id] = category_val_fraction
+
+        # Absolute deviation from the target val fraction for each category
+        category_errors = {}
+        weighted_category_errors = {}
+
+        # category = next(iter(category_to_val_fraction))
+        for category in category_to_val_fraction:
+
+            category_val_fraction = category_to_val_fraction[category]
+
+            category_error = abs(category_val_fraction-target_val_fraction)
+            category_errors[category] = category_error
+
+            category_weight = 1.0
+            if category in category_to_error_weight:
+                category_weight = category_to_error_weight[category]
+            weighted_category_error = category_error * category_weight
+            weighted_category_errors[category] = weighted_category_error
+
+        weighted_average_error = np.mean(list(weighted_category_errors.values()))
+
+        return weighted_average_error,weighted_category_errors,category_to_val_fraction
+
+    # ...def compute_seed_errors(...)
+
+    # This will only include random seeds that satisfy the hard constraints
+    random_seed_to_weighted_average_error = {}
+
+    # random_seed = 0
+    for random_seed in tqdm(range(0,n_random_seeds)):
+
+        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+            compute_seed_errors(random_seed)
+
+        seed_satisfies_hard_constraints = True
+
+        for category in category_to_val_fraction:
+            if category in category_to_max_allowable_error:
+                max_allowable_error = category_to_max_allowable_error[category]
+            else:
+                if default_max_allowable_error is None:
+                    continue
+                max_allowable_error = default_max_allowable_error
+            val_fraction = category_to_val_fraction[category]
+            category_error = abs(val_fraction - target_val_fraction)
+            if category_error > max_allowable_error:
+                seed_satisfies_hard_constraints = False
+                break
+
+        if seed_satisfies_hard_constraints:
+            random_seed_to_weighted_average_error[random_seed] = weighted_average_error
+
+    # ...for each random seed
+
+    assert len(random_seed_to_weighted_average_error) > 0, \
+        'No random seed met all the hard constraints'
+
+    print('\n{} of {} random seeds satisfied hard constraints'.format(
+        len(random_seed_to_weighted_average_error),n_random_seeds))
+
+    min_error = None
+    min_error_seed = None
+
+    for random_seed in random_seed_to_weighted_average_error.keys():
+        error_metric = random_seed_to_weighted_average_error[random_seed]
+        if min_error is None or error_metric < min_error:
+            min_error = error_metric
+            min_error_seed = random_seed
+
+    random.seed(min_error_seed)
+    val_locations = random.sample(location_ids,k=n_val_locations)
+    train_locations = []
+    for location_id in location_ids:
+        if location_id not in val_locations:
+            train_locations.append(location_id)
+
+    print('\nVal locations:\n')
+    for loc in val_locations:
+        print('{}'.format(loc))
+    print('')
+
+    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+        compute_seed_errors(min_error_seed)
+
+    random_seed = min_error_seed
+
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
+                                                        sort_values=category_id_to_count,
+                                                        reverse=True)
+
+    print('Val fractions by category:\n')
+
+    for category in category_to_val_fraction:
+        print('{} ({}) {:.2f}'.format(
+            category,category_id_to_count[category],
+            category_to_val_fraction[category]))
+
+    return val_locations,category_to_val_fraction
+
+# ...def split_locations_into_train_val(...)
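A hypothetical usage sketch of the new splitter (category names and counts are invented; with 20 locations and target_val_fraction=0.15, three locations go to val):

```python
import random
from md_utils.split_locations_into_train_val import split_locations_into_train_val

# Toy input: 20 locations with per-category image counts; categories
# missing from a location's dict count as zero
random.seed(0)
location_to_category_counts = {
    'loc_{:03d}'.format(i): {'deer': random.randint(50, 500),
                             'bear': random.randint(0, 20)}
    for i in range(20)
}

val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=1000,
    target_val_fraction=0.15,
    category_to_max_allowable_error={'bear': 0.1},  # hard constraint on the rarer class
    category_to_error_weight={'bear': 2.0})         # weight the rarer class's error 2x
```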
md_utils/string_utils.py CHANGED
@@ -57,3 +57,13 @@ def human_readable_to_bytes(size):
         bytes = 0
 
     return bytes
+
+
+def remove_ansi_codes(s):
+    """
+    Remove ANSI escape codes from a string.
+
+    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
+    """
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    return ansi_escape.sub('', s)
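For example (the string contents are made up):

```python
from md_utils.string_utils import remove_ansi_codes

colored = '\x1b[31mERROR:\x1b[0m 3 files failed'
print(remove_ansi_codes(colored))  # 'ERROR: 3 files failed'
```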
md_utils/url_utils.py CHANGED
@@ -140,5 +140,3 @@ def test_urls(urls, error_on_failure=True):
 
     return status_codes
 
-
-