megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +93 -79
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
  20. api/batch_processing/postprocessing/compare_batch_results.py +114 -44
  21. api/batch_processing/postprocessing/convert_output_format.py +62 -19
  22. api/batch_processing/postprocessing/load_api_results.py +17 -20
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +165 -68
  25. api/batch_processing/postprocessing/merge_detections.py +40 -15
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
  27. api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +107 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -0
  71. data_management/coco_to_yolo.py +86 -62
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +130 -83
  76. data_management/databases/subset_json_db.py +25 -16
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -144
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -160
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +8 -8
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +309 -159
  120. data_management/labelme_to_yolo.py +103 -60
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +114 -31
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +92 -90
  128. data_management/lila/generate_lila_per_image_labels.py +56 -43
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +103 -70
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +161 -99
  135. data_management/remap_coco_categories.py +84 -0
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +32 -44
  138. data_management/wi_download_csv_to_coco.py +246 -0
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +535 -95
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +189 -114
  147. detection/run_inference_with_yolov5_val.py +118 -51
  148. detection/run_tiled_inference.py +113 -42
  149. detection/tf_detector.py +51 -28
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +249 -70
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -862
  157. md_utils/path_utils.py +655 -155
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +208 -27
  163. md_utils/write_html_image_list.py +51 -35
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +908 -311
  168. md_visualization/visualize_db.py +109 -58
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
  173. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  174. taxonomy_mapping/__init__.py +0 -0
  175. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  176. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  177. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  178. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  179. taxonomy_mapping/retrieve_sample_image.py +12 -12
  180. taxonomy_mapping/simple_image_download.py +11 -11
  181. taxonomy_mapping/species_lookup.py +10 -10
  182. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  183. taxonomy_mapping/taxonomy_graph.py +47 -47
  184. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  185. data_management/cct_json_to_filename_json.py +0 -89
  186. data_management/cct_to_csv.py +0 -140
  187. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  188. detection/detector_training/copy_checkpoints.py +0 -43
  189. md_visualization/visualize_megadb.py +0 -183
  190. megadetector-5.0.7.dist-info/RECORD +0 -202
  191. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
md_utils/path_utils.py CHANGED
@@ -1,30 +1,37 @@
-########
-#
-# path_utils.py
-#
-# Miscellaneous useful utils for path manipulation, things that could *almost*
-# be in os.path, but aren't.
-#
-########
+"""
+
+path_utils.py
+
+Miscellaneous useful utils for path manipulation, i.e. things that could *almost*
+be in os.path, but aren't.
+
+"""
 
 #%% Imports and constants
 
 import glob
 import ntpath
 import os
-import posixpath
+import sys
+import platform
 import string
 import json
+import shutil
 import unicodedata
 import zipfile
+import tarfile
+import webbrowser
+import subprocess
+import re
 
 from zipfile import ZipFile
 from datetime import datetime
-from typing import Container, Iterable, List, Optional, Tuple, Sequence
 from multiprocessing.pool import Pool, ThreadPool
 from functools import partial
+from shutil import which
 from tqdm import tqdm
 
+# Should all be lower-case
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
 
 VALID_FILENAME_CHARS = f"~-_.() {string.ascii_letters}{string.digits}"
@@ -35,14 +42,31 @@ CHAR_LIMIT = 255
 
 #%% General path functions
 
-def recursive_file_list(base_dir, convert_slashes=True,
-                        return_relative_paths=False, sort_files=True,
+def recursive_file_list(base_dir,
+                        convert_slashes=True,
+                        return_relative_paths=False,
+                        sort_files=True,
                         recursive=True):
     r"""
-    Enumerate files (not directories) in [base_dir], optionally converting
-    \ to /
+    Enumerates files (not directories) in [base_dir], optionally converting
+    backslashes to slashes
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """
 
+    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
     all_files = []
 
     if recursive:
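
A minimal usage sketch of the new keyword-argument form, assuming the wheel's md_utils package layout; the folder path here is hypothetical:

    from md_utils import path_utils

    # Enumerate every file under a (hypothetical) folder, returning sorted,
    # forward-slash paths relative to the base folder
    files = path_utils.recursive_file_list('/data/camera-traps',
                                           convert_slashes=True,
                                           return_relative_paths=True,
                                           sort_files=True,
                                           recursive=True)
    print('Found {} files'.format(len(files)))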
@@ -71,61 +95,51 @@ def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_
               recursive=False):
     """
     Trivial wrapper for recursive_file_list, which was a poor function name choice at the time,
-    it doesn't really make sense to have a "recursive" option in a function called "recursive_file_list".
+    since it doesn't really make sense to have a "recursive" option in a function called
+    "recursive_file_list".
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """
 
     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                recursive=recursive)
 
 
-def split_path(path: str) -> List[str]:
-    r"""
-    Splits [path] into all its constituent tokens.
-
-    Non-recursive version of:
-    http://nicks-liquid-soapbox.blogspot.com/2011/03/splitting-path-to-list-in-python.html
-
-    Examples
-    >>> split_path(r'c:\dir\subdir\file.txt')
-    ['c:\\', 'dir', 'subdir', 'file.txt']
-    >>> split_path('/dir/subdir/file.jpg')
-    ['/', 'dir', 'subdir', 'file.jpg']
-    >>> split_path('c:\\')
-    ['c:\\']
-    >>> split_path('/')
-    ['/']
-    """
-
-    parts = []
-    while True:
-        # ntpath seems to do the right thing for both Windows and Unix paths
-        head, tail = ntpath.split(path)
-        if head == '' or head == path:
-            break
-        parts.append(tail)
-        path = head
-    parts.append(head or tail)
-    return parts[::-1]  # reverse
-
-
-def fileparts(path: str) -> Tuple[str, str, str]:
+def fileparts(path):
     r"""
     Breaks down a path into the directory path, filename, and extension.
 
     Note that the '.' lives with the extension, and separators are removed.
 
-    Examples
-    >>> fileparts('file')
-    ('', 'file', '')
-    >>> fileparts(r'c:\dir\file.jpg')
-    ('c:\\dir', 'file', '.jpg')
-    >>> fileparts('/dir/subdir/file.jpg')
-    ('/dir/subdir', 'file', '.jpg')
+    Examples:
+
+    .. code-block:: none
+
+        >>> fileparts('file')
+        ('', 'file', '')
+        >>> fileparts(r'c:/dir/file.jpg')
+        ('c:/dir', 'file', '.jpg')
+        >>> fileparts('/dir/subdir/file.jpg')
+        ('/dir/subdir', 'file', '.jpg')
 
+    Args:
+        path (str): path name to separate into parts
     Returns:
-        p: str, directory path
-        n: str, filename without extension
-        e: str, extension including the '.'
+        tuple: tuple containing (p,n,e):
+            - p: str, directory path
+            - n: str, filename without extension
+            - e: str, extension including the '.'
     """
 
     # ntpath seems to do the right thing for both Windows and Unix paths
@@ -135,79 +149,168 @@ def fileparts(path: str) -> Tuple[str, str, str]:
     return p, n, e
 
 
-def insert_before_extension(filename: str, s: str = '', separator='.') -> str:
+def insert_before_extension(filename, s=None, separator='.'):
     """
     Insert string [s] before the extension in [filename], separated with [separator].
 
     If [s] is empty, generates a date/timestamp. If [filename] has no extension,
     appends [s].
 
-    Examples
-    >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
-    '/dir/subdir/file.insert.ext'
-    >>> insert_before_extension('/dir/subdir/file', 'insert')
-    '/dir/subdir/file.insert'
-    >>> insert_before_extension('/dir/subdir/file')
-    '/dir/subdir/file.2020.07.20.10.54.38'
+    Examples:
+
+    .. code-block:: none
+
+        >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
+        '/dir/subdir/file.insert.ext'
+        >>> insert_before_extension('/dir/subdir/file', 'insert')
+        '/dir/subdir/file.insert'
+        >>> insert_before_extension('/dir/subdir/file')
+        '/dir/subdir/file.2020.07.20.10.54.38'
+
+    Args:
+        filename (str): filename to manipulate
+        s (str, optional): string to insert before the extension in [filename], or
+            None to insert a datestamp
+        separator (str, optional): separator to place between the filename base
+            and the inserted string
+
+    Returns:
+        str: modified string
     """
 
     assert len(filename) > 0
-    if len(s) == 0:
+    if s is None or len(s) == 0:
         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
     name, ext = os.path.splitext(filename)
     return f'{name}{separator}{s}{ext}'
 
 
-def top_level_folder(p: str, windows: Optional[bool] = None) -> str:
+def split_path(path):
+    r"""
+    Splits [path] into all its constituent file/folder tokens.
+
+    Examples:
+
+    .. code-block:: none
+
+        >>> split_path(r'c:\dir\subdir\file.txt')
+        ['c:\\', 'dir', 'subdir', 'file.txt']
+        >>> split_path('/dir/subdir/file.jpg')
+        ['/', 'dir', 'subdir', 'file.jpg']
+        >>> split_path('c:\\')
+        ['c:\\']
+        >>> split_path('/')
+        ['/']
+
+    Args:
+        path (str): path to split into tokens
+
+    Returns:
+        list: list of path tokens
     """
-    Gets the top-level folder from path [p].
+
+    parts = []
+    while True:
+        # ntpath seems to do the right thing for both Windows and Unix paths
+        head, tail = ntpath.split(path)
+        if head == '' or head == path:
+            break
+        parts.append(tail)
+        path = head
+    parts.append(head or tail)
+    return parts[::-1]  # reverse
 
-    This function behaves differently for Windows vs. Unix paths. Set
-    windows=True if [p] is a Windows path. Set windows=None (default) to treat
-    [p] as a native system path.
 
-    On Windows, will use the top-level folder that isn't the drive.
-    >>> top_level_folder(r'c:\blah\foo')
-    'c:\blah'
+def path_is_abs(p):
+    """
+    Determines whether [p] is an absolute path. An absolute path is defined as
+    one that starts with slash, backslash, or a letter followed by a colon.
+
+    Args:
+        p (str): path to evaluate
+
+    Returns:
+        bool: True if [p] is an absolute path, else False
+    """
+
+    return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
+
 
-    On Unix, does not include the leaf node.
-    >>> top_level_folder('/blah/foo')
-    '/blah'
+def top_level_folder(p):
+    r"""
+    Gets the top-level folder from the path *p*.
+
+    On UNIX, this is straightforward:
+
+    /blah/foo
+
+    ...returns '/blah'
+
+    On Windows, we define this as the top-level folder that isn't the drive, so:
+
+    c:\blah\foo
+
+    ...returns 'c:\blah'.
+
+    Args:
+        p (str): filename to evaluate
+
+    Returns:
+        str: the top-level folder in [p], see above for details on how this is defined
     """
 
     if p == '':
         return ''
-
-    default_lib = os.path  # save default os.path
-    if windows is not None:
-        os.path = ntpath if windows else posixpath
-
-    # Path('/blah').parts is ('/', 'blah')
+
+    # Path('/blah').parts is ('/','blah')
     parts = split_path(p)
+
+    if len(parts) == 1:
+        return parts[0]
 
+    # Handle paths like:
+    #
+    # /, \, /stuff, c:, c:\stuff
     drive = os.path.splitdrive(p)[0]
-    if len(parts) > 1 and (
-            parts[0] == drive
-            or parts[0] == drive + '/'
-            or parts[0] == drive + '\\'
-            or parts[0] in ['\\', '/']):
-        result = os.path.join(parts[0], parts[1])
+    if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
+        return os.path.join(parts[0], parts[1])
     else:
-        result = parts[0]
+        return parts[0]
+
+# ...top_level_folder()
 
-    os.path = default_lib  # restore default os.path
-    return result
 
+#%% Test driver for top_level_folder
+
+if False:
+
+    #%%
+
+    p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
+    p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
+    p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
+    p = ''; s = top_level_folder(p); print(s); assert s == ''
+    p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
+    p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
+    p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+    p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
+    p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+
+    #%%
 
 def safe_create_link(link_exists,link_new):
     """
-    Create a symlink at link_new pointing to link_exists.
+    Creates a symlink at [link_new] pointing to [link_exists].
 
-    If link_new already exists, make sure it's a link (not a file),
-    and if it has a different target than link_exists, remove and re-create
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
     it.
 
-    Errors if link_new already exists but it's not a link.
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
    """
 
     if os.path.exists(link_new) or os.path.islink(link_new):
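
A short sketch of the new path_is_abs() helper, the now-explicit s=None datestamp behavior of insert_before_extension(), and top_level_folder(); all file paths here are hypothetical:

    from md_utils.path_utils import path_is_abs, insert_before_extension, top_level_folder

    assert path_is_abs('/data/results.json')
    assert path_is_abs(r'c:\data\results.json')
    assert not path_is_abs('data/results.json')

    # With s=None (the new default), a datestamp is inserted before the
    # extension, e.g. '/data/results.2024.01.01.12.00.00.json'
    stamped = insert_before_extension('/data/results.json')

    assert top_level_folder('/data/results/foo') == '/data'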
@@ -219,58 +322,66 @@ def safe_create_link(link_exists,link_new):
         os.symlink(link_exists,link_new)
 
 
-def get_file_sizes(base_dir, convert_slashes=True):
-    """
-    Get sizes recursively for all files in base_dir, returning a dict mapping
-    relative filenames to size.
-    """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
-                                             return_relative_paths=True)
-
-    fn_to_size = {}
-    for fn_relative in tqdm(relative_filenames):
-        fn_abs = os.path.join(base_dir,fn_relative)
-        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
-    return fn_to_size
-
-
 #%% Image-related path functions
 
-def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
-                  ) -> bool:
+def is_image_file(s, img_extensions=IMG_EXTENSIONS):
     """
     Checks a file's extension against a hard-coded set of image file
-    extensions.
+    extensions. Uses case-insensitive comparison.
 
     Does not check whether the file exists, only determines whether the filename
     implies it's an image file.
+
+    Args:
+        s (str): filename to evaluate for image-ness
+        img_extensions (list, optional): list of known image file extensions
+
+    Returns:
+        bool: True if [s] appears to be an image file, else False
     """
 
     ext = os.path.splitext(s)[1]
     return ext.lower() in img_extensions
 
 
-def find_image_strings(strings: Iterable[str]) -> List[str]:
+def find_image_strings(strings):
     """
     Given a list of strings that are potentially image file names, looks for
     strings that actually look like image file names (based on extension).
+
+    Args:
+        strings (list): list of filenames to check for image-ness
+
+    Returns:
+        list: the subset of [strings] that appear to be image filenames
     """
 
     return [s for s in strings if is_image_file(s)]
 
 
-def find_images(dirname: str, recursive: bool = False,
-                return_relative_paths: bool = False,
-                convert_slashes: bool = False) -> List[str]:
+def find_images(dirname,
+                recursive=False,
+                return_relative_paths=False,
+                convert_slashes=True):
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
-    path separator unless convert_slahes is set, in which case will always
+    path separator unless convert_slashes is set, in which case will always
     use '/'.
+
+    Args:
+        dirname (str): the folder to search for images
+        recursive (bool, optional): whether to search recursively
+        return_relative_paths (bool, optional): return paths that are relative
+            to [dirname], rather than absolute paths
+        convert_slashes (bool, optional): force forward slashes in return values
+
+    Returns:
+        list: list of image filenames found in [dirname]
     """
 
+    assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
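
A typical call to the updated find_images(), which now asserts that the input folder exists and defaults to forward slashes; the folder is hypothetical:

    from md_utils import path_utils

    # Recursively find images, returned as forward-slash paths relative
    # to the input folder
    image_files = path_utils.find_images('/data/camera-traps',
                                         recursive=True,
                                         return_relative_paths=True,
                                         convert_slashes=True)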
@@ -291,16 +402,28 @@ def find_images(dirname: str, recursive: bool = False,
 
 #%% Filename cleaning functions
 
-def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
-                   char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
+def clean_filename(filename,
+                   allow_list=VALID_FILENAME_CHARS,
+                   char_limit=CHAR_LIMIT,
+                   force_lower=False):
     r"""
     Removes non-ASCII and other invalid filename characters (on any
-    reasonable OS) from a filename, then trims to a maximum length.
+    reasonable OS) from a filename, then optionally trims to a maximum length.
 
     Does not allow :\/ by default, use clean_path if you want to preserve those.
 
     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
+
+    Args:
+        filename (str): filename to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    Returns:
+        str: cleaned version of [filename]
     """
 
     # keep only valid ascii chars
@@ -316,37 +439,75 @@ def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
     return cleaned_filename
 
 
-def clean_path(pathname: str, allow_list: str = VALID_PATH_CHARS,
-               char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
+def clean_path(pathname,
+               allow_list=VALID_PATH_CHARS,
+               char_limit=CHAR_LIMIT,
+               force_lower=False):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
-    OS) from a path, then trims to a maximum length.
+    OS) from a path, then optionally trims to a maximum length.
+
+    Args:
+        pathname (str): path name to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    Returns:
+        str: cleaned version of [pathname]
     """
 
     return clean_filename(pathname, allow_list=allow_list,
                           char_limit=char_limit, force_lower=force_lower)
 
 
-def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:
-    """
+def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
+    r"""
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then trims to a maximum length. Replaces all valid
-    separators with '~'.
+    separators with [separator_char_replacement].
+
+    Args:
+        pathname (str): path name to flatten
+        separator_chars (str, optional): string containing all known path separators
+        separator_char_replacement (str, optional): string to insert in place of
+            path separators.
+
+    Returns:
+        str: flattened version of [pathname]
     """
 
     s = clean_path(pathname)
     for c in separator_chars:
-        s = s.replace(c, '~')
+        s = s.replace(c, separator_char_replacement)
     return s
 
 
-#%% Platform-independent way to open files in their associated application
+def is_executable(filename):
+    """
+    Checks whether [filename] is on the system path and marked as executable.
+
+    Args:
+        filename (str): filename to check for executable status
+
+    Returns:
+        bool: True if [filename] is on the system path and marked as executable, otherwise False
+    """
+
+    # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
+
+    return which(filename) is not None
 
-import sys,subprocess,platform,re
+
+#%% Platform-independent way to open files in their associated application
 
 def environment_is_wsl():
     """
-    Returns True if we're running in WSL
+    Determines whether we're running in WSL.
+
+    Returns:
+        True if we're running in WSL.
     """
 
     if sys.platform not in ('linux','posix'):
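
A sketch of the new separator_char_replacement parameter and of is_executable(); the values are hypothetical:

    from md_utils.path_utils import flatten_path, is_executable

    # 'a/b/c.jpg' becomes 'a~b~c.jpg'; the replacement character is now configurable
    flat = flatten_path('a/b/c.jpg', separator_char_replacement='~')

    # True if 'ffmpeg' is on the system path and marked executable
    if is_executable('ffmpeg'):
        print('ffmpeg is available')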
@@ -356,7 +517,7 @@ def environment_is_wsl():
 
 
 def wsl_path_to_windows_path(filename):
-    """
+    r"""
     Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
     converts:
 
@@ -365,6 +526,12 @@ def wsl_path_to_windows_path(filename):
     ...to:
 
         e:\a\b\c
+
+    Args:
+        filename (str): filename to convert
+
+    Returns:
+        str: Windows equivalent to the WSL path [filename]
     """
 
     result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
@@ -373,13 +540,38 @@ def wsl_path_to_windows_path(filename):
         return None
     return result.stdout.strip()
 
-
-def open_file(filename,attempt_to_open_in_wsl_host=False):
+
+def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
     """
-    Opens [filename] in the native OS file handler. If attempt_to_open_in_wsl_host
-    is True, and we're in WSL, attempts to open [filename] in Windows.
+    Opens [filename] in the default OS file handler for this file type.
+
+    If browser_name is not None, uses the webbrowser module to open the filename
+    in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+    for supported browsers. Falls back to the default file handler if webbrowser.open()
+    fails; when a browser is specified, attempt_to_open_in_wsl_host is ignored unless
+    webbrowser.open() fails.
+
+    If browser_name is 'default', uses the system default. This is different from the
+    parameter to webbrowser.get(), where None implies the system default.
+
+    Args:
+        filename (str): file to open
+        attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
+            [filename] in the Windows host environment
+        browser_name: see above
     """
 
+    if browser_name is not None:
+        if browser_name == 'chrome':
+            browser_name = 'google-chrome'
+        elif browser_name == 'default':
+            browser_name = None
+        try:
+            result = webbrowser.get(using=browser_name).open(filename)
+        except Exception:
+            result = False
+        if result:
+            return
+
     if sys.platform == 'win32':
 
         os.startfile(filename)
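
A sketch of the new browser_name parameter; the file path is hypothetical:

    from md_utils.path_utils import open_file

    # Open an HTML report in Chrome ('chrome' is mapped to webbrowser's
    # 'google-chrome'); falls back to the OS default handler if that fails
    open_file('/data/preview/index.html', browser_name='chrome')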
@@ -410,10 +602,14 @@ def open_file(filename,attempt_to_open_in_wsl_host=False):
 
 #%% File list functions
 
-def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
+def write_list_to_file(output_file,strings):
     """
     Writes a list of strings to either a JSON file or text file,
     depending on extension of the given file name.
+
+    Args:
+        output_file (str): file to write
+        strings (list): list of strings to write to [output_file]
     """
 
     with open(output_file, 'w') as f:
@@ -423,9 +619,15 @@ def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
             f.write('\n'.join(strings))
 
 
-def read_list_from_file(filename: str) -> List[str]:
+def read_list_from_file(filename):
     """
     Reads a json-formatted list of strings from a file.
+
+    Args:
+        filename (str): .json filename to read
+
+    Returns:
+        list: list of strings read from [filename]
     """
 
     assert filename.endswith('.json')
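
These two functions round-trip through .json; a minimal sketch with a hypothetical filename:

    from md_utils.path_utils import write_list_to_file, read_list_from_file

    filenames = ['a.jpg', 'b.jpg']
    write_list_to_file('filenames.json', filenames)  # .json extension selects JSON output
    assert read_list_from_file('filenames.json') == filenames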
@@ -437,11 +639,155 @@ def read_list_from_file(filename: str) -> List[str]:
     return file_list
 
 
+def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+    """
+    Internal function for copying files from within parallel_copy_files.
+    """
+
+    assert len(input_output_tuple) == 2
+    source_fn = input_output_tuple[0]
+    target_fn = input_output_tuple[1]
+    if (not overwrite) and (os.path.isfile(target_fn)):
+        if verbose:
+            print('Skipping existing file {}'.format(target_fn))
+        return
+    os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+    shutil.copyfile(source_fn,target_fn)
+
+
+def parallel_copy_files(input_file_to_output_file, max_workers=16,
+                        use_threads=True, overwrite=False, verbose=False):
+    """
+    Copies files from source to target according to the dict input_file_to_output_file.
+
+    Args:
+        input_file_to_output_file (dict): dictionary mapping source files to the target files
+            to which they should be copied
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel copying; ignored if max_workers <= 1
+        overwrite (bool, optional): whether to overwrite existing destination files
+        verbose (bool, optional): enable additional debug output
+    """
+
+    n_workers = min(max_workers,len(input_file_to_output_file))
+
+    # Package the dictionary as a set of 2-tuples
+    input_output_tuples = []
+    for input_fn in input_file_to_output_file:
+        input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_output_tuples)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+                             input_output_tuples)):
+            pbar.update()
+
+# ...def parallel_copy_files(...)
+
+
+def get_file_sizes(base_dir, convert_slashes=True):
+    """
+    Gets sizes recursively for all files in base_dir, returning a dict mapping
+    relative filenames to size.
+
+    TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+    different semantics.
+
+    Args:
+        base_dir (str): folder within which we want all file sizes
+        convert_slashes (bool, optional): force forward slashes in return strings,
+            otherwise uses the native path separator
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                             return_relative_paths=True)
+
+    fn_to_size = {}
+    for fn_relative in tqdm(relative_filenames):
+        fn_abs = os.path.join(base_dir,fn_relative)
+        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+    return fn_to_size
+
+
+def _get_file_size(filename,verbose=False):
+    """
+    Internal function for safely getting the size of a file. Returns a (filename,size)
+    tuple, where size is None if there is an error.
+    """
+
+    try:
+        size = os.path.getsize(filename)
+    except Exception as e:
+        if verbose:
+            print('Error reading file size for {}: {}'.format(filename,str(e)))
+        size = None
+    return (filename,size)
+
+
+def parallel_get_file_sizes(filenames, max_workers=16,
+                            use_threads=True, verbose=False,
+                            recursive=True):
+    """
+    Returns a dictionary mapping every file in [filenames] to the corresponding file size,
+    or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
+
+    Args:
+        filenames (list or str): list of filenames for which we should read sizes, or a folder
+            within which we should read all file sizes recursively
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel size reads; ignored if max_workers <= 1
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    n_workers = min(max_workers,len(filenames))
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    resize_results = list(tqdm(pool.imap(
+        partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+    to_return = {}
+    for r in resize_results:
+        to_return[r[0]] = r[1]
+
+    return to_return
+
+
 #%% Zip functions
 
 def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Zip a single file, by default writing to a new file called [input_fn].zip
+    Zips a single file.
+
+    Args:
+        input_fn (str): file to zip
+        output_fn (str, optional): target zipfile; if this is None, we'll use
+            [input_fn].zip
+        overwrite (bool, optional): whether to overwrite an existing target file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """
 
     basename = os.path.basename(input_fn)
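
A sketch of the new parallel copy/size helpers; all paths are hypothetical:

    from md_utils import path_utils

    # Copy two files to a staging folder, 8 threads at a time
    mapping = {'/data/a.jpg': '/staging/a.jpg',
               '/data/b.jpg': '/staging/b.jpg'}
    path_utils.parallel_copy_files(mapping, max_workers=8, use_threads=True)

    # Sizes (in bytes) for every file in a folder; failed reads map to None
    sizes = path_utils.parallel_get_file_sizes('/staging')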
@@ -451,10 +797,10 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
 
     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
-        return
+        return output_fn
 
     if verbose:
-        print('Zipping {} to {}'.format(input_fn,output_fn))
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
 
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
@@ -463,21 +809,113 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
     return output_fn
 
 
+def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
+                                 overwrite=False, verbose=False, mode='x'):
+    """
+    Adds all the files in [input_files] to the tar file [output_fn].
+    Archive names are relative to arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .tar file
+        output_fn (str): .tar file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
+        mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
+
+    Returns:
+        str: the output tar file, whether we created it or determined that it already exists
+    """
+
+    if os.path.isfile(output_fn):
+        if not overwrite:
+            print('Tar file {} exists, skipping'.format(output_fn))
+            return output_fn
+        else:
+            print('Tar file {} exists, deleting and re-creating'.format(output_fn))
+            os.remove(output_fn)
+
+    if verbose:
+        print('Adding {} files to {} (mode {})'.format(
+            len(input_files),output_fn,mode))
+
+    with tarfile.open(output_fn,mode) as tarf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            tarf.add(input_fn_abs,arcname=input_fn_relative)
+
+    return output_fn
+
+
+def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+                                  overwrite=False, verbose=False, compresslevel=9):
+    """
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .zip file
+        output_fn (str): .zip file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
+    """
+
+    if not overwrite:
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return output_fn
+
+    if verbose:
+        print('Zipping {} files to {} (compression level {})'.format(
+            len(input_files),output_fn,compresslevel))
+
+    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            zipf.write(input_fn_abs,
+                       arcname=input_fn_relative,
+                       compresslevel=compresslevel,
+                       compress_type=zipfile.ZIP_DEFLATED)
+
+    return output_fn
+
+
 def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Recursively zip everything in [input_folder], storing outputs as relative paths.
+    Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
+    paths.
 
-    Defaults to writing to [input_folder].zip
+    Args:
+        input_folder (str): folder to zip
+        output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """
 
     if output_fn is None:
        output_fn = input_folder + '.zip'
 
     if not overwrite:
-        assert not os.path.isfile(output_fn), 'Zip file {} exists'.format(output_fn)
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return
 
     if verbose:
-        print('Zipping {} to {}'.format(input_folder,output_fn))
+        print('Zipping {} to {} (compression level {})'.format(
+            input_folder,output_fn,compresslevel))
 
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
 
@@ -492,10 +930,20 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
     return output_fn
 
 
-def parallel_zip_files(input_files, max_workers=16, use_threads=True):
+def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+                       overwrite=False, verbose=False):
     """
-    Zip one or more files to separate output files in parallel, leaving the
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
+
+    Args:
+        input_files (list): list of files to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
     """
 
     n_workers = min(max_workers,len(input_files))
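
A sketch of the two new single-archive helpers; paths are hypothetical:

    from md_utils import path_utils

    files = ['/data/project/images/a.jpg', '/data/project/images/b.jpg']

    # Archive names are stored relative to arc_name_base, e.g. 'images/a.jpg'
    path_utils.zip_files_into_single_zipfile(files, '/backups/images.zip',
                                             arc_name_base='/data/project')
    path_utils.add_files_to_single_tar_file(files, '/backups/images.tar.gz',
                                            arc_name_base='/data/project',
                                            mode='x:gz')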
@@ -506,15 +954,26 @@ def parallel_zip_files(input_files, max_workers=16, use_threads=True,
         pool = Pool(n_workers)
 
     with tqdm(total=len(input_files)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(zip_file,input_files)):
+        for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+            output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+            input_files)):
             pbar.update()
 
 
 def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
-                         compresslevel=9, overwrite=False):
+                         compresslevel=9, overwrite=False, verbose=False):
     """
-    Zip one or more folders to separate output files in parallel, leaving the
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
+
+    Args:
+        input_folders (list): list of folders to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
     """
 
     n_workers = min(max_workers,len(input_folders))
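
Typical use of the per-folder parallel zipper; the folders are hypothetical:

    from md_utils import path_utils

    # Produces /data/2023-01.zip and /data/2023-02.zip next to the originals
    path_utils.parallel_zip_folders(['/data/2023-01', '/data/2023-02'],
                                    max_workers=4, compresslevel=9)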
@@ -526,15 +985,56 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
 
     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
-            partial(zip_folder,overwrite=overwrite,compresslevel=compresslevel),
+            partial(zip_folder,overwrite=overwrite,
+                    compresslevel=compresslevel,verbose=verbose),
             input_folders)):
             pbar.update()
 
 
+def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+                            compresslevel=9,overwrite=False,required_token=None,verbose=False,
+                            exclude_zip=True):
+    """
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    zip a whole folder into a single zipfile, use zip_folder().
+
+    Args:
+        folder_name (str): the folder within which we should zip files
+        recursive (bool, optional): whether to recurse within [folder_name]
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        required_token (str, optional): only zip files whose names contain this string
+        verbose (bool, optional): enable additional debug console output
+        exclude_zip (bool, optional): skip files ending in .zip
+    """
+
+    assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+    input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+    if required_token is not None:
+        input_files = [fn for fn in input_files if required_token in fn]
+
+    if exclude_zip:
+        input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+    parallel_zip_files(input_files=input_files,max_workers=max_workers,
+                       use_threads=use_threads,compresslevel=compresslevel,
+                       overwrite=overwrite,verbose=verbose)
+
+
 def unzip_file(input_file, output_folder=None):
     """
-    Unzip a zipfile to the specified output folder, defaulting to the same location as
-    the input file
+    Unzips a zipfile to the specified output folder, defaulting to the same location as
+    the input file.
+
+    Args:
+        input_file (str): zipfile to unzip
+        output_folder (str, optional): folder to which we should unzip [input_file], defaults
+            to unzipping to the folder where [input_file] lives
     """
 
     if output_folder is None:
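
A closing sketch of the new zip_each_file_in_folder() next to unzip_file(); paths are hypothetical:

    from md_utils import path_utils

    # Zip every .json results file in a folder to its own filename.zip,
    # skipping anything already zipped
    path_utils.zip_each_file_in_folder('/data/results',
                                       required_token='.json',
                                       exclude_zip=True)

    # Unzip one of them back in place (defaults to the zipfile's own folder)
    path_utils.unzip_file('/data/results/summary.json.zip')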