megadetector 5.0.7__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff reflects the changes between publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.


Files changed (48)
  1. api/batch_processing/data_preparation/manage_local_batch.py +28 -14
  2. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  3. api/batch_processing/postprocessing/compare_batch_results.py +1 -1
  4. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  5. api/batch_processing/postprocessing/load_api_results.py +1 -3
  6. api/batch_processing/postprocessing/md_to_labelme.py +118 -51
  7. api/batch_processing/postprocessing/merge_detections.py +30 -5
  8. api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
  9. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  12. data_management/cct_json_utils.py +7 -2
  13. data_management/coco_to_labelme.py +263 -0
  14. data_management/coco_to_yolo.py +7 -4
  15. data_management/databases/integrity_check_json_db.py +68 -59
  16. data_management/databases/subset_json_db.py +1 -1
  17. data_management/get_image_sizes.py +44 -26
  18. data_management/importers/animl_results_to_md_results.py +1 -3
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/labelme_to_coco.py +252 -143
  21. data_management/labelme_to_yolo.py +95 -52
  22. data_management/lila/create_lila_blank_set.py +106 -23
  23. data_management/lila/download_lila_subset.py +133 -65
  24. data_management/lila/generate_lila_per_image_labels.py +1 -1
  25. data_management/lila/lila_common.py +8 -38
  26. data_management/read_exif.py +65 -16
  27. data_management/remap_coco_categories.py +84 -0
  28. data_management/resize_coco_dataset.py +3 -22
  29. data_management/wi_download_csv_to_coco.py +239 -0
  30. data_management/yolo_to_coco.py +283 -83
  31. detection/run_detector_batch.py +12 -3
  32. detection/run_inference_with_yolov5_val.py +10 -3
  33. detection/run_tiled_inference.py +2 -2
  34. detection/tf_detector.py +2 -1
  35. detection/video_utils.py +1 -1
  36. md_utils/ct_utils.py +22 -3
  37. md_utils/md_tests.py +11 -2
  38. md_utils/path_utils.py +206 -32
  39. md_utils/url_utils.py +66 -1
  40. md_utils/write_html_image_list.py +12 -3
  41. md_visualization/visualization_utils.py +363 -72
  42. md_visualization/visualize_db.py +33 -10
  43. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
  44. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
  45. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  46. md_visualization/visualize_megadb.py +0 -183
  47. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  48. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
@@ -245,7 +245,8 @@ def process_images(im_files, detector, confidence_threshold, use_image_queue=Fal
  quiet=False, image_size=None, checkpoint_queue=None, include_image_size=False,
  include_image_timestamp=False, include_exif_data=False):
  """
- Runs MegaDetector over a list of image files.
+ Runs MegaDetector over a list of image files. As of 3/2024, this entry point is used when the
+ image queue is enabled, but not in the standard inference path (which loops over process_image()).

  Args
  - im_files: list of str, paths to image files
@@ -269,7 +270,7 @@ def process_images(im_files, detector, confidence_threshold, use_image_queue=Fal
  include_image_size=include_image_size,
  include_image_timestamp=include_image_timestamp,
  include_exif_data=include_exif_data)
- else:
+ else:
  results = []
  for im_file in im_files:
  result = process_image(im_file, detector, confidence_threshold,
@@ -662,7 +663,7 @@ def get_image_datetime(image):

  def write_results_to_file(results, output_file, relative_path_base=None,
  detector_file=None, info=None, include_max_conf=False,
- custom_metadata=None):
+ custom_metadata=None, force_forward_slashes=True):
  """
  Writes list of detection results to JSON output file. Format matches:

@@ -692,6 +693,14 @@ def write_results_to_file(results, output_file, relative_path_base=None,
  results_relative.append(r_relative)
  results = results_relative

+ if force_forward_slashes:
+ results_converted = []
+ for r in results:
+ r_converted = copy.copy(r)
+ r_converted['file'] = r_converted['file'].replace('\\','/')
+ results_converted.append(r_converted)
+ results = results_converted
+
  # The typical case: we need to build the 'info' struct
  if info is None:

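For context on the new force_forward_slashes behavior, here is a minimal usage sketch. It assumes the module is importable as detection.run_detector_batch, per the file list above, and uses a made-up results list and output path, so treat it as illustrative rather than canonical.

    from detection.run_detector_batch import write_results_to_file

    # Hypothetical result entry with a Windows-style path
    results = [{'file': 'camera01\\day1\\IMG_0001.JPG', 'detections': []}]

    # With force_forward_slashes=True (the new default), the 'file' values are
    # written as 'camera01/day1/IMG_0001.JPG' in the output JSON
    write_results_to_file(results, 'md_results.json', detector_file='md_v5a.0.0.pt')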
@@ -105,6 +105,8 @@ class YoloInferenceOptions:
  treat_copy_failures_as_warnings = False

  save_yolo_debug_output = False
+
+ recursive = True


  #%% Main function
@@ -203,7 +205,7 @@ def run_inference_with_yolo_val(options):
  ##%% Enumerate images

  if os.path.isdir(options.input_folder):
- image_files_absolute = path_utils.find_images(options.input_folder,recursive=True)
+ image_files_absolute = path_utils.find_images(options.input_folder,recursive=options.recursive)
  else:
  assert os.path.isfile(options.input_folder)
  with open(options.input_folder,'r') as f:
@@ -381,7 +383,7 @@ def run_inference_with_yolo_val(options):
  # YOLO console output contains lots of ANSI escape codes, remove them for easier parsing
  yolo_console_output = [string_utils.remove_ansi_codes(s) for s in yolo_console_output]

- # Find errors that occrred during the initial corruption check; these will not be included in the
+ # Find errors that occurred during the initial corruption check; these will not be included in the
  # output. Errors that occur during inference will be handled separately.
  yolo_read_failures = []

@@ -518,7 +520,7 @@ def main():
  help='inference batch size (default {})'.format(options.batch_size))
  parser.add_argument(
  '--half_precision_enabled', default=None, type=int,
- help='use half-precision-inference (1 or 0) (default is the underlying model\'s default, probably half for YOLOv8 and full for YOLOv8')
+ help='use half-precision-inference (1 or 0) (default is the underlying model\'s default, probably full for YOLOv8 and half for YOLOv5')
  parser.add_argument(
  '--device_string', default=options.device_string, type=str,
  help='CUDA device specifier, typically "0" or "1" for CUDA devices, "mps" for M1/M2 devices, or "cpu" (default {})'.format(options.device_string))
@@ -553,6 +555,10 @@ def main():
  '--save_yolo_debug_output', action='store_true',
  help='write yolo console output to a text file in the results folder, along with additional debug files')

+ parser.add_argument(
+ '--nonrecursive', action='store_true',
+ help='Disable recursive folder processing')
+
  parser.add_argument(
  '--preview_yolo_command_only', action='store_true',
  help='don\'t run inference, just preview the YOLO inference command (still creates symlinks)')
@@ -592,6 +598,7 @@ def main():
  if args.yolo_dataset_file is not None:
  options.yolo_category_id_to_name = args.yolo_dataset_file

+ options.recursive = (not options.nonrecursive)
  options.remove_symlink_folder = (not options.no_remove_symlink_folder)
  options.remove_yolo_results_folder = (not options.no_remove_yolo_results_folder)
  options.use_symlinks = (not options.no_use_symlinks)
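A sketch of the new recursive option at the API level (the CLI equivalent is the new --nonrecursive flag). Only input_folder and recursive appear in the diff above; model_filename and output_file are assumed attribute names, and the paths are placeholders.

    from detection.run_inference_with_yolov5_val import (
        YoloInferenceOptions, run_inference_with_yolo_val)

    options = YoloInferenceOptions()
    options.input_folder = '/data/camera-traps'         # placeholder folder
    options.model_filename = 'md_v5a.0.0.pt'            # assumed attribute name
    options.output_file = '/data/camera-traps-md.json'  # assumed attribute name
    options.recursive = False                           # new in 5.0.8; previously always recursive

    run_inference_with_yolo_val(options)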
@@ -823,12 +823,12 @@
  '--overwrite_handling',
  type=str,
  default='skip',
- help=('behavior when the targt file exists (skip/overwrite/error) (default skip)'))
+ help=('Behavior when the target file exists (skip/overwrite/error) (default skip)'))
  parser.add_argument(
  '--image_list',
  type=str,
  default=None,
- help=('a .json list of relative filenames (or absolute paths contained within image_folder) to include'))
+ help=('A .json list of relative filenames (or absolute paths contained within image_folder) to include'))

  if len(sys.argv[1:]) == 0:
  parser.print_help()
detection/tf_detector.py CHANGED
@@ -122,7 +122,8 @@ class TFDetector:
122
122
  detection_threshold: confidence above which to include the detection proposal
123
123
 
124
124
  Returns:
125
- A dict with the following fields, see the 'images' key in https://github.com/agentmorris/MegaDetector/tree/master/api/batch_processing#batch-processing-api-output-format
125
+ A dict with the following fields, see the 'images' key in:
126
+ https://github.com/agentmorris/MegaDetector/tree/master/api/batch_processing#batch-processing-api-output-format
126
127
  - 'file' (always present)
127
128
  - 'max_detection_conf'
128
129
  - 'detections', which is a list of detection objects containing keys 'category', 'conf' and 'bbox'
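For reference, a per-image dict in the batch-output format linked above looks roughly like the following (a hypothetical example; coordinates are normalized [x_min, y_min, width, height], and category '1' is 'animal').

    example_image_result = {
        'file': 'camera01/IMG_0001.JPG',
        'max_detection_conf': 0.971,
        'detections': [
            {'category': '1', 'conf': 0.971, 'bbox': [0.412, 0.318, 0.207, 0.261]}
        ]
    }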
detection/video_utils.py CHANGED
@@ -310,7 +310,7 @@ def video_folder_to_frames(input_folder:str, output_folder_base:str,

  class FrameToVideoOptions:

- # zero-indexed
+ # One-indexed, i.e. "1" means "use the confidence value from the highest-confidence frame"
  nth_highest_confidence = 1

  # 'error' or 'skip_with_warning'
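To illustrate the corrected comment, a quick sketch of the one-indexed semantics; this is illustrative logic with made-up values, not the library's implementation.

    # Per-frame confidences for one category in a single video
    frame_confidences = [0.91, 0.84, 0.95, 0.20]

    nth_highest_confidence = 1  # one-indexed: 1 == highest-confidence frame

    # Selecting the nth-highest value: 1 -> 0.95, 2 -> 0.91, etc.
    video_confidence = sorted(frame_confidences, reverse=True)[nth_highest_confidence - 1]
    print(video_confidence)  # 0.95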
md_utils/ct_utils.py CHANGED
@@ -39,9 +39,13 @@ def truncate_float_array(xs, precision=3):

  def truncate_float(x, precision=3):
  """
- Truncates a floating-point value to a specific number of significant digits.
+ Truncates the fractional portion of a floating-point value to a specific number of
+ floating-point digits.

- For example: truncate_float(0.0003214884) --> 0.000321
+ For example:
+
+ truncate_float(0.0003214884) --> 0.000321
+ truncate_float(1.0003214884) --> 1.000321

  This function is primarily used to achieve a certain float representation
  before exporting to JSON.
@@ -58,13 +62,18 @@ def truncate_float(x, precision=3):

  return 0

+ elif (x > 1):
+
+ fractional_component = x - 1.0
+ return 1 + truncate_float(fractional_component)
+
  else:

  # Determine the factor, which shifts the decimal point of x
  # just behind the last significant digit.
  factor = math.pow(10, precision - 1 - math.floor(math.log10(abs(x))))

- # Shift decimal point by multiplicatipon with factor, flooring, and
+ # Shift decimal point by multiplication with factor, flooring, and
  # division by factor.
  return math.floor(x * factor)/factor

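A small usage sketch of the revised behavior, assuming md_utils is a top-level package in this wheel as the file list suggests:

    from md_utils.ct_utils import truncate_float

    # Values below 1 are truncated to [precision] significant digits...
    print(truncate_float(0.0003214884))               # 0.000321
    # ...and as of this release, values above 1 keep their integer part and have
    # the same truncation applied to the fractional part.
    print(truncate_float(1.0003214884))               # 1.000321
    print(truncate_float(0.0003214884, precision=2))  # 0.00032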
@@ -174,6 +183,7 @@ def convert_xywh_to_xyxy(api_bbox):
  Converts an xywh bounding box to an xyxy bounding box.

  Note that this is also different from the TensorFlow Object Detection API coords format.
+
  Args:
  api_bbox: bbox output by the batch processing API [x_min, y_min, width_of_box, height_of_box]

@@ -352,6 +362,15 @@ def split_list_into_n_chunks(L, n, chunk_strategy='greedy'):
  raise ValueError('Invalid chunk strategy: {}'.format(chunk_strategy))


+ def sort_dictionary_by_key(d,reverse=False):
+ """
+ Sorts the dictionary [d] by key.
+ """
+
+ d = dict(sorted(d.items(),reverse=reverse))
+ return d
+
+
  def sort_dictionary_by_value(d,sort_values=None,reverse=False):
  """
  Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
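A quick sketch of the new helper alongside the existing sort_dictionary_by_value, again assuming the md_utils.ct_utils import path:

    from md_utils.ct_utils import sort_dictionary_by_key, sort_dictionary_by_value

    counts = {'deer': 10, 'bear': 3, 'coyote': 7}

    print(sort_dictionary_by_key(counts))
    # {'bear': 3, 'coyote': 7, 'deer': 10}

    print(sort_dictionary_by_value(counts, reverse=True))
    # {'deer': 10, 'coyote': 7, 'bear': 3}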
md_utils/md_tests.py CHANGED
@@ -86,11 +86,14 @@ def get_expected_results_filename(gpu_is_available):
  return 'md-test-results-{}-{}.json'.format(hw_string,pt_string)


- def download_test_data(options):
+ def download_test_data(options=None):
  """
  Download the test zipfile if necessary, unzip if necessary.
  """
-
+
+ if options is None:
+ options = MDTestOptions()
+
  if options.scratch_dir is None:
  tempdir_base = tempfile.gettempdir()
  scratch_dir = os.path.join(tempdir_base,'md-tests')
@@ -160,6 +163,8 @@ def download_test_data(options):
  options.test_videos = [fn for fn in test_files if os.path.splitext(fn.lower())[1] in ('.mp4','.avi')]
  options.test_videos = [fn for fn in options.test_videos if 'rendered' not in fn]

+ print('Finished unzipping and enumerating test data')
+
  # ...def download_test_data(...)


@@ -840,6 +845,10 @@ def main():
  type=str,
  default=None,
  help='Working directory for CLI tests')
+
+ # token used for linting
+ #
+ # no_arguments_required

  args = parser.parse_args()

md_utils/path_utils.py CHANGED
@@ -12,11 +12,17 @@
  import glob
  import ntpath
  import os
+ import sys
+ import platform
  import posixpath
  import string
  import json
+ import shutil
  import unicodedata
  import zipfile
+ import webbrowser
+ import subprocess
+ import re

  from zipfile import ZipFile
  from datetime import datetime
@@ -43,6 +49,8 @@ def recursive_file_list(base_dir, convert_slashes=True,
  \ to /
  """

+ assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
  all_files = []

  if recursive:
@@ -219,23 +227,6 @@ def safe_create_link(link_exists,link_new):
  os.symlink(link_exists,link_new)


- def get_file_sizes(base_dir, convert_slashes=True):
- """
- Get sizes recursively for all files in base_dir, returning a dict mapping
- relative filenames to size.
- """
-
- relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
- return_relative_paths=True)
-
- fn_to_size = {}
- for fn_relative in tqdm(relative_filenames):
- fn_abs = os.path.join(base_dir,fn_relative)
- fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
- return fn_to_size
-
-
  #%% Image-related path functions

  def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
@@ -267,10 +258,12 @@ def find_images(dirname: str, recursive: bool = False,
  """
  Finds all files in a directory that look like image file names. Returns
  absolute paths unless return_relative_paths is set. Uses the OS-native
- path separator unless convert_slahes is set, in which case will always
+ path separator unless convert_slashes is set, in which case will always
  use '/'.
  """

+ assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
  if recursive:
  strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
  else:
@@ -342,8 +335,6 @@ def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:

  #%% Platform-independent way to open files in their associated application

- import sys,subprocess,platform,re
-
  def environment_is_wsl():
  """
  Returns True if we're running in WSL
@@ -373,13 +364,35 @@ def wsl_path_to_windows_path(filename):
  return None
  return result.stdout.strip()

-
- def open_file(filename,attempt_to_open_in_wsl_host=False):
+
+ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
  """
- Opens [filename] in the native OS file handler. If attempt_to_open_in_wsl_host
- is True, and we're in WSL, attempts to open [filename] in Windows.
+ Opens [filename] in the default OS file handler for this file type.
+
+ If attempt_to_open_in_wsl_host is True, and we're in WSL, attempts to open
+ [filename] in the Windows host environment.
+
+ If browser_name is not None, uses the webbrowser module to open the filename
+ in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+ for supported browsers. Falls back to the default file handler if webbrowser.open()
+ fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
+
+ If browser_name is 'default', use the system default. This is different from the
+ parameter to webbrowser.get(), where None implies the system default.
  """

+ if browser_name is not None:
+ if browser_name == 'chrome':
+ browser_name = 'google-chrome'
+ elif browser_name == 'default':
+ browser_name = None
+ try:
+ result = webbrowser.get(using=browser_name).open(filename)
+ except Exception:
+ result = False
+ if result:
+ return
+
  if sys.platform == 'win32':

  os.startfile(filename)
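A short sketch of the new browser_name parameter; the path is a placeholder and browser availability is environment-dependent, so this just shows the call pattern.

    from md_utils.path_utils import open_file

    # Open an HTML report in Chrome if the webbrowser module can find it;
    # falls back to the OS default handler if webbrowser.open() fails
    open_file('/tmp/md-preview/index.html', browser_name='chrome')

    # 'default' forces the system-default browser via the webbrowser module
    open_file('/tmp/md-preview/index.html', browser_name='default')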
@@ -437,6 +450,107 @@ def read_list_from_file(filename: str) -> List[str]:
  return file_list


+ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+ assert len(input_output_tuple) == 2
+ source_fn = input_output_tuple[0]
+ target_fn = input_output_tuple[1]
+ if (not overwrite) and (os.path.isfile(target_fn)):
+ if verbose:
+ print('Skipping existing file {}'.format(target_fn))
+ return
+ os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+ shutil.copyfile(source_fn,target_fn)
+
+
+ def parallel_copy_files(input_file_to_output_file, max_workers=16,
+ use_threads=True, overwrite=False, verbose=False):
+ """
+ Copy files from source to target according to the dict input_file_to_output_file.
+ """
+
+ n_workers = min(max_workers,len(input_file_to_output_file))
+
+ # Package the dictionary as a set of 2-tuples
+ input_output_tuples = []
+ for input_fn in input_file_to_output_file:
+ input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+ if use_threads:
+ pool = ThreadPool(n_workers)
+ else:
+ pool = Pool(n_workers)
+
+ with tqdm(total=len(input_output_tuples)) as pbar:
+ for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+ input_output_tuples)):
+ pbar.update()
+
+ # ...def parallel_copy_files(...)
+
+
+ def get_file_sizes(base_dir, convert_slashes=True):
+ """
+ Get sizes recursively for all files in base_dir, returning a dict mapping
+ relative filenames to size.
+
+ TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+ different semantics.
+ """
+
+ relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+ return_relative_paths=True)
+
+ fn_to_size = {}
+ for fn_relative in tqdm(relative_filenames):
+ fn_abs = os.path.join(base_dir,fn_relative)
+ fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+ return fn_to_size
+
+
+ def _get_file_size(filename,verbose=False):
+ """
+ Internal function for safely getting the size of a file. Returns a (filename,size)
+ tuple, where size is None if there is an error.
+ """
+
+ try:
+ size = os.path.getsize(filename)
+ except Exception as e:
+ if verbose:
+ print('Error reading file size for {}: {}'.format(filename,str(e)))
+ size = None
+ return (filename,size)
+
+
+ def parallel_get_file_sizes(filenames, max_workers=16,
+ use_threads=True, verbose=False,
+ recursive=True):
+ """
+ Return a dictionary mapping every file in [filenames] to the corresponding file size,
+ or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
+ """
+
+ n_workers = min(max_workers,len(filenames))
+
+ if isinstance(filenames,str) and os.path.isdir(filenames):
+ filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+ if use_threads:
+ pool = ThreadPool(n_workers)
+ else:
+ pool = Pool(n_workers)
+
+ resize_results = list(tqdm(pool.imap(
+ partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+ to_return = {}
+ for r in resize_results:
+ to_return[r[0]] = r[1]
+
+ return to_return
+
+
  #%% Zip functions

  def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
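A usage sketch for the new copy/size helpers; the folder names are placeholders and the md_utils.path_utils import path is assumed from the file list.

    from md_utils.path_utils import parallel_copy_files, parallel_get_file_sizes

    # Copy files according to a source-to-destination mapping;
    # overwrite=False skips files that already exist at the destination
    input_file_to_output_file = {
        '/data/in/a.jpg': '/data/out/a.jpg',
        '/data/in/b.jpg': '/data/out/b.jpg'
    }
    parallel_copy_files(input_file_to_output_file, max_workers=8, overwrite=False)

    # Either a list of files or a folder can be passed; unreadable files map to None
    size_by_file = parallel_get_file_sizes('/data/out', max_workers=8)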
@@ -454,7 +568,7 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
  return

  if verbose:
- print('Zipping {} to {}'.format(input_fn,output_fn))
+ print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))

  with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
  zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
@@ -463,9 +577,37 @@
  return output_fn


+ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+ overwrite=False, verbose=False, compresslevel=9):
+ """
+ Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+ arc_name_base.
+ """
+
+ if not overwrite:
+ if os.path.isfile(output_fn):
+ print('Zip file {} exists, skipping'.format(output_fn))
+ return
+
+ if verbose:
+ print('Zipping {} files to {} (compression level {})'.format(
+ len(input_files),output_fn,compresslevel))
+
+ with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+ for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+ input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+ zipf.write(input_fn_abs,
+ arcname=input_fn_relative,
+ compresslevel=compresslevel,
+ compress_type=zipfile.ZIP_DEFLATED)
+
+ return output_fn
+
+
  def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
  """
- Recursively zip everything in [input_folder], storing outputs as relative paths.
+ Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
+ paths.

  Defaults to writing to [input_folder].zip
  """
@@ -474,10 +616,13 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
  output_fn = input_folder + '.zip'

  if not overwrite:
- assert not os.path.isfile(output_fn), 'Zip file {} exists'.format(output_fn)
+ if os.path.isfile(output_fn):
+ print('Zip file {} exists, skipping'.format(output_fn))
+ return

  if verbose:
- print('Zipping {} to {}'.format(input_folder,output_fn))
+ print('Zipping {} to {} (compression level {})'.format(
+ input_folder,output_fn,compresslevel))

  relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)

@@ -492,7 +637,8 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
  return output_fn


- def parallel_zip_files(input_files, max_workers=16, use_threads=True):
+ def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+ overwrite=False, verbose=False):
  """
  Zip one or more files to separate output files in parallel, leaving the
  original files in place. Each file is zipped to [filename].zip.
@@ -506,12 +652,14 @@ def parallel_zip_files(input_files, max_workers=16, use_threads=True):
  pool = Pool(n_workers)

  with tqdm(total=len(input_files)) as pbar:
- for i,_ in enumerate(pool.imap_unordered(zip_file,input_files)):
+ for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+ output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+ input_files)):
  pbar.update()


  def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
- compresslevel=9, overwrite=False):
+ compresslevel=9, overwrite=False, verbose=False):
  """
  Zip one or more folders to separate output files in parallel, leaving the
  original folders in place. Each folder is zipped to [folder_name].zip.
@@ -526,11 +674,37 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,

  with tqdm(total=len(input_folders)) as pbar:
  for i,_ in enumerate(pool.imap_unordered(
- partial(zip_folder,overwrite=overwrite,compresslevel=compresslevel),
+ partial(zip_folder,overwrite=overwrite,
+ compresslevel=compresslevel,verbose=verbose),
  input_folders)):
  pbar.update()


+ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+ compresslevel=9,overwrite=False,required_token=None,verbose=False,
+ exclude_zip=True):
+ """
+ Zip each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To zip a whole
+ folder into a single zipfile, use zip_folder().
+
+ If required_token is not None, include only files that contain that token.
+ """
+
+ assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+ input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+ if required_token is not None:
+ input_files = [fn for fn in input_files if required_token in fn]
+
+ if exclude_zip:
+ input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+ parallel_zip_files(input_files=input_files,max_workers=max_workers,
+ use_threads=use_threads,compresslevel=compresslevel,
+ overwrite=overwrite,verbose=verbose)
+
+
  def unzip_file(input_file, output_folder=None):
  """
  Unzip a zipfile to the specified output folder, defaulting to the same location as
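A sketch of the new and extended zip helpers, using placeholder paths and behavior per the docstrings above:

    from md_utils.path_utils import (zip_folder, zip_each_file_in_folder,
                                     zip_files_into_single_zipfile)

    # One zipfile for a whole folder tree (skips if /data/results.zip already exists)
    zip_folder('/data/results', overwrite=False, verbose=True)

    # One zipfile per matching file, e.g. results.json -> results.json.zip
    zip_each_file_in_folder('/data/results', required_token='.json', overwrite=False)

    # An explicit file list into a single archive, with names relative to /data
    zip_files_into_single_zipfile(['/data/results/a.json', '/data/results/b.json'],
                                  output_fn='/data/results.zip', arc_name_base='/data')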
md_utils/url_utils.py CHANGED
@@ -16,6 +16,7 @@ import requests

  from tqdm import tqdm
  from urllib.parse import urlparse
+ from multiprocessing.pool import ThreadPool

  url_utils_temp_dir = None
  max_path_len = 255
@@ -109,7 +110,14 @@ def download_url(url, destination_filename=None, progress_updater=None,

  def download_relative_filename(url, output_base, verbose=False):
  """
- Download a URL to output_base, preserving relative path
+ Download a URL to output_base, preserving relative path. Path is relative to
+ the site, so:
+
+ https://abc.com/xyz/123.txt
+
+ ...will get downloaded to:
+
+ output_base/xyz/123.txt
  """

  p = urlparse(url)
@@ -119,6 +127,63 @@ def download_relative_filename(url, output_base, verbose=False):
  download_url(url, destination_filename, verbose=verbose)


+ def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
+ n_workers=20):
+ """
+ Download a list of URLs to local files. url_to_target_file should
+ be a dict mapping URLs to output files. Catches exceptions and reports
+ them in the returned "results" array.
+ """
+
+ def _do_parallelized_download(download_info,overwrite=False):
+ url = download_info['url']
+ target_file = download_info['target_file']
+ result = {'status':'unknown','url':url,'target_file':target_file}
+
+ if ((os.path.isfile(target_file)) and (not overwrite)):
+ result['status'] = 'skipped'
+ return result
+ try:
+ download_url(url=url,
+ destination_filename=target_file,
+ verbose=verbose, force_download=overwrite)
+ except Exception as e:
+ print('Warning: error downloading URL {}: {}'.format(
+ url,str(e)))
+ result['status'] = 'error: {}'.format(str(e))
+ return result
+
+ result['status'] = 'success'
+ return result
+
+ all_download_info = []
+ for url in url_to_target_file:
+ download_info = {}
+ download_info['url'] = url
+ download_info['target_file'] = url_to_target_file[url]
+ all_download_info.append(download_info)
+
+ print('Downloading {} images on {} workers'.format(
+ len(all_download_info),n_workers))
+
+ if n_workers <= 1:
+
+ results = []
+
+ for download_info in tqdm(all_download_info):
+ result = _do_parallelized_download(download_info,overwrite=overwrite)
+ results.append(result)
+
+ else:
+
+ pool = ThreadPool(n_workers)
+ results = list(tqdm(pool.imap(lambda download_info: _do_parallelized_download(
+ download_info,overwrite=overwrite),all_download_info),
+ total=len(all_download_info)))
+
+ return results
+
+
  def test_urls(urls, error_on_failure=True):
  """
  Verify that a list of URLs is available (returns status 200). By default,
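Finally, a usage sketch for the new parallel downloader; the URLs and paths are placeholders, and the md_utils.url_utils import path is assumed per the file list.

    from md_utils.url_utils import parallel_download_urls

    url_to_target_file = {
        'https://example.com/images/0001.jpg': '/data/downloads/0001.jpg',
        'https://example.com/images/0002.jpg': '/data/downloads/0002.jpg'
    }

    # Each result dict reports 'success', 'skipped' (file exists and overwrite=False),
    # or 'error: ...' for that URL
    results = parallel_download_urls(url_to_target_file, n_workers=10, overwrite=False)
    print(sum(r['status'] == 'success' for r in results))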