megadetector-5.0.19-py3-none-any.whl → megadetector-5.0.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)
  1. megadetector/data_management/importers/bellevue_to_json.py +0 -1
  2. megadetector/data_management/importers/osu-small-animals-to-json.py +364 -0
  3. megadetector/data_management/lila/generate_lila_per_image_labels.py +1 -1
  4. megadetector/data_management/lila/get_lila_annotation_counts.py +2 -0
  5. megadetector/data_management/lila/lila_common.py +28 -12
  6. megadetector/data_management/lila/test_lila_metadata_urls.py +17 -8
  7. megadetector/data_management/read_exif.py +73 -0
  8. megadetector/data_management/yolo_output_to_md_output.py +18 -5
  9. megadetector/detection/process_video.py +84 -16
  10. megadetector/detection/run_detector.py +36 -13
  11. megadetector/detection/run_detector_batch.py +104 -15
  12. megadetector/detection/run_inference_with_yolov5_val.py +20 -23
  13. megadetector/detection/video_utils.py +79 -44
  14. megadetector/postprocessing/combine_api_outputs.py +1 -1
  15. megadetector/postprocessing/detector_calibration.py +367 -0
  16. megadetector/postprocessing/md_to_coco.py +2 -1
  17. megadetector/postprocessing/postprocess_batch_results.py +32 -20
  18. megadetector/postprocessing/validate_batch_results.py +118 -58
  19. megadetector/taxonomy_mapping/map_new_lila_datasets.py +8 -3
  20. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +3 -2
  21. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -1
  22. megadetector/utils/ct_utils.py +20 -0
  23. megadetector/utils/md_tests.py +63 -17
  24. megadetector/utils/path_utils.py +139 -30
  25. megadetector/utils/write_html_image_list.py +16 -5
  26. megadetector/visualization/visualization_utils.py +126 -23
  27. megadetector/visualization/visualize_db.py +104 -63
  28. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/METADATA +2 -2
  29. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/RECORD +32 -32
  30. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/WHEEL +1 -1
  31. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  32. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +0 -677
  33. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/LICENSE +0 -0
  34. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/top_level.txt +0 -0
megadetector/utils/path_utils.py

@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -236,6 +237,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +321,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-    #%%
-
-    def safe_create_link(link_exists,link_new):
-        """
-        Creates a symlink at [link_new] pointing to [link_exists].
-
-        If [link_new] already exists, make sure it's a link (not a file),
-        and if it has a different target than [link_exists], removes and re-creates
-        it.
-
-        Errors if [link_new] already exists but it's not a link.
-
-        Args:
-            link_exists (str): the source of the (possibly-new) symlink
-            link_new (str): the target of the (possibly-new) symlink
-        """
-
-        if os.path.exists(link_new) or os.path.islink(link_new):
-            assert os.path.islink(link_new)
-            if not os.readlink(link_new) == link_exists:
-                os.remove(link_new)
-                os.symlink(link_exists,link_new)
-        else:
-            os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
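The net effect of these two hunks is that safe_create_link moves out of the interactive `if False:` test cell and becomes a public function in megadetector.utils.path_utils. A minimal usage sketch (the paths here are hypothetical):

```python
from megadetector.utils.path_utils import safe_create_link

# Hypothetical paths: keep a stable 'latest.json' name pointing at the most
# recent results file. If latest.json exists as a symlink with a different
# target, it is removed and re-created; if it exists as a regular file, the
# function's assert fires.
safe_create_link('/results/md-run-2024-06-01.json', '/results/latest.json')
```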
@@ -598,7 +598,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-        
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +651,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
@@ -667,7 +673,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable additionald debug output
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
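For context, a sketch of how _copy_file is typically reached via parallel_copy_files, whose signature appears in the hunk above (the file paths are hypothetical):

```python
from megadetector.utils.path_utils import parallel_copy_files

# Hypothetical source-to-destination mapping
input_file_to_output_file = {
    '/data/raw/cam01/img_0001.jpg': '/data/flat/cam01_img_0001.jpg',
    '/data/raw/cam02/img_0001.jpg': '/data/flat/cam02_img_0001.jpg'
}

# With verbose=True, 5.0.21 now prints a line for every file copied, in
# addition to the (reworded) message for files skipped when overwrite=False
parallel_copy_files(input_file_to_output_file,
                    max_workers=8,
                    use_threads=True,
                    overwrite=False,
                    verbose=True)
```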
@@ -750,7 +756,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable additionald debug output
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -804,6 +810,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
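A usage sketch for parallel_get_file_sizes, based only on the parameters visible in the docstring hunk above (the folder path is hypothetical, and the exact return semantics are assumed from the docstring fragment):

```python
from megadetector.utils.path_utils import parallel_get_file_sizes

# Hypothetical folder; returns a dict mapping each enumerated file to its size
filename_to_size = parallel_get_file_sizes('/data/camera-traps',
                                           max_workers=8,
                                           use_threads=True,
                                           recursive=True,
                                           convert_slashes=True,
                                           return_relative_paths=True)
```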
@@ -1075,3 +1083,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
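The hashing helpers above are new in 5.0.21. A short sketch of both entry points (paths hypothetical):

```python
from collections import defaultdict
from megadetector.utils.path_utils import (
    compute_file_hash, parallel_compute_file_hashes)

# Hash one file; with allow_failures=True (the default), unreadable files
# yield None instead of raising
h = compute_file_hash('/data/images/img_0001.jpg', algorithm='sha256')

# Hash a whole (hypothetical) folder in parallel
filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                max_workers=8,
                                                algorithm='md5')

# One natural application: group files by hash to find exact duplicates
hash_to_files = defaultdict(list)
for fn, file_hash in filename_to_hash.items():
    if file_hash is not None:
        hash_to_files[file_hash].append(fn)
duplicates = {k: v for k, v in hash_to_files.items() if len(v) > 1}
```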
megadetector/utils/write_html_image_list.py

@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
     options (dict, optional): a dict with one or more of the following fields:
 
         - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+        - pageTitle (HTML page title)
         - headerHtml (html text to include before the image list)
         - trailerHtml (html text to include after the image list)
         - defaultImageStyle (default css style for images)
@@ -60,11 +61,14 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +118,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-                'You can''t supply your own file handle if we have to page the image set')
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +128,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-        fMeta.write('<html><body>\n')
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -170,7 +177,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-    fHtml.write('<html><body>\n')
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
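Taken together, these four hunks add the pageTitle option and fix the paging error message, whose old doubled-apostrophe literal ('You can''t ...') was a Pascal-ism that Python parses as two adjacent strings, rendering as "You cant ...". A sketch of the new option (image filenames hypothetical):

```python
from megadetector.utils.write_html_image_list import write_html_image_list

# Hypothetical image list; entries can also be dicts with per-image options
images = ['cam01/img_0001.jpg', 'cam01/img_0002.jpg']

# New in 5.0.21: pageTitle is written into an HTML <title>; previously the
# page header was a bare '<html><body>' with no <head> or <title> at all
write_html_image_list(filename='preview.html',
                      images=images,
                      options={'pageTitle': 'Camera 01 preview'})
```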
megadetector/visualization/visualization_utils.py

@@ -672,6 +672,36 @@ def draw_bounding_boxes_on_image(image,
 # ...draw_bounding_boxes_on_image(...)
 
 
+def get_text_size(font,s):
+    """
+    Get the expected width and height when rendering the string [s] in the font
+    [font].
+
+    Args:
+        font (PIL.ImageFont): the font whose size we should query
+        s (str): the string whose size we should query
+
+    Returns:
+        tuple: (w,h), both floats in pixel coordinates
+    """
+
+    # This is what we did w/Pillow 9
+    # w,h = font.getsize(s)
+
+    # I would *think* this would be the equivalent for Pillow 10
+    # l,t,r,b = font.getbbox(s); w = r-l; h=b-t
+
+    # ...but this actually produces the most similar results to Pillow 9
+    # l,t,r,b = font.getbbox(s); w = r; h=b
+
+    try:
+        l,t,r,b = font.getbbox(s); w = r; h=b
+    except Exception:
+        w,h = font.getsize(s)
+
+    return w,h
+
+
 def draw_bounding_box_on_image(image,
                                ymin,
                                xmin,
@@ -773,24 +803,6 @@ def draw_bounding_box_on_image(image,
     except IOError:
         font = ImageFont.load_default()
 
-    def get_text_size(font,s):
-
-        # This is what we did w/Pillow 9
-        # w,h = font.getsize(s)
-
-        # I would *think* this would be the equivalent for Pillow 10
-        # l,t,r,b = font.getbbox(s); w = r-l; h=b-t
-
-        # ...but this actually produces the most similar results to Pillow 9
-        # l,t,r,b = font.getbbox(s); w = r; h=b
-
-        try:
-            l,t,r,b = font.getbbox(s); w = r; h=b
-        except Exception:
-            w,h = font.getsize(s)
-
-        return w,h
-
     # If the total height of the display strings added to the top of the bounding
     # box exceeds the top of the image, stack the strings below the bounding box
     # instead of above.
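These two hunks promote get_text_size from a closure inside draw_bounding_box_on_image to a module-level function, with unchanged behavior: it tries Pillow 10's font.getbbox() (using (r, b) rather than (r-l, b-t), which empirically matches Pillow 9 better), and falls back to font.getsize() where getbbox is unavailable. A sketch of calling it directly:

```python
from PIL import ImageFont
from megadetector.visualization.visualization_utils import get_text_size

font = ImageFont.load_default()

# Returns (w,h) in pixels; on Pillow 10+ this goes through font.getbbox(),
# and through the older font.getsize() API where getbbox doesn't exist
w, h = get_text_size(font, 'animal: 0.92')
```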
@@ -972,7 +984,7 @@ def draw_bounding_boxes_on_file(input_file,
         boxes are length-four arrays formatted as [x,y,w,h], normalized,
         upper-left origin (this is the standard MD detection format)
     detector_label_map (dict, optional): a dict mapping category IDs to strings. If this
-        is None, no confidence values or identifiers are shown If this is {}, just category
+        is None, no confidence values or identifiers are shown. If this is {}, just category
         indices and confidence values are shown.
     thickness (int, optional): line width in pixels for box rendering
     expansion (int, optional): box expansion in pixels
@@ -1043,7 +1055,7 @@ def draw_db_boxes_on_file(input_file,
         classes = [0] * len(boxes)
 
     render_db_bounding_boxes(boxes, classes, image, original_size=None,
-        label_map=label_map, thickness=thickness, expansion=expansion)
+                             label_map=label_map, thickness=thickness, expansion=expansion)
 
     image.save(output_file)
 
@@ -1125,7 +1137,6 @@ def gray_scale_fraction(image,crop_size=(0.1,0.1)):
             if r == g and r == b and g == b:
                 n_gray_pixels += 1
 
-
 # ...def gray_scale_fraction(...)
 
 
@@ -1376,6 +1387,98 @@ def resize_image_folder(input_folder,
 # ...def resize_image_folder(...)
 
 
+def get_image_size(im,verbose=False):
+    """
+    Retrieve the size of an image. Returns None if the image fails to load.
+
+    Args:
+        im (str or PIL.Image): filename or PIL image
+
+    Returns:
+        tuple (w,h), or None if the image fails to load.
+    """
+
+    image_name = '[in memory]'
+
+    try:
+        if isinstance(im,str):
+            image_name = im
+            im = load_image(im)
+        w = im.width
+        h = im.height
+        if w <= 0 or h <= 0:
+            if verbose:
+                print('Error reading width from image {}: {},{}'.format(
+                    image_name,w,h))
+            return None
+        return (w,h)
+    except Exception as e:
+        if verbose:
+            print('Error reading width from image {}: {}'.format(
+                image_name,str(e)))
+        return None
+
+# ...def get_image_size(...)
+
+
+def parallel_get_image_sizes(filenames,
+                             max_workers=16,
+                             use_threads=True,
+                             recursive=True,
+                             verbose=False):
+    """
+    Retrieve image sizes for a list or folder of images
+
+    Args:
+        filenames (list or str): a list of image filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        recursive (bool, optional): if [filenames] is a folder, whether to search recursively for images.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to (w,h) tuples; values will be None for images that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating images in {}'.format(filenames))
+        filenames = find_images(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Getting image sizes for {} images'.format(len(filenames)))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(get_image_size(filename,verbose=verbose))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(get_image_size,verbose=verbose),filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_get_image_sizes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+
 #%% Image integrity checking functions
 
 def check_image_integrity(filename,modes=None):
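A sketch of the new image-size helpers added in this hunk (folder and file paths hypothetical):

```python
from megadetector.visualization.visualization_utils import (
    get_image_size, parallel_get_image_sizes)

# Single image (hypothetical path); returns (w,h), or None on failure
size = get_image_size('/data/images/img_0001.jpg', verbose=True)

# Whole (hypothetical) folder; returns {filename: (w,h) or None}
filename_to_size = parallel_get_image_sizes('/data/images',
                                            max_workers=8,
                                            use_threads=True,
                                            recursive=True)

# Images that failed to load come back as None
failed_images = [fn for fn, wh in filename_to_size.items() if wh is None]
```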
@@ -1494,13 +1597,13 @@ def parallel_check_image_integrity(filenames,
         with either 'success' or 'error').
     """
 
-    n_workers = min(max_workers,len(filenames))
-
     if isinstance(filenames,str) and os.path.isdir(filenames):
         if verbose:
             print('Enumerating images in {}'.format(filenames))
         filenames = find_images(filenames,recursive=recursive,return_relative_paths=False)
 
+    n_workers = min(max_workers,len(filenames))
+
     if verbose:
         print('Checking image integrity for {} filenames'.format(len(filenames)))
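This last hunk is a real fix rather than a cosmetic move: when [filenames] is passed as a folder path, computing n_workers before enumeration measured the length of the path string, not the number of images. A standalone sketch of the difference (folder path hypothetical):

```python
filenames = '/data/cams'   # hypothetical folder path, not yet a list of files
max_workers = 16

# 5.0.19 ordering: len() of the *string* -> min(16, len('/data/cams')) == 10
n_workers_buggy = min(max_workers, len(filenames))

# 5.0.21 ordering: enumerate images first (find_images, as in the hunk above),
# then size the worker pool on the actual number of images found:
# filenames = find_images(filenames, recursive=True, return_relative_paths=False)
# n_workers_fixed = min(max_workers, len(filenames))
```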