megadetector 5.0.20__py3-none-any.whl → 5.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/data_management/cct_json_utils.py +143 -7
- megadetector/data_management/cct_to_md.py +12 -5
- megadetector/data_management/databases/integrity_check_json_db.py +83 -77
- megadetector/data_management/importers/osu-small-animals-to-json.py +4 -4
- megadetector/data_management/importers/raic_csv_to_md_results.py +416 -0
- megadetector/data_management/importers/zamba_results_to_md_results.py +1 -2
- megadetector/data_management/lila/create_lila_test_set.py +25 -11
- megadetector/data_management/lila/download_lila_subset.py +9 -2
- megadetector/data_management/lila/generate_lila_per_image_labels.py +3 -2
- megadetector/data_management/lila/test_lila_metadata_urls.py +5 -1
- megadetector/data_management/read_exif.py +10 -14
- megadetector/data_management/rename_images.py +1 -1
- megadetector/data_management/yolo_output_to_md_output.py +18 -5
- megadetector/detection/process_video.py +14 -3
- megadetector/detection/pytorch_detector.py +15 -3
- megadetector/detection/run_detector.py +4 -3
- megadetector/detection/run_inference_with_yolov5_val.py +121 -13
- megadetector/detection/video_utils.py +40 -17
- megadetector/postprocessing/classification_postprocessing.py +1 -1
- megadetector/postprocessing/combine_api_outputs.py +1 -1
- megadetector/postprocessing/compare_batch_results.py +931 -142
- megadetector/postprocessing/detector_calibration.py +565 -0
- megadetector/postprocessing/md_to_coco.py +85 -19
- megadetector/postprocessing/postprocess_batch_results.py +32 -21
- megadetector/postprocessing/validate_batch_results.py +174 -64
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -12
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +1 -1
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -1
- megadetector/utils/ct_utils.py +64 -2
- megadetector/utils/md_tests.py +15 -13
- megadetector/utils/path_utils.py +153 -37
- megadetector/utils/process_utils.py +9 -3
- megadetector/utils/write_html_image_list.py +21 -6
- megadetector/visualization/visualization_utils.py +329 -102
- megadetector/visualization/visualize_db.py +104 -63
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/LICENSE +0 -0
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/METADATA +143 -142
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/RECORD +40 -39
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/WHEEL +1 -1
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/top_level.txt +0 -0
- megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
megadetector/taxonomy_mapping/map_new_lila_datasets.py
CHANGED
```diff
@@ -15,15 +15,17 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/lila_additions_2024.
+output_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
 
 datasets_to_map = [
-    '
+    'Seattle(ish) Camera Traps'
 ]
 
 
 #%% Initialize taxonomic lookup
 
+# Takes ~2 mins
+
 from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup, get_preferred_taxonomic_match
 
@@ -39,27 +41,27 @@ lila_datasets = set()
 
 for dataset_name in input_lila_categories.keys():
     # The script that generates this dictionary creates a separate entry for bounding box
-    # metadata files, but those don't represent new dataset names
+    # metadata files, but those don't represent new dataset names, so we ignore them here.
     lila_datasets.add(dataset_name.replace('_bbox',''))
-
+
 for s in datasets_to_map:
     assert s in lila_datasets
-
-
+
+
 #%% Find all categories
 
 category_mappings = []
 
 # dataset_name = datasets_to_map[0]
 for dataset_name in datasets_to_map:
-
+
     ds_categories = input_lila_categories[dataset_name]
     for category in ds_categories:
         category_name = category['name']
         assert ':' not in category_name
         mapping_name = dataset_name + ':' + category_name
         category_mappings.append(mapping_name)
-
+
 print('Need to create {} mappings'.format(len(category_mappings)))
@@ -128,22 +130,23 @@ output_df.to_csv(output_file, index=None, header=True)
 
 if False:
 
-    #%%
-
+    #%% You probably want to open the .csv file first
+
     from megadetector.utils.path_utils import open_file
     open_file(output_file)
+
 
     #%%
 
     # q = 'white-throated monkey'
     # q = 'cingulata'
     # q = 'notamacropus'
-    q = '
+    q = 'insects'
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
 
-    if m is None:
+    if (m is None) or (len(m.taxonomy_string) == 0):
        print('No match')
    else:
        if m.source != taxonomy_preference:
```
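The revised guard in the last hunk treats a match with an empty taxonomy string the same as no match at all. A minimal sketch of the pattern, using only names that appear in the diff ('insects' is the query shown above):

```python
from megadetector.taxonomy_mapping.species_lookup import \
    initialize_taxonomy_lookup, get_preferred_taxonomic_match

initialize_taxonomy_lookup()  # takes ~2 mins, per the new comment above

m = get_preferred_taxonomic_match('insects', 'inat')

# A non-None match can still carry an empty taxonomy string, so check both
if (m is None) or (len(m.taxonomy_string) == 0):
    print('No match')
else:
    print(m.scientific_name)
```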
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py
CHANGED
```diff
@@ -89,7 +89,7 @@ if False:
         'genus',
         'species','subspecies','variety']
 
-    levels_to_exclude = ['stateofmatter','zoosection','parvorder','complex']
+    levels_to_exclude = ['stateofmatter','zoosection','parvorder','complex','epifamily']
 
     for s in levels_to_exclude:
         assert s not in levels_to_include
```
megadetector/taxonomy_mapping/preview_lila_taxonomy.py
CHANGED
```diff
@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -399,6 +399,8 @@ images_per_query = 15
 min_valid_images_per_query = 3
 min_valid_image_size = 3000
 
+# TODO: parallelize this loop
+#
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
```
megadetector/utils/ct_utils.py
CHANGED
```diff
@@ -12,6 +12,7 @@ import inspect
 import json
 import math
 import os
+import builtins
 
 import jsonpickle
 import numpy as np
@@ -613,6 +614,50 @@ def is_empty(v):
     return False
 
 
+def min_none(a,b):
+    """
+    Returns the minimum of a and b. If both are None, returns None. If one is None,
+    returns the other.
+
+    Args:
+        a (numeric): the first value to compare
+        b (numeric): the second value to compare
+
+    Returns:
+        numeric: the minimum of a and b, or None
+    """
+    if a is None and b is None:
+        return None
+    elif a is None:
+        return b
+    elif b is None:
+        return a
+    else:
+        return min(a,b)
+
+
+def max_none(a,b):
+    """
+    Returns the maximum of a and b. If both are None, returns None. If one is None,
+    returns the other.
+
+    Args:
+        a (numeric): the first value to compare
+        b (numeric): the second value to compare
+
+    Returns:
+        numeric: the maximum of a and b, or None
+    """
+    if a is None and b is None:
+        return None
+    elif a is None:
+        return b
+    elif b is None:
+        return a
+    else:
+        return max(a,b)
+
+
 def isnan(v):
     """
     Returns True if v is a nan-valued float, otherwise returns False.
@@ -645,7 +690,24 @@ def sets_overlap(set1, set2):
     return not set(set1).isdisjoint(set(set2))
 
 
-
+def is_function_name(s,calling_namespace):
+    """
+    Determines whether [s] is a callable function in the global or local scope, or a
+    built-in function.
+
+    Args:
+        s (str): the string to test for function-ness
+        calling_namespace (dict): typically pass the output of locals()
+    """
+
+    assert isinstance(s,str), 'Input is not a string'
+
+    return callable(globals().get(s)) or \
+        callable(locals().get(s)) or \
+        callable(calling_namespace.get(s)) or \
+        callable(getattr(builtins, s, None))
+
+
 #%% Test drivers
 
 if False:
@@ -678,4 +740,4 @@ if False:
     L = [{'a':5},{'a':0},{'a':10}]
     k = 'a'
     sort_list_of_dicts_by_key(L, k, reverse=True)
-
+
```
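A usage sketch for the three new ct_utils helpers, based only on the definitions above (min_none/max_none treat a single None as "missing", not as an extreme value):

```python
from megadetector.utils.ct_utils import min_none, max_none, is_function_name

assert min_none(None, None) is None   # both missing
assert min_none(3, None) == 3         # one missing: return the other
assert max_none(None, 5) == 5
assert max_none(2, 7) == 7            # both present: plain max()

def _local_helper():
    pass

# 'len' resolves via builtins, '_local_helper' via the caller's namespace
assert is_function_name('len', locals())
assert is_function_name('_local_helper', locals())
assert not is_function_name('no_such_function', locals())
```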
megadetector/utils/md_tests.py
CHANGED
```diff
@@ -29,10 +29,6 @@ import subprocess
 import argparse
 import inspect
 
-#: IoU threshold used to determine whether boxes in two detection files likely correspond
-#: to the same box.
-iou_threshold_for_file_comparison = 0.9
-
 
 #%% Classes
 
@@ -106,6 +102,10 @@ class MDTestOptions:
         #: PYTHONPATH to set for CLI tests; if None, inherits from the parent process. Only
         #: impacts the called functions, not the parent process.
         self.cli_test_pythonpath = None
+
+        #: IoU threshold used to determine whether boxes in two detection files likely correspond
+        #: to the same box.
+        self.iou_threshold_for_file_comparison = 0.85
 
 # ...class MDTestOptions()
 
@@ -410,7 +410,7 @@ def compare_detection_lists(detections_a,detections_b,options,bidirectional_comp
             iou = get_iou(det_a['bbox'],b_det['bbox'])
 
             # Is this likely the same detection as det_a?
-            if iou >= iou_threshold_for_file_comparison and iou > highest_iou:
+            if iou >= options.iou_threshold_for_file_comparison and iou > highest_iou:
                 matching_det_b = b_det
                 highest_iou = iou
 
@@ -529,12 +529,14 @@ def compare_results(inference_output_file,expected_results_file,options):
     if not options.warning_mode:
 
         assert max_conf_error <= options.max_conf_error, \
-            'Confidence error {} is greater than allowable ({}), on file:\n{}'.format(
-                max_conf_error,options.max_conf_error,max_conf_error_file
+            'Confidence error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+                max_conf_error,options.max_conf_error,max_conf_error_file,
+                inference_output_file,expected_results_file)
 
         assert max_coord_error <= options.max_coord_error, \
-            'Coord error {} is greater than allowable ({}), on file:\n{}'.format(
-                max_coord_error,options.max_coord_error,max_coord_error_file
+            'Coord error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+                max_coord_error,options.max_coord_error,max_coord_error_file,
+                inference_output_file,expected_results_file)
 
     print('Max conf error: {} (file {})'.format(
         max_conf_error,max_conf_error_file))
@@ -847,7 +849,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = True
     # video_options.keep_rendered_frames = False
-    # video_options.
+    # video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     # video_options.reuse_results_if_available = False
@@ -887,7 +889,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = False
     video_options.keep_rendered_frames = False
-    video_options.
+    video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     video_options.reuse_results_if_available = False
@@ -1208,7 +1210,7 @@ def run_cli_tests(options):
     cmd += ' --overwrite_handling overwrite'
     cmd_results = execute_and_print(cmd)
 
-    # Run again with checkpointing, make sure the
+    # Run again with checkpointing, make sure the outputs are identical
     cmd += ' --checkpoint_frequency 5'
     inference_output_file_yolo_val_checkpoint = \
         os.path.join(options.scratch_dir,'folder_inference_output_yolo_val_checkpoint.json')
@@ -1353,7 +1355,7 @@ if False:
     # options.cli_working_dir = r'c:\git\MegaDetector'
     # options.yolo_working_dir = r'c:\git\yolov5-md'
     options.cli_working_dir = os.path.expanduser('~')
-    options.yolo_working_dir = '/mnt/c/git/yolov5-md'
+    # options.yolo_working_dir = '/mnt/c/git/yolov5-md'
    options = download_test_data(options)
 
    #%%
```
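With the IoU threshold moved from a module-level constant onto MDTestOptions (and its default lowered from 0.9 to 0.85), it can now be tuned per test run; a minimal sketch:

```python
from megadetector.utils.md_tests import MDTestOptions

options = MDTestOptions()
# Default is now 0.85; restore the old module-level value if stricter box matching is wanted
options.iou_threshold_for_file_comparison = 0.9
```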
megadetector/utils/path_utils.py
CHANGED
```diff
@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -31,6 +32,8 @@ from functools import partial
 from shutil import which
 from tqdm import tqdm
 
+from megadetector.utils.ct_utils import is_iterable
+
 # Should all be lower-case
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
 
@@ -236,6 +239,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +323,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-    #%%
-
-def safe_create_link(link_exists,link_new):
-    """
-    Creates a symlink at [link_new] pointing to [link_exists].
-
-    If [link_new] already exists, make sure it's a link (not a file),
-    and if it has a different target than [link_exists], removes and re-creates
-    it.
-
-    Errors if [link_new] already exists but it's not a link.
-
-    Args:
-        link_exists (str): the source of the (possibly-new) symlink
-        link_new (str): the target of the (possibly-new) symlink
-    """
-
-    if os.path.exists(link_new) or os.path.islink(link_new):
-        assert os.path.islink(link_new)
-        if not os.readlink(link_new) == link_exists:
-            os.remove(link_new)
-            os.symlink(link_exists,link_new)
-    else:
-        os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
```
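safe_create_link is unchanged in behavior; it has just moved earlier in the file (next to path_is_abs), out of the area following the interactive test block. A sketch with hypothetical paths:

```python
import os
from megadetector.utils.path_utils import safe_create_link

# Hypothetical paths, for illustration only
target = os.path.expanduser('~/data/current-release')
link = os.path.expanduser('~/data/latest')

safe_create_link(target, link)  # creates the link
safe_create_link(target, link)  # no-op: the link already points at [target]

# If [link] pointed somewhere else, it would be removed and re-created;
# if [link] were a regular file, the assert inside safe_create_link would fire.
```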
```diff
@@ -598,7 +600,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +653,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
```
```diff
@@ -667,7 +675,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
```
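For context, parallel_copy_files takes a dict mapping source paths to destination paths; a hedged sketch with hypothetical filenames:

```python
from megadetector.utils.path_utils import parallel_copy_files

# Hypothetical source -> destination mapping
input_file_to_output_file = {
    '/data/in/a.jpg': '/data/out/a.jpg',
    '/data/in/b.jpg': '/data/out/b.jpg'
}

parallel_copy_files(input_file_to_output_file,
                    max_workers=8,
                    use_threads=True,
                    overwrite=False,
                    verbose=True)
```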
```diff
@@ -750,7 +758,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -764,16 +772,21 @@ def parallel_get_file_sizes(filenames,
 
     folder_name = None
 
-    if
-
-
-    if isinstance(filenames,str) and os.path.isdir(filenames):
-
+    if isinstance(filenames,str):
+
         folder_name = filenames
+        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
 
+        if verbose:
+            print('Enumerating files in {}'.format(folder_name))
+
         # Enumerate absolute paths here, we'll convert to relative later if requested
-        filenames = recursive_file_list(
+        filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
 
+    else:
+
+        assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
+
     if verbose:
         print('Creating worker pool')
@@ -804,6 +817,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
```
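After this rework, parallel_get_file_sizes accepts either a folder (enumerated via recursive_file_list) or any iterable of paths, and fails fast otherwise; a sketch with hypothetical paths:

```python
from megadetector.utils.path_utils import parallel_get_file_sizes

# A folder: enumerated (recursively here) before sizes are read
sizes = parallel_get_file_sizes('/data/images', recursive=True, verbose=True)

# ...or any iterable of filenames
sizes = parallel_get_file_sizes(['/data/images/a.jpg', '/data/images/b.jpg'])

# A string that isn't a folder now trips the new isdir assert,
# and a non-iterable argument trips the is_iterable assert.
```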
```diff
@@ -932,7 +947,7 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
         output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
+        compresslevel (int, optional): compression level to use, between 0 and 9
 
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
```
```diff
@@ -1075,3 +1090,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192):  # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
```
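The new hashing helpers read files in 8 KB chunks, so memory stays flat regardless of file size (the `:=` walrus operator requires Python 3.8+). A usage sketch with a hypothetical folder:

```python
from megadetector.utils.path_utils import compute_file_hash, parallel_compute_file_hashes

# One file; returns None on failure because allow_failures defaults to True
h = compute_file_hash('/data/images/a.jpg', algorithm='md5')

# A whole folder, hashed on a thread pool; returns {filename: hash or None}
filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                max_workers=8,
                                                use_threads=True,
                                                algorithm='sha256',
                                                verbose=True)

# e.g., flag possible exact-duplicate files by grouping on hash
n_unique = len(set(v for v in filename_to_hash.values() if v is not None))
```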
megadetector/utils/process_utils.py
CHANGED
```diff
@@ -59,8 +59,13 @@ def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     return return_code
 
 
-def execute_and_print(cmd,
-
+def execute_and_print(cmd,
+                      print_output=True,
+                      encoding=None,
+                      errors=None,
+                      env=None,
+                      verbose=False,
+                      catch_exceptions=True,
                       echo_command=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
@@ -73,7 +78,8 @@ def execute_and_print(cmd,print_output=True,encoding=None,errors=None,
 
     Args:
         cmd (str): command to run
-        print_output (bool, optional): whether to print output from [cmd]
+        print_output (bool, optional): whether to print output from [cmd] (stdout is
+            captured regardless of the value of print_output)
         encoding (str, optional): stdout encoding, see Popen() documentation
         errors (str, optional): error handling, see Popen() documentation
         env (dict, optional): environment variables, see Popen() documentation
```
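The expanded signature makes the keyword arguments explicit and adds catch_exceptions; per the clarified docstring, stdout is captured even when printing is suppressed. A sketch (the return value's exact structure is not shown in this diff):

```python
from megadetector.utils.process_utils import execute_and_print

# stdout is captured regardless of print_output, per the updated docstring
cmd_results = execute_and_print('echo hello',
                                print_output=False,
                                catch_exceptions=True,
                                echo_command=True)
```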
megadetector/utils/write_html_image_list.py
CHANGED
```diff
@@ -42,7 +42,9 @@ def write_html_image_list(filename=None,images=None,options=None):
     options (dict, optional): a dict with one or more of the following fields:
 
         - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+        - pageTitle (HTML page title)
         - headerHtml (html text to include before the image list)
+        - subPageHeaderHtml (html text to include before the images when images are broken into pages)
         - trailerHtml (html text to include after the image list)
         - defaultImageStyle (default css style for images)
         - defaultTextStyle (default css style for image titles)
@@ -60,11 +62,17 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
+    if 'subPageHeaderHtml' not in options or options['subPageHeaderHtml'] is None:
+        options['subPageHeaderHtml'] = ''
+
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +122,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +132,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -145,7 +156,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         localImages = images[iStart:iEnd+1]
 
         localOptions = options.copy();
-        localOptions['headerHtml'] = '';
+        localOptions['headerHtml'] = options['subPageHeaderHtml'];
         localOptions['trailerHtml'] = '';
 
         # Make a recursive call for this image set
@@ -170,7 +181,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
```
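The new pageTitle and subPageHeaderHtml options slot into the existing options dict: pageTitle feeds the `<title>` tags written above, and subPageHeaderHtml replaces headerHtml on the per-page files when the list is split via maxFiguresPerHtmlFile. A sketch with hypothetical content:

```python
from megadetector.utils.write_html_image_list import write_html_image_list

options = {
    # New in this release
    'pageTitle': 'Detection preview',
    'subPageHeaderHtml': '<p>Continued from the index page</p>',
    # Pre-existing options
    'headerHtml': '<h1>Detection preview</h1>',
    'maxFiguresPerHtmlFile': 100
}

write_html_image_list(filename='preview/index.html',
                      images=['images/im0001.jpg', 'images/im0002.jpg'],
                      options=options)
```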