megadetector 5.0.20__py3-none-any.whl → 5.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of megadetector might be problematic.
- megadetector/data_management/importers/osu-small-animals-to-json.py +4 -4
- megadetector/data_management/yolo_output_to_md_output.py +18 -5
- megadetector/detection/video_utils.py +19 -7
- megadetector/postprocessing/combine_api_outputs.py +1 -1
- megadetector/postprocessing/detector_calibration.py +367 -0
- megadetector/postprocessing/md_to_coco.py +2 -1
- megadetector/postprocessing/postprocess_batch_results.py +32 -20
- megadetector/postprocessing/validate_batch_results.py +118 -58
- megadetector/utils/md_tests.py +14 -12
- megadetector/utils/path_utils.py +139 -30
- megadetector/utils/write_html_image_list.py +16 -5
- megadetector/visualization/visualization_utils.py +126 -23
- megadetector/visualization/visualize_db.py +104 -63
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/METADATA +1 -1
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/RECORD +18 -18
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/LICENSE +0 -0
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/validate_batch_results.py CHANGED
@@ -42,11 +42,13 @@ class ValidateBatchResultsOptions:
         #:
         #: If None, assumes absolute paths.
         self.relative_path_base = None
+
+        #: Should we return the loaded data, or just the validation results?
+        self.return_data = False
 
 # ...class ValidateBatchResultsOptions
 
 
-
 #%% Main function
 
 def validate_batch_results(json_filename,options=None):
@@ -55,11 +57,17 @@ def validate_batch_results(json_filename,options=None):
 
     Args:
         json_filename (str): the filename to validate
-        options (ValidateBatchResultsOptions,
+        options (ValidateBatchResultsOptions, optional): all the parameters used to control this
             process, see ValidateBatchResultsOptions for details
 
     Returns:
-
+        dict: a dict with a field called "validation_results", which is itself a dict. The reason
+        it's a dict inside a dict is that if return_data is True, the outer dict also contains all
+        the loaded data. The "validation_results" dict contains fields called "errors", "warnings",
+        and "filename". "errors" and "warnings" are lists of strings, although "errors" will never
+        be longer than N=1, since validation fails at the first error.
+
+
     """
 
     if options is None:
@@ -68,75 +76,127 @@ def validate_batch_results(json_filename,options=None):
     with open(json_filename,'r') as f:
         d = json.load(f)
 
-
+    validation_results = {}
+    validation_results['filename'] = json_filename
+    validation_results['warnings'] = []
+    validation_results['errors'] = []
 
-
-
+    if not isinstance(d,dict):
+
+        validation_results['errors'].append('Input data is not a dict')
+        to_return = {}
+        to_return['validation_results'] = validation_results
+        return to_return
 
-
-
-
-
+    try:
+
+        ## Info validation
+
+        if not 'info' in d:
+            raise ValueError('Input does not contain info field')
 
-
-
-
-
-
-
-
-
-
-
+        info = d['info']
+
+        if not isinstance(info,dict):
+            raise ValueError('Input contains invalid info field')
+
+        if 'format_version' not in info :
+            raise ValueError('Input does not specify format version')
+
+        format_version = float(info['format_version'])
+        if format_version < 1.3:
+            raise ValueError('This validator can only be used with format version 1.3 or later')
 
-
-
-
-
+
+        ## Category validation
+
+        if 'detection_categories' not in d:
+            raise ValueError('Input does not contain detection_categories field')
+
+        for k in d['detection_categories'].keys():
+            # Category ID should be string-formatted ints
+            if not isinstance(k,str):
+                raise ValueError('Invalid detection category ID: {}'.format(k))
             _ = int(k)
-
-
-
-
-
-
-
-
-
-
+            if not isinstance(d['detection_categories'][k],str):
+                raise ValueError('Invalid detection category name: {}'.format(
+                    d['detection_categories'][k]))
+
+        if 'classification_categories' in d:
+            for k in d['classification_categories'].keys():
+                # Categories should be string-formatted ints
+                if not isinstance(k,str):
+                    raise ValueError('Invalid classification category ID: {}'.format(k))
+                _ = int(k)
+                if not isinstance(d['classification_categories'][k],str):
+                    raise ValueError('Invalid classification category name: {}'.format(
+                        d['classification_categories'][k]))
 
-        assert isinstance(im,dict)
-        assert 'file' in im
 
-
+        ## Image validation
 
-        if
-
-
-
-
-
+        if 'images' not in d:
+            raise ValueError('images field not present')
+        if not isinstance(d['images'],list):
+            raise ValueError('Invalid images field')
+
+        # im = d['images'][0]
+        for i_im,im in enumerate(d['images']):
 
-
-
-
-
+            if not isinstance(im,dict):
+                raise ValueError('Invalid image at index {}'.format(i_im))
+            if 'file' not in im:
+                raise ValueError('Image without filename at index {}'.format(i_im))
 
-
-            assert 'frame_rate' in im
-            if 'detections' in im and im['detections'] is not None:
-                for det in im['detections']:
-                    assert 'frame_number' in det
+            file = im['file']
 
-
+            if options.check_image_existence:
+                if options.relative_path_base is None:
+                    file_abs = file
+                else:
+                    file_abs = os.path.join(options.relative_path_base,file)
+                if not os.path.isfile(file_abs):
+                    raise ValueError('Cannot find file {}'.format(file_abs))
+
+            if ('detections' not in im) or (im['detections'] is None):
+                if not ('failure' in im and isinstance(im['failure'],str)):
+                    raise ValueError('Image {} has no detections and no failure'.format(im['file']))
+            else:
+                if not isinstance(im['detections'],list):
+                    raise ValueError('Invalid detections list for image {}'.format(im['file']))
+
+            if is_video_file(im['file']) and (format_version >= 1.4):
+                if 'frame_rate' not in im:
+                    raise ValueError('Video without frame rate: {}'.format(im['file']))
+                if 'detections' in im and im['detections'] is not None:
+                    for det in im['detections']:
+                        if 'frame_number' not in det:
+                            raise ValueError('Frame without frame number in video {}'.format(
+                                im['file']))
+
+        # ...for each image
+
+
+        ## Checking on other keys
+
+        for k in d.keys():
+            if (k not in typical_keys) and (k not in required_keys):
+                validation_results['warnings'].append(
+                    'Warning: non-standard key {} present at file level'.format(k))
 
+    except Exception as e:
+
+        validation_results['errors'].append(str(e))
+
+    if options.return_data:
+        to_return = d
+    else:
+        to_return = {}
 
-
+    to_return['validation_results'] = validation_results
 
-
-
-            print('Warning: non-standard key {} present at file level'.format(k))
-
+    return to_return
+
 # ...def validate_batch_results(...)
 
 
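For context, a minimal sketch of how the reworked validator might be called; the option and field names come from the diff above, while the filenames and surrounding script are illustrative only:

    from megadetector.postprocessing.validate_batch_results import (
        ValidateBatchResultsOptions, validate_batch_results)

    options = ValidateBatchResultsOptions()
    options.check_image_existence = True
    options.relative_path_base = '/datasets/camera-traps'  # hypothetical image root
    options.return_data = False

    # 'md-results.json' is a hypothetical MD-format results file
    results = validate_batch_results('md-results.json', options)

    # With the new behavior, validation failures no longer raise or assert; the
    # first error (if any) and all warnings land in the returned dict.
    for s in results['validation_results']['errors']:
        print('Error: {}'.format(s))
    for s in results['validation_results']['warnings']:
        print('Warning: {}'.format(s))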
megadetector/utils/md_tests.py CHANGED
@@ -29,10 +29,6 @@ import subprocess
 import argparse
 import inspect
 
-#: IoU threshold used to determine whether boxes in two detection files likely correspond
-#: to the same box.
-iou_threshold_for_file_comparison = 0.9
-
 
 #%% Classes
 
@@ -106,6 +102,10 @@ class MDTestOptions:
         #: PYTHONPATH to set for CLI tests; if None, inherits from the parent process. Only
         #: impacts the called functions, not the parent process.
         self.cli_test_pythonpath = None
+
+        #: IoU threshold used to determine whether boxes in two detection files likely correspond
+        #: to the same box.
+        self.iou_threshold_for_file_comparison = 0.85
 
 # ...class MDTestOptions()
 
@@ -410,7 +410,7 @@ def compare_detection_lists(detections_a,detections_b,options,bidirectional_comp
             iou = get_iou(det_a['bbox'],b_det['bbox'])
 
             # Is this likely the same detection as det_a?
-            if iou >= iou_threshold_for_file_comparison and iou > highest_iou:
+            if iou >= options.iou_threshold_for_file_comparison and iou > highest_iou:
                 matching_det_b = b_det
                 highest_iou = iou
 
@@ -529,12 +529,14 @@ def compare_results(inference_output_file,expected_results_file,options):
     if not options.warning_mode:
 
         assert max_conf_error <= options.max_conf_error, \
-            'Confidence error {} is greater than allowable ({}), on file:\n{}'.format(
-            max_conf_error,options.max_conf_error,max_conf_error_file
+            'Confidence error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+            max_conf_error,options.max_conf_error,max_conf_error_file,
+            inference_output_file,expected_results_file)
 
         assert max_coord_error <= options.max_coord_error, \
-            'Coord error {} is greater than allowable ({}), on file:\n{}'.format(
-            max_coord_error,options.max_coord_error,max_coord_error_file
+            'Coord error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+            max_coord_error,options.max_coord_error,max_coord_error_file,
+            inference_output_file,expected_results_file)
 
     print('Max conf error: {} (file {})'.format(
         max_conf_error,max_conf_error_file))
@@ -847,7 +849,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = True
     # video_options.keep_rendered_frames = False
-    # video_options.
+    # video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     # video_options.reuse_results_if_available = False
@@ -887,7 +889,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = False
     video_options.keep_rendered_frames = False
-    video_options.
+    video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     video_options.reuse_results_if_available = False
@@ -1353,7 +1355,7 @@ if False:
     # options.cli_working_dir = r'c:\git\MegaDetector'
     # options.yolo_working_dir = r'c:\git\yolov5-md'
     options.cli_working_dir = os.path.expanduser('~')
-    options.yolo_working_dir = '/mnt/c/git/yolov5-md'
+    # options.yolo_working_dir = '/mnt/c/git/yolov5-md'
     options = download_test_data(options)
 
     #%%
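The IoU threshold used when matching boxes across two results files is now an instance option on MDTestOptions rather than a module-level constant (with the default loosened from 0.9 to 0.85), so it can be tuned per run. A sketch, assuming the usual no-argument options constructor:

    from megadetector.utils.md_tests import MDTestOptions

    options = MDTestOptions()
    # Restore the old, stricter matching behavior for this run only
    options.iou_threshold_for_file_comparison = 0.9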
megadetector/utils/path_utils.py CHANGED
@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -236,6 +237,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +321,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-#%%
-
-def safe_create_link(link_exists,link_new):
-    """
-    Creates a symlink at [link_new] pointing to [link_exists].
-
-    If [link_new] already exists, make sure it's a link (not a file),
-    and if it has a different target than [link_exists], removes and re-creates
-    it.
-
-    Errors if [link_new] already exists but it's not a link.
-
-    Args:
-        link_exists (str): the source of the (possibly-new) symlink
-        link_new (str): the target of the (possibly-new) symlink
-    """
-
-    if os.path.exists(link_new) or os.path.islink(link_new):
-        assert os.path.islink(link_new)
-        if not os.readlink(link_new) == link_exists:
-            os.remove(link_new)
-            os.symlink(link_exists,link_new)
-    else:
-        os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
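safe_create_link itself is unchanged; it just moved out of the interactive test scaffolding (the `if False:` block) into the module body, so it is now importable. A minimal sketch, with hypothetical paths:

    from megadetector.utils.path_utils import safe_create_link

    # Idempotent: re-running with the same target is a no-op, a stale link
    # pointing elsewhere is removed and re-created, and a regular file at
    # the link location trips the assertion.
    safe_create_link('/data/images/site-a', '/data/current-site')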
@@ -598,7 +598,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +651,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
@@ -667,7 +673,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
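For reference, a sketch of parallel_copy_files showing the now-documented verbose flag interacting with the new per-file messages in _copy_file; the file mapping is hypothetical:

    from megadetector.utils.path_utils import parallel_copy_files

    # Maps source paths to destination paths (hypothetical filenames)
    input_file_to_output_file = {
        '/src/img_0001.jpg': '/dst/img_0001.jpg',
        '/src/img_0002.jpg': '/dst/img_0002.jpg'
    }
    parallel_copy_files(input_file_to_output_file,
                        max_workers=8,
                        use_threads=True,
                        overwrite=False,  # existing targets are skipped (and reported if verbose)
                        verbose=True)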
@@ -750,7 +756,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -804,6 +810,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
@@ -1075,3 +1083,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
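The new hashing helpers can be exercised as below; the paths are hypothetical, while both signatures are taken directly from the added code:

    from megadetector.utils.path_utils import (
        compute_file_hash, parallel_compute_file_hashes)

    # Single file; returns None on I/O failure because allow_failures defaults to True
    h = compute_file_hash('/data/images/img_0001.jpg', algorithm='md5')

    # Hash a whole folder recursively on 16 worker threads; the result maps
    # each filename to its hash (None for files that failed to load)
    filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                    max_workers=16,
                                                    use_threads=True,
                                                    recursive=True,
                                                    algorithm='sha256',
                                                    verbose=True)
    failed_files = [fn for fn in filename_to_hash if filename_to_hash[fn] is None]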
megadetector/utils/write_html_image_list.py CHANGED
@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         options (dict, optional): a dict with one or more of the following fields:
 
             - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+            - pageTitle (HTML page title)
             - headerHtml (html text to include before the image list)
             - trailerHtml (html text to include after the image list)
             - defaultImageStyle (default css style for images)
@@ -60,11 +61,14 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +118,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +128,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -170,7 +177,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
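Finally, a sketch of the new pageTitle option, which becomes the HTML title of the generated page (including each sub-page when the list is split across multiple HTML files). The filenames are hypothetical, and the entries of [images] are assumed here to be plain image filenames:

    from megadetector.utils.write_html_image_list import write_html_image_list

    options = {'pageTitle': 'Detection review',
               'headerHtml': '<h1>Detection review</h1>'}
    write_html_image_list(filename='index.html',
                          images=['img_0001.jpg', 'img_0002.jpg'],
                          options=options)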