megadetector 10.0.6__py3-none-any.whl → 10.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/data_management/cct_json_utils.py +16 -6
- megadetector/data_management/databases/subset_json_db.py +57 -2
- megadetector/detection/pytorch_detector.py +29 -15
- megadetector/detection/run_inference_with_yolov5_val.py +3 -1
- megadetector/detection/run_tiled_inference.py +5 -2
- megadetector/detection/video_utils.py +23 -7
- megadetector/postprocessing/classification_postprocessing.py +218 -69
- megadetector/postprocessing/convert_output_format.py +81 -87
- megadetector/postprocessing/subset_json_detector_output.py +3 -0
- megadetector/utils/directory_listing.py +19 -13
- megadetector/utils/path_utils.py +58 -8
- megadetector/utils/url_utils.py +91 -1
- megadetector/utils/wi_taxonomy_utils.py +44 -26
- megadetector/visualization/visualize_video_output.py +16 -6
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/METADATA +134 -134
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/RECORD +19 -19
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/licenses/LICENSE +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/top_level.txt +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/WHEEL +0 -0
megadetector/postprocessing/convert_output_format.py
CHANGED

@@ -2,12 +2,8 @@
 
 convert_output_format.py
 
-Converts between file [...]
-[...]
-conversion - including between hypothetical alternative .json versions - that we support
-in the future.
-
-The .csv format is largely obsolete, don't use it unless you're super-duper sure you need it.
+Converts between file .json and .csv representations of MD output. The .csv format is
+largely obsolete, don't use it unless you're super-duper sure you need it.
 
 """
 
@@ -15,13 +11,16 @@ The .csv format is largely obsolete, don't use it unless you're super-duper sure
 
 import argparse
 import json
-import csv
 import sys
 import os
 
 from tqdm import tqdm
+from collections import defaultdict
+
+import pandas as pd
 
 from megadetector.postprocessing.load_api_results import load_api_results_csv
+from megadetector.utils.wi_taxonomy_utils import load_md_or_speciesnet_file
 from megadetector.data_management.annotations import annotation_constants
 from megadetector.utils import ct_utils
 
@@ -35,16 +34,13 @@ def convert_json_to_csv(input_path,
                         min_confidence=None,
                         omit_bounding_boxes=False,
                         output_encoding=None,
-                        overwrite=True):
+                        overwrite=True,
+                        verbose=False):
     """
     Converts a MD results .json file to a totally non-standard .csv format.
 
     If [output_path] is None, will convert x.json to x.csv.
 
-    TODO: this function should obviously be using Pandas or some other sensible structured
-    representation of tabular data. Even a list of dicts. This implementation is quite
-    brittle and depends on adding fields to every row in exactly the right order.
-
     Args:
         input_path (str): the input .json file to convert
         output_path (str, optional): the output .csv file to generate; if this is None, uses
@@ -57,7 +53,7 @@ def convert_json_to_csv(input_path,
         output_encoding (str, optional): encoding to use for the .csv file
         overwrite (bool, optional): whether to overwrite an existing .csv file; if this is False and
            the output file exists, no-ops and returns
-
+        verbose (bool, optional): enable additional debug output
     """
 
     if output_path is None:
@@ -68,36 +64,28 @@ def convert_json_to_csv(input_path,
         return
 
     print('Loading json results from {}...'.format(input_path))
-    json_output = [...]
-
-    rows = []
+    json_output = load_md_or_speciesnet_file(input_path,
+                                             verbose=verbose)
 
-
+    def clean_category_name(s):
+        return s.replace(',','_').replace(' ','_').lower()
 
-    # [...]
-[...]
-[...]
-[...]
-[...]
-[...]
-    for cat_id in range(1,n_non_empty_detection_categories+1):
-        cat_name = annotation_constants.detector_bbox_category_id_to_name[cat_id]
-        detection_category_column_names.append('max_conf_' + cat_name)
+    # Create column names for max detection confidences
+    detection_category_id_to_max_conf_column_name = {}
+    for category_id in json_output['detection_categories'].keys():
+        category_name = clean_category_name(json_output['detection_categories'][category_id])
+        detection_category_id_to_max_conf_column_name[category_id] = \
+            'max_conf_' + category_name
 
-
+    classification_category_id_to_max_conf_column_name = {}
 
+    # Create column names for max classification confidences (if necessary)
     if 'classification_categories' in json_output.keys():
-[...]
-[...]
-[...]
-[...]
-[...]
-        category_name = classification_category_id_to_name[category_id].\
-            replace(' ','_').replace(',','')
-        classification_category_column_names.append('max_classification_conf_' + category_name)
-        classification_category_id_to_column_number[category_id] = i_category
-
-    n_classification_categories = len(classification_category_ids)
+
+        for category_id in json_output['classification_categories'].keys():
+            category_name = clean_category_name(json_output['classification_categories'][category_id])
+            classification_category_id_to_max_conf_column_name[category_id] = \
+                'max_classification_conf_' + category_name
 
     # There are several .json fields for which we add .csv columns; other random bespoke fields
     # will be ignored.
@@ -117,26 +105,43 @@ def convert_json_to_csv(input_path,
     if len(optional_fields_present) > 0:
         print('Found {} optional fields'.format(len(optional_fields_present)))
 
-    expected_row_length = len(fixed_columns) + len(detection_category_column_names) + \
-        n_classification_categories + len(optional_fields_present)
-
     print('Formatting results...')
 
+    output_records = []
+
     # i_image = 0; im = json_output['images'][i_image]
     for im in tqdm(json_output['images']):
 
-[...]
+        output_record = {}
+        output_records.append(output_record)
+
+        output_record['image_path'] = im['file']
+        output_record['max_confidence'] = ''
+        output_record['detections'] = ''
+
+        for field_name in optional_fields_present:
+            output_record[field_name] = ''
+            if field_name in im:
+                output_record[field_name] = im[field_name]
+
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = 0
+
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = 0
 
         if 'failure' in im and im['failure'] is not None:
-[...]
-[...]
+            output_record['max_confidence'] = 'failure'
+            output_record['detections'] = im['failure']
            # print('Skipping failed image {} ({})'.format(im['file'],im['failure']))
            continue
 
         max_conf = ct_utils.get_max_conf(im)
+        detection_category_id_to_max_conf = defaultdict(float)
+        classification_category_id_to_max_conf = defaultdict(float)
         detections = []
-        max_detection_category_probabilities = [None] * n_non_empty_detection_categories
-        max_classification_category_probabilities = [0] * n_classification_categories
 
         # d = im['detections'][0]
         for d in im['detections']:
@@ -155,31 +160,24 @@ def convert_json_to_csv(input_path,
             xmax = input_bbox[0] + input_bbox[2]
             ymax = input_bbox[1] + input_bbox[3]
             output_detection = [ymin, xmin, ymax, xmax]
-
             output_detection.append(d['conf'])
-
-            # Category 0 is empty, for which we don't have a column, so the max
-            # confidence for category N goes in column N-1
-            detection_category_id = int(d['category'])
-            assert detection_category_id > 0 and detection_category_id <= \
-                n_non_empty_detection_categories
-            detection_category_column = detection_category_id - 1
-            detection_category_max = max_detection_category_probabilities[detection_category_column]
-            if detection_category_max is None or d['conf'] > detection_category_max:
-                max_detection_category_probabilities[detection_category_column] = d['conf']
-
-            output_detection.append(detection_category_id)
+            output_detection.append(int(d['category']))
             detections.append(output_detection)
 
+            detection_category_id = d['category']
+            detection_category_max = detection_category_id_to_max_conf[detection_category_id]
+            if d['conf'] > detection_category_max:
+                detection_category_id_to_max_conf[detection_category_id] = d['conf']
+
             if 'classifications' in d:
-[...]
-                    'Oops, I have classification results, but no classification metadata'
+
                 for c in d['classifications']:
-[...]
-[...]
-[...]
-[...]
-[...]
+                    classification_category_id = c[0]
+                    classification_conf = c[1]
+                    classification_category_max = \
+                        classification_category_id_to_max_conf[classification_category_id]
+                    if classification_conf > classification_category_max:
+                        classification_category_id_to_max_conf[classification_category_id] = d['conf']
 
                # ...for each classification
 
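Two things are worth noting in the hunk above. First, the rewrite replaces index-based lists (which assumed contiguous integer category IDs starting at 1, hence the old assert) with running maxima kept in defaultdict(float) containers keyed by category ID. A minimal sketch of that pattern, with made-up category IDs and confidences:

from collections import defaultdict

# Unseen category IDs implicitly start at 0.0, so the running max
# needs no first-iteration special case.
max_conf_by_category = defaultdict(float)

for category_id, conf in [('1', 0.85), ('2', 0.40), ('1', 0.97)]:
    if conf > max_conf_by_category[category_id]:
        max_conf_by_category[category_id] = conf

assert max_conf_by_category['1'] == 0.97

Second, the new classification loop updates the per-category maximum with d['conf'] (the detection confidence) rather than classification_conf, which looks like an oversight that shipped in this release.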
@@ -191,40 +189,36 @@ def convert_json_to_csv(input_path,
         if not omit_bounding_boxes:
             detection_string = json.dumps(detections)
 
-[...]
-[...]
-        row.extend(max_classification_category_probabilities)
+        output_record['detections'] = detection_string
+        output_record['max_confidence'] = max_conf
 
-        for [...]
-[...]
-[...]
-[...]
-            row.append(str(im[field_name]))
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = \
+                detection_category_id_to_max_conf[detection_category_id]
 
-[...]
-[...]
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = \
+                classification_category_id_to_max_conf[classification_category_id]
 
     # ...for each image
 
     print('Writing to csv...')
 
-[...]
-[...]
-[...]
-[...]
-[...]
-    header.extend(classification_category_column_names)
-    for field_name in optional_fields_present:
-        header.append(field_name)
-    writer.writerow(header)
-    writer.writerows(rows)
+    df = pd.DataFrame(output_records)
+
+    if omit_bounding_boxes:
+        df = df.drop('detections',axis=1)
+    df.to_csv(output_path,index=False,header=True)
 
 # ...def convert_json_to_csv(...)
 
 
 def convert_csv_to_json(input_path,output_path=None,overwrite=True):
     """
-    Convert .csv to .json. If output_path is None, will convert x.csv to x.json.
+    Convert .csv to .json. If output_path is None, will convert x.csv to x.json. This
+    supports a largely obsolete .csv format, there's almost no reason you want to do this.
 
     Args:
         input_path (str): .csv filename to convert to .json
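Net effect of the changes to convert_json_to_csv: results are loaded via load_md_or_speciesnet_file, accumulated as one dict per image, and serialized by pandas, replacing the old order-sensitive csv.writer rows. A minimal usage sketch ('md_results.json' is a hypothetical input path; the parameters are those in the new signature):

from megadetector.postprocessing.convert_output_format import convert_json_to_csv

# With output_path=None, this writes md_results.csv next to the input;
# verbose is the parameter added in this release.
convert_json_to_csv('md_results.json',
                    output_path=None,
                    omit_bounding_boxes=True,
                    verbose=True)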
megadetector/postprocessing/subset_json_detector_output.py
CHANGED

@@ -83,6 +83,9 @@ class SubsetJsonDetectorOutputOptions:
     def __init__(self):
 
         #: Only process files containing the token 'query'
+        #:
+        #: Does not support general regexes, but supports ^ as a special case
+        #: regex-like notation for "starts with"
         self.query = None
 
         #: Replace 'query' with 'replacement' if 'replacement' is not None. If 'query' is None,
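The new comment pins down the query semantics: plain substring matching, with ^ as the only regex-like token. Illustratively, the documented rule amounts to the following (a hypothetical helper, not code from the module):

def matches_query(filename, query):
    # '^' anchors the query to the start of the filename; any other
    # query is a plain substring test.
    if query.startswith('^'):
        return filename.startswith(query[1:])
    return query in filename

assert matches_query('camera1/img0001.jpg', '^camera1/')
assert not matches_query('backup/camera1/img0001.jpg', '^camera1/')
assert matches_query('backup/camera1/img0001.jpg', 'camera1/')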
megadetector/utils/directory_listing.py
CHANGED

@@ -21,7 +21,7 @@ from megadetector.utils.path_utils import is_image_file
 
 #%% Directory enumeration functions
 
-def create_plain_index(root, dirs, files, dirname=None):
+def _create_plain_index(root, dirs, files, dirname=None):
     """
     Creates the fairly plain HTML folder index including a preview of a single image file,
     if any is present.
@@ -40,6 +40,7 @@ def create_plain_index(root, dirs, files, dirname=None):
 
     if dirname is None:
         dirname = root or '/'
+    dirname = dirname.replace('\\','/')
 
     html = "<!DOCTYPE html>\n"
     html += "<html lang='en'><head>"
@@ -104,13 +105,14 @@ def create_plain_index(root, dirs, files, dirname=None):
     html += "</body></html>\n"
     return html
 
-# ...def create_plain_index(...)
+# ...def _create_plain_index(...)
 
 
-def traverse_and_create_index(dir,
-[...]
-[...]
-[...]
+def create_html_index(dir,
+                      overwrite=False,
+                      template_fun=_create_plain_index,
+                      basepath=None,
+                      recursive=True):
     """
     Recursively traverses the local directory [dir] and generates a index
     file for each folder using [template_fun] to generate the HTML output.
@@ -118,12 +120,13 @@ def traverse_and_create_index(dir,
 
     Args:
         dir (str): directory to process
-[...]
+        overwrite (bool, optional): whether to over-write existing index file
         template_fun (func, optional): function taking three arguments (string,
             list of string, list of string) representing the current root, the list of folders,
             and the list of files. Should return the HTML source of the index file.
         basepath (str, optional): if not None, the name used for each subfolder in [dir]
             in the output files will be relative to [basepath]
+        recursive (bool, optional): recurse into subfolders
     """
 
     print('Traversing {}'.format(dir))
@@ -141,7 +144,7 @@ def traverse_and_create_index(dir,
         # Output is written to file *root*/index.html
         output_file = os.path.join(root, "index.html")
 
-        if not overwrite and os.path.isfile(output_file):
+        if (not overwrite) and os.path.isfile(output_file):
             print('Skipping {}, file exists'.format(output_file))
             continue
 
@@ -157,7 +160,10 @@ def traverse_and_create_index(dir,
         with open(output_file, 'wt') as fi:
             fi.write(html)
 
-[...]
+        if not recursive:
+            break
+
+# ...def create_html_index(...)
 
 
 #%% Command-line driver
@@ -171,7 +177,7 @@ def main(): # noqa
     parser.add_argument("--basepath", type=str,
                         help='Folder names will be printed relative to basepath, if specified',
                         default=None)
-    parser.add_argument("--[...]
+    parser.add_argument("--overwrite", action='store_true', default=False,
                         help='If set, the script will overwrite existing index.html files.')
 
     if len(sys.argv[1:]) == 0:
@@ -182,9 +188,9 @@ def main(): # noqa
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
 
-[...]
-[...]
-[...]
+    create_html_index(args.directory,
+                      overwrite=args.overwrite,
+                      basepath=args.basepath)
 
 if __name__ == '__main__':
     main()
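A short usage sketch of the renamed entry point (the folder path is hypothetical; overwrite and the new recursive parameter come straight from the signature above):

from megadetector.utils.directory_listing import create_html_index

# Regenerate index.html files under a preview folder, replacing any
# existing indices; recursive=False would index only the top level.
create_html_index('/data/camera-trap-previews',
                  overwrite=True,
                  recursive=True)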
megadetector/utils/path_utils.py
CHANGED

@@ -528,7 +528,8 @@ def find_images(dirname,
 def clean_filename(filename,
                    allow_list=VALID_FILENAME_CHARS,
                    char_limit=CHAR_LIMIT,
-                   force_lower=False):
+                   force_lower=False,
+                   remove_trailing_leading_whitespace=True):
     r"""
     Removes non-ASCII and other invalid filename characters (on any
     reasonable OS) from a filename, then optionally trims to a maximum length.
@@ -544,11 +545,27 @@ def clean_filename(filename,
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
+        remove_trailing_leading_whitespace (bool, optional): remove trailing and
+            leading whitespace from each component of a path, e.g. does not allow
+            a/b/c /d.jpg
     Returns:
         str: cleaned version of [filename]
     """
 
+    if remove_trailing_leading_whitespace:
+
+        # Best effort to preserve the original separator
+        separator = '/'
+        if '\\' in filename:
+            separator = '\\'
+
+        filename = filename.replace('\\','/')
+        components = filename.split('/')
+        clean_components = [c.strip() for c in components]
+        filename = separator.join(clean_components)
+        if separator == '\\':
+            filename = filename.replace('/','\\')
+
     # keep only valid ascii chars
     cleaned_filename = (unicodedata.normalize('NFKD', filename)
                         .encode('ASCII', 'ignore').decode())
@@ -565,7 +582,8 @@ def clean_filename(filename,
 def clean_path(pathname,
                allow_list=VALID_PATH_CHARS,
                char_limit=CHAR_LIMIT,
-               force_lower=False):
+               force_lower=False,
+               remove_trailing_leading_whitespace=True):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then optionally trims to a maximum length.
@@ -576,13 +594,20 @@ def clean_path(pathname,
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
+        remove_trailing_leading_whitespace (bool, optional): remove trailing and
+            leading whitespace from each component of a path, e.g. does not allow
+            a/b/c /d.jpg
 
     Returns:
         str: cleaned version of [filename]
     """
 
-    return clean_filename(pathname,
-[...]
+    return clean_filename(pathname,
+                          allow_list=allow_list,
+                          char_limit=char_limit,
+                          force_lower=force_lower,
+                          remove_trailing_leading_whitespace=\
+                              remove_trailing_leading_whitespace)
 
 
 def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
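Based on the component-wise stripping logic added above, whitespace around each path component is removed before the ASCII cleanup. Expected behavior, inferred from the new code rather than taken from the package's tests (and assuming spaces remain in VALID_PATH_CHARS):

from megadetector.utils.path_utils import clean_path

# Each component between separators is strip()ed; clean_path permits
# separators, unlike clean_filename, which removes them entirely.
assert clean_path('a/b/c /d.jpg') == 'a/b/c/d.jpg'

# The old behavior remains available via the new keyword
assert clean_path('a/b/c /d.jpg',
                  remove_trailing_leading_whitespace=False) == 'a/b/c /d.jpg'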
@@ -1553,6 +1578,7 @@ class TestPathUtils:
         """
 
         self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
+        print('Using temporary folder {} for path utils testing'.format(self.test_dir))
         os.makedirs(self.test_dir, exist_ok=True)
 
 
@@ -1776,7 +1802,11 @@ class TestPathUtils:
             ])
         folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
                                                 return_relative_paths=False)
-        assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
+        assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs, \
+            'Non-recursive folder list failured, expected:\n\n{}\n\nFound:\n\n{}'.format(
+                str(expected_folders_non_recursive_abs),
+                str(folders_non_recursive_abs)
+            )
 
         # Test non-recursive, relative paths
         expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
@@ -2114,7 +2144,17 @@ class TestPathUtils:
         assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
         assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
         assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
-
+
+        s = " accented_name_éà.txt"
+
+        assert clean_filename(s,
+            remove_trailing_leading_whitespace=False) == " accented_name_ea.txt", \
+            'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
+            clean_filename(s, remove_trailing_leading_whitespace=False))
+
+        assert clean_filename(s, remove_trailing_leading_whitespace=True) == "accented_name_ea.txt", \
+            'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
+            clean_filename(s, remove_trailing_leading_whitespace=True))
 
         # Separators are not allowed by default in clean_filename
         assert clean_filename("path/to/file.txt") == "pathtofile.txt"
@@ -2444,7 +2484,13 @@ class TestPathUtils:
         un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
         os.makedirs(un_tar_dir, exist_ok=True)
         with tarfile.open(output_tar_path, 'r:gz') as tf:
-            tf.extractall(path=un_tar_dir)
+            # The "filter" option was added as of Python 3.12, and *not* specifying
+            # filter=None will change behavior as of Python 3.14. We want the unmodified
+            # behavior, but we want to support Python <3.12, so we do a version check.
+            if sys.version_info >= (3, 12):
+                tf.extractall(path=un_tar_dir, filter=None)
+            else:
+                tf.extractall(path=un_tar_dir)
 
         expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
         expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
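The comment added to the test explains the motivation: tarfile's extraction methods gained a filter parameter in Python 3.12, and the default extraction behavior is slated to change in Python 3.14, so the test pins down the unmodified behavior while staying runnable on older interpreters. The same version-gated pattern is reusable as a generic helper (a sketch, not code from this package):

import sys
import tarfile

def extract_all_compat(tar_path, dest_dir):
    # Pass the 3.12+ 'filter' argument only where it exists, so the
    # call runs unchanged on older interpreters.
    with tarfile.open(tar_path, 'r:gz') as tf:
        if sys.version_info >= (3, 12):
            tf.extractall(path=dest_dir, filter=None)
        else:
            tf.extractall(path=dest_dir)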
@@ -2618,7 +2664,9 @@ def test_path_utils():
 
     test_instance = TestPathUtils()
     test_instance.set_up()
+
     try:
+
         test_instance.test_is_image_file()
         test_instance.test_find_image_strings()
         test_instance.test_find_images()
@@ -2643,5 +2691,7 @@ def test_path_utils():
         test_instance.test_add_files_to_single_tar_file()
         test_instance.test_parallel_zip_individual_files_and_folders()
         test_instance.test_compute_file_hash()
+
     finally:
+
         test_instance.tear_down()
megadetector/utils/url_utils.py
CHANGED

@@ -2,7 +2,7 @@
 
 url_utils.py
 
-Frequently-used functions for downloading or [...]
+Frequently-used functions for downloading, manipulating, or serving URLs
 
 """
 
@@ -16,6 +16,9 @@ import urllib.error
 import requests
 import shutil
 import pytest
+import socketserver
+import threading
+import http.server
 
 from functools import partial
 from tqdm import tqdm
@@ -453,6 +456,93 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     return url_to_size
 
 
+#%% Singleton HTTP server
+
+class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+    """
+    SimpleHTTPRequestHandler sublcass that suppresses console printouts
+    """
+    def __init__(self, *args, directory=None, **kwargs):
+        super().__init__(*args, directory=directory, **kwargs)
+
+    def log_message(self, format, *args): # noqa
+        pass
+
+
+class SingletonHTTPServer:
+    """
+    HTTP server that runs on a local port, serving a particular local folder. Runs as a
+    singleton, so starting a server in a new folder closes the previous server. I use this
+    primarily to serve MD/SpeciesNet previews from manage_local_batch, which can exceed
+    the 260-character filename length limitation imposed by browser on Windows, so really the
+    point here is just to remove characters from the URL.
+    """
+
+    _server = None
+    _thread = None
+
+    @classmethod
+    def start_server(cls, directory, port=8000, host='localhost'):
+        """
+        Start or restart the HTTP server with a specific directory
+
+        Args:
+            directory (str): the root folder served by the server
+            port (int, optional): the port on which to create the server
+            host (str, optional): the host on which to listen, typically
+                either "localhost" (default) or "0.0.0.0"
+
+        Returns:
+            str: URL to the running host
+        """
+
+        # Stop the existing server instance if necessary
+        cls.stop_server()
+
+        # Create new server
+        handler = partial(QuietHTTPRequestHandler, directory=directory)
+        cls._server = socketserver.TCPServer((host, port), handler)
+
+        # Start server in daemon thread (dies when parent process dies)
+        cls._thread = threading.Thread(target=cls._server.serve_forever)
+        cls._thread.daemon = True
+        cls._thread.start()
+
+        print(f"Serving {directory} at http://{host}:{port}")
+        return f"http://{host}:{port}"
+
+
+    @classmethod
+    def stop_server(cls):
+        """
+        Stop the current server (if one is running)
+        """
+
+        if cls._server:
+            cls._server.shutdown()
+            cls._server.server_close()
+            cls._server = None
+        if cls._thread:
+            cls._thread.join(timeout=1)
+            cls._thread = None
+
+
+    @classmethod
+    def is_running(cls):
+        """
+        Check whether the server is currently running.
+
+        Returns:
+            bool: True if the server is running
+        """
+
+        return (cls._server is not None) and \
+               (cls._thread is not None) and \
+               (cls._thread.is_alive())
+
+# ...class SingletonHTTPServer
+
+
 #%% Tests
 
 # Constants for tests
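The SingletonHTTPServer class is the bulk of this release's additions to url_utils.py. A usage sketch (the served folder is hypothetical):

from megadetector.utils.url_utils import SingletonHTTPServer

# Serve a local preview folder; starting a server on a new folder
# shuts down the previous one, per the singleton contract.
base_url = SingletonHTTPServer.start_server('/data/preview-output', port=8000)
print(base_url)  # http://localhost:8000

assert SingletonHTTPServer.is_running()
SingletonHTTPServer.stop_server()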