megadetector 5.0.24__py3-none-any.whl → 5.0.26__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- megadetector/data_management/cct_json_utils.py +15 -2
- megadetector/data_management/coco_to_yolo.py +53 -31
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
- megadetector/data_management/databases/integrity_check_json_db.py +2 -2
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +73 -69
- megadetector/data_management/lila/add_locations_to_nacti.py +114 -110
- megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
- megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
- megadetector/data_management/remap_coco_categories.py +60 -11
- megadetector/data_management/{wi_to_md.py → speciesnet_to_md.py} +2 -2
- megadetector/data_management/yolo_to_coco.py +45 -15
- megadetector/detection/run_detector.py +1 -0
- megadetector/detection/run_detector_batch.py +5 -4
- megadetector/postprocessing/classification_postprocessing.py +788 -524
- megadetector/postprocessing/compare_batch_results.py +176 -9
- megadetector/postprocessing/create_crop_folder.py +420 -0
- megadetector/postprocessing/load_api_results.py +4 -1
- megadetector/postprocessing/md_to_coco.py +1 -1
- megadetector/postprocessing/postprocess_batch_results.py +158 -44
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- megadetector/postprocessing/separate_detections_into_folders.py +20 -4
- megadetector/postprocessing/subset_json_detector_output.py +180 -15
- megadetector/postprocessing/validate_batch_results.py +13 -5
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
- megadetector/taxonomy_mapping/species_lookup.py +45 -2
- megadetector/utils/ct_utils.py +76 -3
- megadetector/utils/directory_listing.py +4 -4
- megadetector/utils/gpu_test.py +21 -3
- megadetector/utils/md_tests.py +142 -49
- megadetector/utils/path_utils.py +342 -19
- megadetector/utils/wi_utils.py +1286 -212
- megadetector/visualization/visualization_utils.py +16 -4
- megadetector/visualization/visualize_db.py +1 -1
- megadetector/visualization/visualize_detector_output.py +1 -4
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/METADATA +6 -3
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/RECORD +41 -40
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/WHEEL +1 -1
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info/licenses}/LICENSE +0 -0
- {megadetector-5.0.24.dist-info → megadetector-5.0.26.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/subset_json_detector_output.py CHANGED

@@ -61,9 +61,11 @@ import os
 import re
 
 from tqdm import tqdm
+from collections import defaultdict
 
 from megadetector.utils.ct_utils import args_to_object, get_max_conf, invert_dictionary
 from megadetector.utils.path_utils import top_level_folder
+from megadetector.utils.path_utils import recursive_file_list
 
 
 #%% Helper classes
@@ -136,7 +138,18 @@ class SubsetJsonDetectorOutputOptions:
 
         #: Set to >0 during testing to limit the number of images that get processed.
         self.debug_max_images = -1
+
+        #: Keep only files in this list, which can be a .json results file or a folder.
+        #
+        #: Assumes that the input .json file contains relative paths when comparing to a folder.
+        self.keep_files_in_list = None
+
+        #: Remove classification with <= N instances. Does not re-map categories
+        #: to be contiguous. Set to 1 to remove empty categories only.
+        self.remove_classification_categories_below_count = None
 
+# ...class SubsetJsonDetectorOutputOptions
+
 
 #%% Main function
 
@@ -156,11 +169,104 @@ def _write_detection_results(data, output_filename, options):
     else:
         os.makedirs(basedir, exist_ok=True)
 
-
-
+    n_images = len(data['images'])
+
+    print('Writing detection output (with {} images) to {}'.format(n_images,output_filename))
+    with open(output_filename, 'w', newline='\n') as f:
         json.dump(data,f,indent=1)
 
-# ..._write_detection_results()
+# ...def _write_detection_results(...)
+
+
+def remove_classification_categories_below_count(data, options):
+    """
+    Removes all classification categories below a threshold count. Does not re-map
+    classification category IDs.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.remove_classification_categories_below_count is None:
+        return data
+    if 'classification_categories' not in data:
+        return data
+
+    classification_category_id_to_count = {}
+
+    for classification_category_id in data['classification_categories']:
+        classification_category_id_to_count[classification_category_id] = 0
+
+    # Count the number of occurrences of each classification category
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            for classification in det['classifications']:
+                classification_category_id_to_count[classification[0]] = \
+                    classification_category_id_to_count[classification[0]] + 1
+
+
+    # Which categories have above-threshold counts?
+    classification_category_ids_to_keep = set()
+
+    for classification_category_id in classification_category_id_to_count:
+        if classification_category_id_to_count[classification_category_id] > \
+            options.remove_classification_categories_below_count:
+            classification_category_ids_to_keep.add(classification_category_id)
+
+    n_categories_removed = \
+        len(classification_category_id_to_count) - \
+        len(classification_category_ids_to_keep)
+
+    print('Removing {} of {} classification categories'.format(
+        n_categories_removed,len(classification_category_id_to_count)))
+
+    if n_categories_removed == 0:
+        return data
+
+
+    # Filter the category list
+    output_classification_categories = {}
+    for category_id in data['classification_categories']:
+        if category_id in classification_category_ids_to_keep:
+            output_classification_categories[category_id] = \
+                data['classification_categories'][category_id]
+    data['classification_categories'] = output_classification_categories
+    assert len(data['classification_categories']) == len(classification_category_ids_to_keep)
+
+
+    # If necessary, filter the category descriptions
+    if 'classification_category_descriptions' in data:
+        output_classification_category_descriptions = {}
+        for category_id in data['classification_category_descriptions']:
+            if category_id in classification_category_ids_to_keep:
+                output_classification_category_descriptions[category_id] = \
+                    data['classification_category_descriptions'][category_id]
+        data['classification_category_descriptions'] = output_classification_category_descriptions
+
+    # Filter images
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            classifications_to_keep = []
+            for classification in det['classifications']:
+                if classification[0] in classification_category_ids_to_keep:
+                    classifications_to_keep.append(classification)
+            det['classifications'] = classifications_to_keep
+
+    return data
+
+# ...def remove_classification_categories_below_count(...)
 
 
 def subset_json_detector_output_by_confidence(data, options):
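For orientation, here is a minimal sketch of how the new count-based filter above might be driven programmatically. The class and functions are the ones defined in this file; the filenames and the threshold value are hypothetical.

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, subset_json_detector_output

options = SubsetJsonDetectorOutputOptions()

# Drop classification categories with <= 10 instances; per the docstring,
# category IDs are deliberately not re-mapped, so downstream tools will see
# a sparse ID space.
options.remove_classification_categories_below_count = 10

# 'md-results.json' is a hypothetical MD results file
subset_json_detector_output('md-results.json', 'md-results-subset.json', options)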
@@ -172,7 +278,7 @@ def subset_json_detector_output_by_confidence(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     if options.confidence_threshold is None:
@@ -234,9 +340,55 @@ def subset_json_detector_output_by_confidence(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_confidence()
+# ...def subset_json_detector_output_by_confidence(...)
+
+
+def subset_json_detector_output_by_list(data, options):
+    """
+    Keeps only files in options.keep_files_in_list, which can be a .json results file or a folder.
+    Assumes that the input .json file contains relative paths when comparing to a folder.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.keep_files_in_list is None:
+        return
+
+    files_to_keep = None
+
+    if os.path.isfile(options.keep_files_in_list):
+        with open(options.keep_files_in_list,'r') as f:
+            d = json.load(f)
+        files_to_keep = [im['file'] for im in d['images']]
+    elif os.path.isdir(options.keep_files_in_list):
+        files_to_keep = \
+            recursive_file_list(options.keep_files_in_list,return_relative_paths=True)
+    else:
+        raise ValueError('Subsetting .json file by list: {} is neither a .json results file nor a folder'.format(
+            options.keep_files_in_list))
+
+    files_to_keep = [fn.replace('\\','/') for fn in files_to_keep]
+    files_to_keep_set = set(files_to_keep)
+
+    images_to_keep = []
+
+    for im in data['images']:
+        fn = im['file'].replace('\\','/')
+        if fn in files_to_keep_set:
+            images_to_keep.append(im)
+
+    data['images'] = images_to_keep
+
+    return data
 
+# ...def subset_json_detector_output_by_list(...)
 
+
 def subset_json_detector_output_by_categories(data, options):
     """
     Removes all detections without detections above a threshold for specific categories.
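A companion sketch for the list-based filter added above, continuing the previous example with hypothetical paths: a folder argument keeps only images that exist on disk (relative paths assumed), while a .json argument keeps only images present in that results file.

options = SubsetJsonDetectorOutputOptions()
options.keep_files_in_list = '/data/camera-traps/deployment-01'  # hypothetical folder
subset_json_detector_output('md-results.json', 'md-results-filtered.json', options)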
@@ -246,7 +398,7 @@ def subset_json_detector_output_by_categories(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     # If categories_to_keep is supplied as a list, convert to a dict
@@ -342,7 +494,7 @@ def subset_json_detector_output_by_categories(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_categories()
+# ...def subset_json_detector_output_by_categories(...)
 
 
 def remove_failed_images(data,options):
@@ -354,7 +506,7 @@ def remove_failed_images(data,options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -381,7 +533,7 @@ def remove_failed_images(data,options):
 
     return data
 
-# ...remove_failed_images()
+# ...def remove_failed_images(...)
 
 
 def subset_json_detector_output_by_query(data, options):
@@ -394,7 +546,7 @@ def subset_json_detector_output_by_query(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -441,7 +593,7 @@ def subset_json_detector_output_by_query(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_query()
+# ...def subset_json_detector_output_by_query(...)
 
 
 def subset_json_detector_output(input_filename, output_filename, options, data=None):
@@ -481,10 +633,10 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
         raise ValueError('When splitting by folders, output must be a valid directory name, you specified an existing file')
 
     if data is None:
-        print('Reading
+        print('Reading file {}'.format(input_filename))
         with open(input_filename) as f:
             data = json.load(f)
-        print('
+        print('Read {} images'.format(len(data['images'])))
     if options.debug_max_images > 0:
         print('Trimming to {} images'.format(options.debug_max_images))
         data['images'] = data['images'][:options.debug_max_images]
@@ -500,7 +652,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if options.remove_failed_images:
 
         data = remove_failed_images(data, options)
-
+
     if options.confidence_threshold is not None:
 
         data = subset_json_detector_output_by_confidence(data, options)
@@ -508,6 +660,14 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
     if (options.categories_to_keep is not None) or (options.category_names_to_keep is not None):
 
         data = subset_json_detector_output_by_categories(data, options)
+
+    if options.remove_classification_categories_below_count is not None:
+
+        data = remove_classification_categories_below_count(data, options)
+
+    if options.keep_files_in_list is not None:
+
+        data = subset_json_detector_output_by_list(data, options)
 
     if not options.split_folders:
 
@@ -615,7 +775,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=None):
 
     # ...if we're splitting folders
 
-# ...subset_json_detector_output()
+# ...def subset_json_detector_output(...)
 
 
 #%% Interactive driver
@@ -676,6 +836,9 @@ def main():
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--keep_files_in_list', type=str, default=None,
+                        help='Keep only files in this list, which can be a .json results file or a folder.' + \
+                             ' Assumes that the input .json file contains relative paths when comparing to a folder.')
     parser.add_argument('--split_folders', action='store_true',
                         help='Split .json files by leaf-node folder')
     parser.add_argument('--split_folder_param', type=int,
@@ -690,6 +853,8 @@ def main():
                         help='When using split_folders and make_folder_relative, copy jsons to their corresponding folders (relative to output_file)')
     parser.add_argument('--create_folders', action='store_true',
                         help='When using copy_jsons_to_folders, create folders that don''t exist')
+    parser.add_argument('--remove_classification_categories_below_count', type=int, default=None,
+                        help='Remove classification categories with less than this many instances (no removal by default)')
 
     if len(sys.argv[1:]) == 0:
        parser.print_help()
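Per the wiring shown above, the two new options compose with the existing ones: the count filter runs after category subsetting, and the list filter runs after that. A combined sketch, again with hypothetical filenames:

options = SubsetJsonDetectorOutputOptions()
options.remove_classification_categories_below_count = 1  # drop empty categories only
options.keep_files_in_list = 'ground-truth-results.json'  # hypothetical results file
subset_json_detector_output('md-results.json', 'md-results-subset.json', options)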
megadetector/postprocessing/validate_batch_results.py CHANGED

@@ -20,11 +20,19 @@ from tqdm import tqdm
 from megadetector.detection.video_utils import is_video_file
 from megadetector.utils.ct_utils import args_to_object, is_list_sorted # noqa
 
-typical_info_fields = ['detector',
-                       '
-                       '
-
-
+typical_info_fields = ['detector',
+                       'detection_completion_time',
+                       'classifier',
+                       'classification_completion_time',
+                       'detection_metadata',
+                       'classifier_metadata']
+
+required_keys = ['info',
+                 'images',
+                 'detection_categories']
+
+typical_keys = ['classification_categories',
+                'classification_category_descriptions']
 
 
 #%% Classes
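These module-level lists read as the validation contract: required_keys must be present in a results file, while typical_keys and typical_info_fields are merely expected. A hedged sketch of that kind of check, not necessarily how validate_batch_results.py itself consumes the lists:

import json

with open('md-results.json') as f:  # hypothetical filename
    data = json.load(f)

for k in required_keys:
    assert k in data, 'Results file is missing required key {}'.format(k)

for k in data.keys():
    if (k not in required_keys) and (k not in typical_keys):
        print('Warning: unexpected key {}'.format(k))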
megadetector/taxonomy_mapping/map_new_lila_datasets.py CHANGED

@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/
+output_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 datasets_to_map = [
-    '
+    'UNSW Predators'
 ]
 
 
@@ -125,6 +125,8 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)
 
+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+
 
 #%% Manual lookup
 
@@ -138,10 +140,8 @@ if False:
 
     #%%
 
-
-
-    # q = 'notamacropus'
-    q = 'insects'
+    q = 'dasyurus maculatus'
+
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
megadetector/taxonomy_mapping/preview_lila_taxonomy.py CHANGED

@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -72,65 +72,10 @@ from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup()
 
 
-#%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
-if False:
-
-    #%%
-
-    source_mappings = ['gbif','manual']
-    target_mapping = 'inat'
-    valid_mappings = ['gbif','inat','manual']
-
-    assert target_mapping in valid_mappings
-    for source_mapping in source_mappings:
-        assert source_mapping != target_mapping and \
-            source_mapping in valid_mappings
-
-    n_remappings = 0
-
-    # i_row = 1; row = df.iloc[i_row]; row
-    for i_row,row in df.iterrows():
-
-        if row['source'] not in source_mappings:
-            continue
-
-        scientific_name = row['scientific_name']
-        old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-        m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-        if m is None or m.source != target_mapping:
-            print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-            continue
-
-        assert m.scientific_name == row['scientific_name']
-
-        if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-            pass
-        else:
-            assert m.taxonomic_level == row['taxonomy_level']
-
-        new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-        if row['taxonomy_string'] != m.taxonomy_string:
-            print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-            n_remappings += 1
-            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-            if row['source'] != 'manual':
-                df.loc[i_row,'source'] = m.source
-
-    # This should be zero for the release .csv
-    print('Made {} remappings'.format(n_remappings))
-
-    #%%
-
-    df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
 #%% Check for mappings that disagree with the taxonomy string
 
+# For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
 df = pd.read_csv(lila_taxonomy_file)
 
 n_taxonomy_changes = 0
megadetector/taxonomy_mapping/species_lookup.py CHANGED

@@ -602,8 +602,17 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for
-    preferences that are specific to our scenario.
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    and preferences that are specific to our scenario.
+
+    Args:
+        query (str): The common or scientific name we want to look up
+        taxonomy_preference (str, optional): 'inat' or 'gbif'
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
+            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
+
+    Returns:
+        TaxonomicMatch: the best taxonomic match, or None
     """
 
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
@@ -616,6 +625,36 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     return m
 
 
+def validate_and_convert(data):
+    """
+    Recursively validates that all elements in the nested structure are only
+    tuples, lists, ints, or np.int64, and converts np.int64 to int.
+
+    Args:
+        data: The nested structure to validate and convert
+
+    Returns:
+        The validated and converted structure
+
+    Raises:
+        TypeError: If an invalid type is encountered
+    """
+
+    if isinstance(data, np.int64):
+        return int(data)
+    elif isinstance(data, int) or isinstance(data, str):
+        return data
+    elif isinstance(data, (list, tuple)):
+        # Process lists and tuples recursively
+        container_type = type(data)
+        return container_type(validate_and_convert(item) for item in data)
+    else:
+        raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                        f"Only int, np.int64, list, and tuple are allowed.")
+
+# ...def validate_and_convert(...)
+
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
 
     query = query.lower().strip().replace('_', ' ')
@@ -760,6 +799,10 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
     # ...if we needed to look in the GBIF taxonomy
 
+    # Convert np.int64's to ints
+    if match is not None:
+        match = validate_and_convert(match)
+
     taxonomy_string = str(match)
 
     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
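The practical payoff of validate_and_convert is that np.int64 values, which json.dump cannot serialize, come back as plain ints while the container shapes are preserved. A quick illustration based directly on the code above:

import numpy as np
from megadetector.taxonomy_mapping.species_lookup import validate_and_convert

nested = [np.int64(3), (1, [np.int64(7), 2])]
converted = validate_and_convert(nested)

assert converted == [3, (1, [7, 2])]
assert type(converted[0]) is int  # no longer np.int64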
megadetector/utils/ct_utils.py CHANGED

@@ -483,7 +483,9 @@ def sort_dictionary_by_key(d,reverse=False):
 def sort_dictionary_by_value(d,sort_values=None,reverse=False):
     """
     Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
-    otherwise uses the dictionary sort_values as the sorting criterion.
+    otherwise uses the dictionary sort_values as the sorting criterion. Always
+    returns a new standard dict, so if [d] is, for example, a defaultdict, the
+    returned value is not.
 
     Args:
         d (dict): dictionary to sort
@@ -492,7 +494,7 @@ def sort_dictionary_by_value(d,sort_values=None,reverse=False):
         reverse (bool, optional): whether to sort in reverse (descending) order
 
     Returns:
-        dict: sorted copy of [d
+        dict: sorted copy of [d
     """
 
     if sort_values is None:
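The clarified docstring is easy to confirm: even when the input is a defaultdict, the sorted result comes back as a plain dict.

from collections import defaultdict
from megadetector.utils.ct_utils import sort_dictionary_by_value

d = defaultdict(int)
d['a'] = 2
d['b'] = 1

sorted_d = sort_dictionary_by_value(d)
assert list(sorted_d.keys()) == ['b', 'a']  # ascending by value
assert type(sorted_d) is dict  # not a defaultdict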
@@ -517,6 +519,52 @@ def invert_dictionary(d):
     return {v: k for k, v in d.items()}
 
 
+def round_floats_in_nested_dict(obj, decimal_places=5):
+    """
+    Recursively rounds all floating point values in a nested structure to the
+    specified number of decimal places. Handles dictionaries, lists, tuples,
+    sets, and other iterables. Modifies mutable objects in place.
+
+    Args:
+        obj: The object to process (can be a dict, list, set, tuple, or primitive value)
+        decimal_places: Number of decimal places to round to (default: 5)
+
+    Returns:
+        The processed object (useful for recursive calls)
+    """
+    if isinstance(obj, dict):
+        for key in obj:
+            obj[key] = round_floats_in_nested_dict(obj[key], decimal_places)
+        return obj
+
+    elif isinstance(obj, list):
+        for i in range(len(obj)):
+            obj[i] = round_floats_in_nested_dict(obj[i], decimal_places)
+        return obj
+
+    elif isinstance(obj, tuple):
+        # Tuples are immutable, so we create a new one
+        return tuple(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, set):
+        # Sets are mutable but we can't modify elements in-place
+        # Convert to list, process, and convert back to set
+        return set(round_floats_in_nested_dict(list(obj), decimal_places))
+
+    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
+        # Handle other iterable types - convert to list, process, and convert back
+        return type(obj)(round_floats_in_nested_dict(item, decimal_places) for item in obj)
+
+    elif isinstance(obj, float):
+        return round(obj, decimal_places)
+
+    else:
+        # For other types (int, str, bool, None, etc.), return as is
+        return obj
+
+# ...def round_floats_in_nested_dict(...)
+
+
 def image_file_to_camera_folder(image_fn):
     r"""
     Removes common overflow folders (e.g. RECNX101, RECNX102) from paths, i.e. turn:
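One subtlety worth noting in round_floats_in_nested_dict: dicts and lists are modified in place, while tuples, sets, and other iterables are rebuilt, so a caller holding a reference to a nested list sees it change.

from megadetector.utils.ct_utils import round_floats_in_nested_dict

scores = [0.123456789, 0.987654321]
d = {'scores': scores}

round_floats_in_nested_dict(d, decimal_places=3)
assert scores == [0.123, 0.988]  # the original list was rounded in place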
@@ -780,7 +828,7 @@ def dict_to_kvp_list(d,
     if len(d) == 0:
         return ''
 
-    s =
+    s = None
     for k in d.keys():
         assert isinstance(k,str), 'Input {} is not a str <--> str dict'.format(str(d))
         v = d[k]
@@ -800,6 +848,9 @@ def dict_to_kvp_list(d,
             s += item_separator
         s += k + kv_separator + v
 
+    if s is None:
+        s = ''
+
 
     return s
 
@@ -856,3 +907,25 @@ def __module_test__():
     L = [{'a':5},{'a':0},{'a':10}]
     k = 'a'
     sort_list_of_dicts_by_key(L, k, reverse=True)
+
+
+    ##%% Test float rounding
+
+    # Example with mixed collection types
+    data = {
+        "name": "Project X",
+        "values": [1.23456789, 2.3456789],
+        "tuple_values": (3.45678901, 4.56789012),
+        "set_values": {5.67890123, 6.78901234},
+        "metrics": {
+            "score": 98.7654321,
+            "components": [5.6789012, 6.7890123]
+        }
+    }
+
+    result = round_floats_in_nested_dict(data)
+    assert result['values'][0] == 1.23457
+    assert result['tuple_values'][0] == 3.45679
+    assert min(list(result['set_values'])) == 5.6789
+
megadetector/utils/directory_listing.py CHANGED

@@ -17,9 +17,6 @@ import sys
 import argparse
 import re
 
-import azure.common
-from azure.storage.blob import BlobServiceClient, ContentSettings
-
 from megadetector.utils.path_utils import is_image_file
 
 
@@ -139,6 +136,8 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
     # If we want to set the content type in blob storage using a SAS URL
     if sas_url:
 
+        from azure.storage.blob import BlobServiceClient, ContentSettings
+
         # Example: sas_url = 'https://accname.blob.core.windows.net/bname/path/to/folder?st=...&se=...&sp=...&...'
         if '?' in sas_url:
             # 'https://accname.blob.core.windows.net/bname/path/to/folder' and 'st=...&se=...&sp=...&...'
@@ -196,6 +195,7 @@ def traverse_and_create_index(dir, sas_url=None, overwrite_files=False,
 
     # Set content type in blob storage
     if sas_url:
+        import azure.common
         if container_folder:
             output_blob_path = container_folder + '/' + output_file[len(dir) + 1:]
         else:
@@ -237,7 +237,7 @@ def main():
     args = parser.parse_args()
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
-    assert re.match('https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
+    assert re.match(r'https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
         "match the format https://accname.blob.core.windows.net/bname/path/to/folder?..."
 
     traverse_and_create_index(args.directory, overwrite_files=args.enable_overwrite, sas_url=args.sas_url, basepath=args.basepath)
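Two small notes on this last file: moving the azure imports into the sas_url branches means the module now imports cleanly without the Azure SDK installed, and the r'' prefix on the regex avoids the invalid-escape-sequence warning that newer Python versions emit for \. in a non-raw string. A minimal sketch of the deferred-import pattern, using a hypothetical helper name:

def _set_content_types(sas_url=None):
    # Hypothetical helper illustrating the pattern used above: the Azure SDK
    # is only imported when a SAS URL is actually supplied, so everything
    # else in the module works without azure-storage-blob installed.
    if not sas_url:
        return
    from azure.storage.blob import BlobServiceClient, ContentSettings
    ...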