PyPI - megadetector - Versions diffs - 10.0.6__py3-none-any.whl → 10.0.8__py3-none-any.whl - Mend - Supply Chain Defender

megadetector 10.0.6__py3-none-any.whl → 10.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic. Click here for more details.

Files changed (19)

megadetector/postprocessing/classification_postprocessing.py (changed)

@@ -13,14 +13,15 @@ Functions for postprocessing species classification results, particularly:
 #%% Constants and imports
-import os
 import json
 import copy
+import pandas as pd
 from collections import defaultdict
 from tqdm import tqdm
 from megadetector.utils.ct_utils import is_list_sorted
+from megadetector.utils.ct_utils import is_empty
 from megadetector.utils.ct_utils import sort_dictionary_by_value
 from megadetector.utils.ct_utils import sort_dictionary_by_key
 from megadetector.utils.ct_utils import invert_dictionary
@@ -29,9 +30,9 @@ from megadetector.utils.wi_taxonomy_utils import clean_taxonomy_string
 from megadetector.utils.wi_taxonomy_utils import taxonomy_level_index
 from megadetector.utils.wi_taxonomy_utils import taxonomy_level_string_to_index
-from megadetector.utils.wi_taxonomy_utils import non_taxonomic_prediction_strings
 from megadetector.utils.wi_taxonomy_utils import human_prediction_string
 from megadetector.utils.wi_taxonomy_utils import animal_prediction_string
+from megadetector.utils.wi_taxonomy_utils import is_taxonomic_prediction_string
 from megadetector.utils.wi_taxonomy_utils import blank_prediction_string # noqa
@@ -129,7 +130,7 @@ class ClassificationSmoothingOptions:
         ## Populated internally
-        #: #: Only include these categories in the smoothing process (None to use all categories)
+        #: Only include these categories in the smoothing process (None to use all categories)
         self._detection_category_ids_to_smooth = None
@@ -1014,6 +1015,10 @@ def smooth_classification_results_sequence_level(input_file,
         detections_this_sequence = []
         for image_filename in image_filenames_this_sequence:
+            if image_filename not in image_fn_to_classification_results:
+                print('Warning: {} in sequence list but not in results'.format(
+                    image_filename))
+                continue
             im = image_fn_to_classification_results[image_filename]
             if 'detections' not in im or im['detections'] is None:
                 continue
@@ -1101,16 +1106,16 @@ def restrict_to_taxa_list(taxa_list,
                           output_file,
                           allow_walk_down=False,
                           add_pre_filtering_description=True,
-                          allow_redundant_latin_names=False):
+                          allow_redundant_latin_names=True,
+                          protected_common_names=None,
+                          use_original_common_names_if_available=True,
+                          verbose=True):
     """
     Given a prediction file in MD .json format, likely without having had
     a geofence applied, apply a custom taxa list.
     Args:
-        taxa_list (str or list): list of latin names, or a text file containing
-            a list of latin names.  Optionally may contain a second (comma-delimited)
-            column containing common names, used only for debugging.  Latin names
-            must exist in the SpeciesNet taxonomy.
+        taxa_list (str): .csv file with at least the columns "latin" and "common".
         speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for
             model release (with 7-token taxonomy entries)
         input_file (str): .json file to read, in MD format.  This can be None, in which
@@ -1128,45 +1133,73 @@ def restrict_to_taxa_list(taxa_list,
             if the same latin name appears twice in the taxonomy list; if True, we'll
             just print a warning and ignore all entries other than the first for this
             latin name
+        protected_common_names (list, optional): these categories should be
+            unmodified, even if they aren't used, or have the same taxonomic
+            description as other categories
+        use_original_common_names_if_available (bool, optional): if an "original_common"
+            column is present in [taxa_list], use those common names instead of the ones
+            in the taxonomy file
+        verbose (bool, optional): enable additional debug output
     """
     ##%% Read target taxa list
-    if isinstance(taxa_list,str):
-        assert os.path.isfile(taxa_list), \
-            'Could not find taxa list file {}'.format(taxa_list)
-        with open(taxa_list,'r') as f:
-            taxa_list = f.readlines()
+    taxa_list_df = pd.read_csv(taxa_list)
+    required_columns = ('latin','common')
+    for s in required_columns:
+        assert s in taxa_list_df.columns, \
+            'Required column {} missing from taxonomy list file {}'.format(
+                s,taxa_list)
+    # Convert the "latin" and "common" columns in taxa_list_df to lowercase
+    taxa_list_df['latin'] = taxa_list_df['latin'].str.lower()
+    taxa_list_df['common'] = taxa_list_df['common'].str.lower()
-    taxa_list = [s.strip().lower() for s in taxa_list]
-    taxa_list = [s for s in taxa_list if len(s) > 0]
+    # Remove rows from taxa_list_df where the "latin" column is nan,
+    # printing a warning for each row (with a string representation of the whole row)
+    for i_row,row in taxa_list_df.iterrows():
+        if pd.isna(row['latin']):
+            if verbose:
+                print('Warning: Skipping row with empty "latin" column in {}:\n{}\n'.format(
+                    taxa_list,str(row.to_dict())))
+            taxa_list_df.drop(index=i_row, inplace=True)
+    # Convert all NaN values in the "common" column to empty strings
+    taxa_list_df['common'] = taxa_list_df['common'].fillna('')
+    # Create a dictionary mapping latin names to common names
     target_latin_to_common = {}
-    for s in taxa_list:
+    for i_row,row in taxa_list_df.iterrows():
-        if s.strip().startswith('#'):
-            continue
-        tokens = s.split(',')
-        # We allow additional columns now
-        # assert len(tokens) <= 2
-        binomial_name = tokens[0]
-        assert len(binomial_name.split(' ')) in (1,2,3), \
-            'Illegal binomial name in species list: {}'.format(binomial_name)
-        if len(tokens) > 0:
-            common_name = tokens[1].strip().lower()
-        else:
-            common_name = None
-        if binomial_name in target_latin_to_common:
-            error_string = 'scientific name {} appears multiple times in the taxonomy list'.format(
-                    binomial_name)
+        latin = row['latin']
+        common = row['common']
+        if use_original_common_names_if_available and \
+            ('original_common' in row) and \
+            (not is_empty(row['original_common'])):
+                common = row['original_common'].strip().lower()
+        # Valid latin names have either one token (e.g. "canidae"),
+        # two tokens (e.g. "bos taurus"), or three tokens (e.g. "canis lupus familiaris")
+        assert len(latin.split(' ')) in (1,2,3), \
+            'Illegal binomial name {} in taxaonomy list {}'.format(
+                latin,taxa_list)
+        if latin in target_latin_to_common:
+            error_string = \
+                'scientific name {} appears multiple times in the taxonomy list'.format(
+                latin)
             if allow_redundant_latin_names:
-                print('Warning: {}'.format(error_string))
+                if verbose:
+                    print('Warning: {}'.format(error_string))
             else:
                 raise ValueError(error_string)
-        target_latin_to_common[binomial_name] = common_name
-    # ...for each line in the taxonomy file
+        target_latin_to_common[latin] = common
+    # ...for each row in the custom taxonomy list
     ##%% Read taxonomy file
@@ -1185,7 +1218,7 @@ def restrict_to_taxa_list(taxa_list,
     def _insert_taxonomy_string(s):
         tokens = s.split(';')
-        assert len(tokens) == 7
+        assert len(tokens) == 7, 'Illegal taxonomy string {}'.format(s)
         guid = tokens[0] # noqa
         class_name = tokens[1]
@@ -1196,20 +1229,24 @@ def restrict_to_taxa_list(taxa_list,
         common_name = tokens[6]
         if len(class_name) == 0:
-            assert common_name in ('animal','vehicle','blank')
+            assert common_name in ('animal','vehicle','blank'), \
+                'Illegal common name {}'.format(common_name)
             return
         if len(species) > 0:
-            assert all([len(s) > 0 for s in [genus,family,order]])
+            assert all([len(s) > 0 for s in [genus,family,order]]), \
+                'Higher-level taxa missing for {}: {},{},{}'.format(s,genus,family,order)
             binomial_name = genus + ' ' + species
             if binomial_name not in speciesnet_latin_name_to_taxon_string:
                 speciesnet_latin_name_to_taxon_string[binomial_name] = s
         elif len(genus) > 0:
-            assert all([len(s) > 0 for s in [family,order]])
+            assert all([len(s) > 0 for s in [family,order]]), \
+                'Higher-level taxa missing for {}: {},{}'.format(s,family,order)
             if genus not in speciesnet_latin_name_to_taxon_string:
                 speciesnet_latin_name_to_taxon_string[genus] = s
         elif len(family) > 0:
-            assert len(order) > 0
+            assert len(order) > 0, \
+                'Higher-level taxa missing for {}: {}'.format(s,order)
             if family not in speciesnet_latin_name_to_taxon_string:
                 speciesnet_latin_name_to_taxon_string[family] = s
         elif len(order) > 0:
@@ -1232,12 +1269,19 @@ def restrict_to_taxa_list(taxa_list,
     # In theory any taxon that appears as the parent of another taxon should
     # also be in the taxonomy, but this isn't always true, so we fix it here.
     new_taxon_string_to_missing_tokens = defaultdict(list)
+    # While we're making this loop, also see whether we need to store any custom
+    # common name mappings based on the taxonomy list.
+    speciesnet_latin_name_to_output_common_name = {}
     # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
     for latin_name in speciesnet_latin_name_to_taxon_string.keys():
+        if latin_name in target_latin_to_common:
+            speciesnet_latin_name_to_output_common_name[latin_name] = \
+                target_latin_to_common[latin_name]
         if 'no cv result' in latin_name:
             continue
@@ -1260,7 +1304,8 @@ def restrict_to_taxa_list(taxa_list,
                 for i_copy_token in range(1,i_token+1):
                     new_tokens[i_copy_token] = tokens[i_copy_token]
                 new_tokens[-1] = test_token + ' species'
-                assert new_tokens[-2] == ''
+                assert new_tokens[-2] == '', \
+                    'Illegal taxonomy string {}'.format(taxon_string)
                 new_taxon_string = ';'.join(new_tokens)
                 # assert new_taxon_string not in new_taxon_strings
                 new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
@@ -1269,14 +1314,19 @@ def restrict_to_taxa_list(taxa_list,
     # ...for each taxon
-    print('Found {} taxa that need to be inserted to make the taxonomy valid:\n'.format(
-        len(new_taxon_string_to_missing_tokens)))
     new_taxon_string_to_missing_tokens = \
         sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
-    for taxon_string in new_taxon_string_to_missing_tokens:
-        missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
-        print('{} ({})'.format(taxon_string,missing_taxa))
+    if verbose:
+        print(f'Found {len(new_taxon_string_to_missing_tokens)} taxa that need to be inserted to ' + \
+              'make the taxonomy valid, showing only mammals and birds here:\n')
+        for taxon_string in new_taxon_string_to_missing_tokens:
+            if 'mammalia' not in taxon_string and 'aves' not in taxon_string:
+                continue
+            missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
+            print('{} ({})'.format(taxon_string,missing_taxa))
     for new_taxon_string in new_taxon_string_to_missing_tokens:
         _insert_taxonomy_string(new_taxon_string)
@@ -1298,7 +1348,7 @@ def restrict_to_taxa_list(taxa_list,
             n_failed_mappings += 1
     if n_failed_mappings > 0:
-        raise ValueError('Cannot continue with geofence generation')
+        raise ValueError('Cannot continue with taxonomic restriction')
     ##%% For the allow-list, map each parent taxon to a set of allowable child taxa
@@ -1312,7 +1362,8 @@ def restrict_to_taxa_list(taxa_list,
         taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
         tokens = taxon_string.split(';')
-        assert len(tokens) == 7
+        assert len(tokens) == 7, \
+            'Illegal taxonomy string {}'.format(taxon_string)
         # Remove GUID and common mame
         #
@@ -1324,25 +1375,85 @@ def restrict_to_taxa_list(taxa_list,
         # If this is a species
         if len(tokens[-1]) > 0:
             binomial_name = tokens[-2] + ' ' + tokens[-1]
-            assert binomial_name == latin_name
+            assert binomial_name == latin_name, \
+                'Binomial/latin mismatch: {} vs {}'.format(binomial_name,latin_name)
+            # If this already exists, it should only allow "None"
+            if binomial_name in allowed_parent_taxon_to_child_taxa:
+                assert len(allowed_parent_taxon_to_child_taxa[binomial_name]) == 1, \
+                    'Species-level entry {} has multiple children'.format(binomial_name)
+                assert None in allowed_parent_taxon_to_child_taxa[binomial_name], \
+                    'Species-level entry {} has non-None children'.format(binomial_name)
             allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
             child_taxon = binomial_name
-        # The first candidate parent is the genus
+        # The first level that can ever be a parent taxon is the genus level
         parent_token_index = len(tokens) - 2
+        # Walk up from genus to family
         while(parent_token_index >= 0):
+            # "None" is our leaf node marker, we should never have ''
+            if child_taxon is not None:
+                assert len(child_taxon) > 0
             parent_taxon = tokens[parent_token_index]
-            allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
-            child_taxon = parent_taxon
+            # Don't create entries for blank taxa
+            if (len(parent_taxon) > 0):
+                create_child = True
+                # This is the lowest-level taxon in this entry
+                if (child_taxon is None):
+                    # ...but we don't want to remove existing children from any parents
+                    if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                       (len(allowed_parent_taxon_to_child_taxa[parent_taxon]) > 0):
+                        if verbose:
+                            existing_children_string = str(allowed_parent_taxon_to_child_taxa[parent_taxon])
+                            print('Not creating empty child for parent {} (already has children {})'.format(
+                                parent_taxon,existing_children_string))
+                        create_child = False
+                # If we're adding a new child entry, clear out any leaf node markers
+                else:
+                    if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                       (None in allowed_parent_taxon_to_child_taxa[parent_taxon]):
+                        assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                            'Illlegal parent/child configuration'
+                        if verbose:
+                            print('Un-marking parent {} as a leaf node because of child {}'.format(
+                                parent_taxon,child_taxon))
+                        allowed_parent_taxon_to_child_taxa[parent_taxon] = set()
+                if create_child:
+                    allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
+                # If we haven't hit a non-empty taxon yet, don't update "child_taxon"
+                assert len(parent_taxon) > 0
+                child_taxon = parent_taxon
+            # ...if we have a non-empty taxon
             parent_token_index -= 1
+        # ...for each taxonomic level
     # ...for each allowed latin name
     allowed_parent_taxon_to_child_taxa = \
         sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
+    for parent_taxon in allowed_parent_taxon_to_child_taxa:
+        # "None" should only ever appear alone; this marks a leaf node with no children
+        if None in allowed_parent_taxon_to_child_taxa[parent_taxon]:
+            assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                '"None" should only appear alone in a child taxon list'
     ##%% If we were just validating the custom taxa file, we're done
@@ -1369,11 +1480,25 @@ def restrict_to_taxa_list(taxa_list,
         input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
         input_taxon_tokens = input_taxon_string.split(';')
-        assert len(input_taxon_tokens) == 7
+        assert len(input_taxon_tokens) == 7, \
+            'Illegal taxonomy string: {}'.format(input_taxon_string)
-        # Don't mess with blank/no-cv-result/animal/human
-        if (input_taxon_string in non_taxonomic_prediction_strings) or \
+        # Don't mess with blank/no-cv-result/human (or "animal", which is really "unknown")
+        if (not is_taxonomic_prediction_string(input_taxon_string)) or \
            (input_taxon_string == human_prediction_string):
+            if verbose:
+                print('Not messing with non-taxonomic category {}'.format(input_taxon_string))
+            input_category_id_to_output_taxon_string[input_category_id] = \
+                input_taxon_string
+            continue
+        # Don't mess with protected categories
+        common_name = input_taxon_tokens[-1]
+        if (protected_common_names is not None) and \
+            (common_name in protected_common_names):
+            if verbose:
+                print('Not messing with protected category {}'.format(common_name))
             input_category_id_to_output_taxon_string[input_category_id] = \
                 input_taxon_string
             continue
@@ -1403,19 +1528,23 @@ def restrict_to_taxa_list(taxa_list,
                 test_index -= 1
                 continue
-            assert test_taxon_name in speciesnet_latin_name_to_taxon_string
+            assert test_taxon_name in speciesnet_latin_name_to_taxon_string, \
+                '{} should be a substring of {}'.format(test_taxon_name,
+                                                        speciesnet_latin_name_to_taxon_string)
             # Is this taxon allowed according to the custom species list?
             if test_taxon_name in allowed_parent_taxon_to_child_taxa:
                 allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
-                assert allowed_child_taxa is not None
+                assert allowed_child_taxa is not None, \
+                    'allowed_child_taxa should not be None: {}'.format(test_taxon_name)
                 # If this is the lowest-level allowable token or there is not a
                 # unique child, don't walk any further, even if walking down
                 # is enabled.
-                if (None in allowed_child_taxa):
-                    assert len(allowed_child_taxa) == 1
+                if None in allowed_child_taxa:
+                    assert len(allowed_child_taxa) == 1, \
+                        '"None" should not be listed as a child taxa with other child taxa'
                 if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
                     target_taxon = test_taxon_name
@@ -1427,8 +1556,12 @@ def restrict_to_taxa_list(taxa_list,
                     while ((next(iter(allowed_child_taxa)) is not None) and \
                           (len(allowed_child_taxa) == 1)):
                         candidate_taxon = next(iter(allowed_child_taxa))
-                        assert candidate_taxon in allowed_parent_taxon_to_child_taxa
-                        assert candidate_taxon in speciesnet_latin_name_to_taxon_string
+                        assert candidate_taxon in allowed_parent_taxon_to_child_taxa, \
+                            '{} should be a subset of {}'.format(
+                                candidate_taxon,allowed_parent_taxon_to_child_taxa)
+                        assert candidate_taxon in speciesnet_latin_name_to_taxon_string, \
+                            '{} should be a subset of {}'.format(
+                                candidate_taxon,speciesnet_latin_name_to_taxon_string)
                         allowed_child_taxa = \
                             allowed_parent_taxon_to_child_taxa[candidate_taxon]
                     target_taxon = candidate_taxon
@@ -1450,21 +1583,30 @@ def restrict_to_taxa_list(taxa_list,
     ##%% Build the new tables
+    speciesnet_taxon_string_to_latin_name = invert_dictionary(speciesnet_latin_name_to_taxon_string)
     input_category_id_to_output_category_id = {}
     output_taxon_string_to_category_id = {}
     output_category_id_to_common_name = {}
     for input_category_id in input_category_id_to_output_taxon_string:
-        original_common_name = \
-            input_category_id_to_common_name[input_category_id]
-        original_taxon_string = \
-            input_category_id_to_taxonomy_string[input_category_id]
         output_taxon_string = \
             input_category_id_to_output_taxon_string[input_category_id]
         output_common_name = output_taxon_string.split(';')[-1]
+        # Possibly substitute a custom common name
+        if output_taxon_string in speciesnet_taxon_string_to_latin_name:
+            speciesnet_latin_name = speciesnet_taxon_string_to_latin_name[output_taxon_string]
+            if speciesnet_latin_name in speciesnet_latin_name_to_output_common_name:
+                custom_common_name = speciesnet_latin_name_to_output_common_name[speciesnet_latin_name]
+                if custom_common_name != output_common_name:
+                    print('Substituting common name {} for {}'.format(custom_common_name,output_common_name))
+                    output_common_name = custom_common_name
         # Do we need to create a new output category?
         if output_taxon_string not in output_taxon_string_to_category_id:
             output_category_id = str(len(output_taxon_string_to_category_id))
@@ -1479,21 +1621,28 @@ def restrict_to_taxa_list(taxa_list,
         input_category_id_to_output_category_id[input_category_id] = \
             output_category_id
+        # Sometimes-useful debug printouts
         if False:
+            original_common_name = \
+              input_category_id_to_common_name[input_category_id]
+            original_taxon_string = \
+                input_category_id_to_taxonomy_string[input_category_id]
             print('Mapping {} ({}) to:\n{} ({})\n'.format(
                 original_common_name,original_taxon_string,
                 output_common_name,output_taxon_string))
-        if False:
             print('Mapping {} to {}'.format(
                 original_common_name,output_common_name,))
     # ...for each category
-    ##%% Remap all category labels
+    #%% Remap all category labels
     assert len(set(output_taxon_string_to_category_id.keys())) == \
-           len(set(output_taxon_string_to_category_id.values()))
+           len(set(output_taxon_string_to_category_id.values())), \
+           'Category ID/value non-uniqueness error'
     output_category_id_to_taxon_string = \
         invert_dictionary(output_taxon_string_to_category_id)