megadetector 5.0.24__py3-none-any.whl → 5.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -27,8 +27,12 @@ from tqdm import tqdm
 
 from megadetector.utils.path_utils import insert_before_extension
 from megadetector.utils.ct_utils import split_list_into_n_chunks
+from megadetector.utils.ct_utils import round_floats_in_nested_dict
+from megadetector.utils.ct_utils import is_list_sorted
 from megadetector.utils.ct_utils import invert_dictionary
 from megadetector.utils.ct_utils import sort_list_of_dicts_by_key
+from megadetector.utils.ct_utils import sort_dictionary_by_value
+from megadetector.utils.ct_utils import sort_dictionary_by_key
 from megadetector.utils.path_utils import find_images
 from megadetector.postprocessing.validate_batch_results import \
     validate_batch_results, ValidateBatchResultsOptions
@@ -58,10 +62,28 @@ def is_valid_prediction_string(s):
     Returns:
         bool: True if this looks more or less like a WI prediction string
     """
-
+
+    # Note to self... don't get tempted to remove spaces here; spaces are used
+    # to indicate subspecies.
     return isinstance(s,str) and (len(s.split(';')) == 7) and (s == s.lower())
 
 
+def is_valid_taxonomy_string(s):
+    """
+    Determine whether [s] is a valid 5-token WI taxonomy string. Taxonomy strings look like:
+
+        'mammalia;rodentia;;;'
+        'mammalia;carnivora;canidae;canis;lupus dingo'
+
+    Args:
+        s (str): the string to be tested for validity
+
+    Returns:
+        bool: True if this looks more or less like a WI taxonomy string
+    """
+    return isinstance(s,str) and (len(s.split(';')) == 5) and (s == s.lower())
+
+
 def wi_result_to_prediction_string(r):
     """
     Convert the dict [r] - typically loaded from a row in a downloaded .csv file - to
@@ -469,10 +491,14 @@ sample_update_payload = {
 
 blank_prediction_string = 'f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank'
 no_cv_result_prediction_string = 'f2efdae9-efb8-48fb-8a91-eccf79ab4ffb;no cv result;no cv result;no cv result;no cv result;no cv result;no cv result'
-rodent_prediction_string = '90d950db-2106-4bd9-a4c1-777604c3eada;mammalia;rodentia;;;;rodent'
-mammal_prediction_string = 'f2d233e3-80e3-433d-9687-e29ecc7a467a;mammalia;;;;;mammal'
 animal_prediction_string = '1f689929-883d-4dae-958c-3d57ab5b6c16;;;;;;animal'
 human_prediction_string = '990ae9dd-7a59-4344-afcb-1b7b21368000;mammalia;primates;hominidae;homo;sapiens;human'
+vehicle_prediction_string = 'e2895ed5-780b-48f6-8a11-9e27cb594511;;;;;;vehicle'
+
+non_taxonomic_prediction_strings = [blank_prediction_string,
+                                    no_cv_result_prediction_string,
+                                    animal_prediction_string,
+                                    vehicle_prediction_string]
 
 process_cv_response_url = 'https://placeholder'
 
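All of these constants use the 7-token prediction-string layout, GUID;class;order;family;genus;species;common name; the non-taxonomic entries simply leave the five taxonomic fields empty. A quick illustration (not from the package itself) of how prediction strings relate to the 5-token taxonomy strings checked by `is_valid_taxonomy_string`:

```python
# Illustration only; both example strings are defined in the diff above
human_prediction_string = \
    '990ae9dd-7a59-4344-afcb-1b7b21368000;mammalia;primates;hominidae;homo;sapiens;human'
vehicle_prediction_string = 'e2895ed5-780b-48f6-8a11-9e27cb594511;;;;;;vehicle'

# GUID;class;order;family;genus;species;common name
for s in [human_prediction_string, vehicle_prediction_string]:
    assert len(s.split(';')) == 7

# Dropping the GUID and common name yields the 5-token taxonomy string
taxonomy_string = ';'.join(human_prediction_string.split(';')[1:-1])
assert taxonomy_string == 'mammalia;primates;hominidae;homo;sapiens'
assert len(taxonomy_string.split(';')) == 5
```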
@@ -870,6 +896,7 @@ def get_kingdom(prediction_string):
         str: the kingdom field from the input string
     """
     tokens = prediction_string.split(';')
+    assert is_valid_prediction_string(prediction_string)
     return tokens[1]
 
 
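The new assert rejects malformed input before the field is returned; for example, with the human prediction string defined earlier in this file, the returned token (the one immediately after the GUID) is 'mammalia':

```python
# Illustration only, using the human prediction string defined in this file
s = '990ae9dd-7a59-4344-afcb-1b7b21368000;mammalia;primates;hominidae;homo;sapiens;human'
assert s.split(';')[1] == 'mammalia'
```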
@@ -912,7 +939,10 @@ def is_animal_classification(prediction_string):
     return True
 
 
-def generate_md_results_from_predictions_json(predictions_json_file,md_results_file,base_folder=None):
+def generate_md_results_from_predictions_json(predictions_json_file,
+                                              md_results_file,
+                                              base_folder=None,
+                                              max_decimals=5):
     """
     Generate an MD-formatted .json file from a predictions.json file. Typically,
     MD results files use relative paths, and predictions.json files use absolute paths, so
@@ -921,21 +951,38 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
     Currently just applies the top classification category to every detection. If the top classification
     is "blank", writes an empty detection list.
 
-    wi_to_md.py is a command-line driver for this function.
+    speciesnet_to_md.py is a command-line driver for this function.
 
     Args:
-        predictions_json_file (str): path to a predictions.json file
+        predictions_json_file (str or dict): path to a predictions.json file, or a dict
         md_results_file (str): path to which we should write an MD-formatted .json file
-        base_folder (str, optional): leading string to remove from each path in the predictions.json file
+        base_folder (str, optional): leading string to remove from each path in the
+            predictions.json file
+        max_decimals (int, optional): number of decimal places to which we should round
+            all values
     """
 
     # Read predictions file
-    with open(predictions_json_file,'r') as f:
-        predictions = json.load(f)
+    if isinstance(predictions_json_file,str):
+        with open(predictions_json_file,'r') as f:
+            predictions = json.load(f)
+    else:
+        assert isinstance(predictions_json_file,dict)
+        predictions = predictions_json_file
+
+    # Round floating-point values (confidence scores, coordinates) to a
+    # reasonable number of decimal places
+    if max_decimals is not None and max_decimals > 0:
+        round_floats_in_nested_dict(predictions)
+
     predictions = predictions['predictions']
     assert isinstance(predictions,list)
 
-    from megadetector.utils.ct_utils import is_list_sorted
+    # Convert backslashes to forward slashes in both filenames and the base folder string
+    for im in predictions:
+        im['filepath'] = im['filepath'].replace('\\','/')
+    if base_folder is not None:
+        base_folder = base_folder.replace('\\','/')
 
     detection_category_id_to_name = {}
     classification_category_name_to_id = {}
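Two behaviors change here: the function now accepts an already-loaded dict in place of a path, and it rounds floating-point values before writing. A self-contained sketch of both (the `round_floats_in_nested_dict` stand-in below is an assumption about the `ct_utils` helper, not its actual implementation, and the tiny dict is schematic):

```python
import json

def load_predictions(predictions_json_file):
    # Mirrors the new input handling: accept a path or an already-loaded dict
    if isinstance(predictions_json_file, str):
        with open(predictions_json_file, 'r') as f:
            return json.load(f)
    assert isinstance(predictions_json_file, dict)
    return predictions_json_file

def round_floats_in_nested_dict(obj, max_decimals=5):
    # Hypothetical stand-in: recursively round every float in a nested
    # dict/list structure; the real helper's behavior may differ
    if isinstance(obj, float):
        return round(obj, max_decimals)
    if isinstance(obj, dict):
        for k in obj:
            obj[k] = round_floats_in_nested_dict(obj[k], max_decimals)
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            obj[i] = round_floats_in_nested_dict(v, max_decimals)
    return obj

d = load_predictions({'predictions': [{'filepath': 'cam01\\img001.jpg',
                                       'score': 0.123456789}]})
round_floats_in_nested_dict(d)
assert d['predictions'][0]['score'] == 0.12346
```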
@@ -948,6 +995,8 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
     # Create the output images list
     images_out = []
 
+    base_folder_replacements = 0
+
     # im_in = predictions[0]
     for im_in in predictions:
 
@@ -957,6 +1006,7 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
         fn = im_in['filepath']
         if base_folder is not None:
             if fn.startswith(base_folder):
+                base_folder_replacements += 1
                 fn = fn.replace(base_folder,'',1)
 
         im_out['file'] = fn
@@ -1056,6 +1106,11 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
 
     # ...for each image
 
+    if base_folder is not None:
+        if base_folder_replacements == 0:
+            print('Warning: you supplied {} as the base folder, but I made zero replacements'.format(
+                base_folder))
+
     # Fix the 'unknown' category
 
     if len(all_unknown_detections) > 0:
@@ -1075,7 +1130,8 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
 
     # Prepare friendly classification names
 
-    classification_category_descriptions = invert_dictionary(classification_category_name_to_id)
+    classification_category_descriptions = \
+        invert_dictionary(classification_category_name_to_id)
     classification_categories_out = {}
     for category_id in classification_category_descriptions.keys():
         category_name = classification_category_descriptions[category_id].split(';')[-1]
@@ -1105,7 +1161,9 @@ def generate_md_results_from_predictions_json(predictions_json_file,md_results_f
 # ...def generate_md_results_from_predictions_json(...)
 
 
-def generate_predictions_json_from_md_results(md_results_file,predictions_json_file,base_folder=None):
+def generate_predictions_json_from_md_results(md_results_file,
+                                              predictions_json_file,
+                                              base_folder=None):
     """
     Generate a predictions.json file from the MD-formatted .json file [md_results_file]. Typically,
     MD results files use relative paths, and predictions.json files use absolute paths, so
@@ -1165,13 +1223,16 @@ def generate_predictions_json_from_md_results(md_results_file,predictions_json_f
 
 # ...def generate_predictions_json_from_md_results(...)
 
+default_tokens_to_ignore = ['$RECYCLE.BIN']
 
 def generate_instances_json_from_folder(folder,
                                         country=None,
+                                        admin1_region=None,
                                         lat=None,
                                         lon=None,
                                         output_file=None,
-                                        filename_replacements=None):
+                                        filename_replacements=None,
+                                        tokens_to_ignore=default_tokens_to_ignore):
     """
     Generate an instances.json record that contains all images in [folder], optionally
     including location information, in a format suitable for run_model.py. Optionally writes
@@ -1186,6 +1247,8 @@ def generate_instances_json_from_folder(folder,
         filename_replacements (dict, optional): str --> str dict indicating filename substrings
             that should be replaced with other strings. Replacement occurs *after* converting
             backslashes to forward slashes.
+        tokens_to_ignore (list, optional): ignore any images with these tokens in their
+            names, typically used to avoid $RECYCLE.BIN. Can be None.
 
     Returns:
         dict: dict with at least the field "instances"
@@ -1195,6 +1258,13 @@ def generate_instances_json_from_folder(folder,
 
     image_files_abs = find_images(folder,recursive=True,return_relative_paths=False)
 
+    if tokens_to_ignore is not None:
+        n_images_before_ignore_tokens = len(image_files_abs)
+        for token in tokens_to_ignore:
+            image_files_abs = [fn for fn in image_files_abs if token not in fn]
+        print('After ignoring {} tokens, kept {} of {} images'.format(
+            len(tokens_to_ignore),len(image_files_abs),n_images_before_ignore_tokens))
+
     instances = []
 
     # image_fn_abs = image_files_abs[0]
@@ -1206,6 +1276,8 @@ def generate_instances_json_from_folder(folder,
                 instance['filepath'] = instance['filepath'].replace(s,filename_replacements[s])
         if country is not None:
             instance['country'] = country
+        if admin1_region is not None:
+            instance['admin1_region'] = admin1_region
         if lat is not None:
            assert lon is not None, 'Latitude provided without longitude'
            instance['latitude'] = lat
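For reference, the new parameters compose like the following hypothetical call (the folder and output paths are invented, and 'CA' is an example admin1_region code, i.e. a US state):

```python
# Hypothetical usage sketch; paths are invented
from megadetector.utils.wi_utils import generate_instances_json_from_folder

instances = generate_instances_json_from_folder(
    folder='/data/camera-traps/2024-08',
    country='USA',
    admin1_region='CA',
    output_file='instances.json',
    tokens_to_ignore=['$RECYCLE.BIN'])

print('Generated {} instances'.format(len(instances['instances'])))
```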
@@ -1226,14 +1298,243 @@ def generate_instances_json_from_folder(folder,
 # ...def generate_instances_json_from_folder(...)
 
 
-#%% Functions related to geofencing and taxonomy mapping
+def split_instances_into_n_batches(instances_json,n_batches,output_files=None):
+    """
+    Given an instances.json file, split it into batches of equal size.
+
+    Args:
+        instances_json (str): input .json file in instances.json format
+        n_batches (int): number of new files to generate
+        output_files (list, optional): output .json files for each
+            batch. If supplied, should have length [n_batches]. If not
+            supplied, filenames will be generated based on [instances_json].
+
+    Returns:
+        list: list of output files that were written; identical to [output_files]
+        if it was supplied as input.
+    """
+
+    with open(instances_json,'r') as f:
+        instances = json.load(f)
+    assert isinstance(instances,dict) and 'instances' in instances
+    instances = instances['instances']
+
+    if output_files is not None:
+        assert len(output_files) == n_batches, \
+            'Expected {} output files, received {}'.format(
+                n_batches,len(output_files))
+    else:
+        output_files = []
+        for i_batch in range(0,n_batches):
+            batch_string = 'batch_{}'.format(str(i_batch).zfill(3))
+            output_files.append(insert_before_extension(instances_json,batch_string))
+
+    batches = split_list_into_n_chunks(instances, n_batches)
+
+    for i_batch,batch in enumerate(batches):
+        batch_dict = {'instances':batch}
+        with open(output_files[i_batch],'w') as f:
+            json.dump(batch_dict,f,indent=1)
+
+    print('Wrote {} batches to file'.format(n_batches))
+
+    return output_files
+
+
+def merge_prediction_json_files(input_prediction_files,output_prediction_file):
+    """
+    Merge all predictions.json files in [input_prediction_files] into a single .json file.
+
+    Args:
+        input_prediction_files (list): list of predictions.json files to merge
+        output_prediction_file (str): output .json file
+    """
+
+    predictions = []
+    image_filenames_processed = set()
+
+    # input_json_fn = input_prediction_files[0]
+    for input_json_fn in tqdm(input_prediction_files):
+
+        assert os.path.isfile(input_json_fn), \
+            'Could not find prediction file {}'.format(input_json_fn)
+        with open(input_json_fn,'r') as f:
+            results_this_file = json.load(f)
+        assert isinstance(results_this_file,dict)
+        predictions_this_file = results_this_file['predictions']
+        for prediction in predictions_this_file:
+            image_fn = prediction['filepath']
+            assert image_fn not in image_filenames_processed
+            image_filenames_processed.add(image_fn)
+        predictions.extend(predictions_this_file)
+
+    output_dict = {'predictions':predictions}
+
+    os.makedirs(os.path.dirname(output_prediction_file),exist_ok=True)
+    with open(output_prediction_file,'w') as f:
+        json.dump(output_dict,f,indent=1)
+
+# ...def merge_prediction_json_files(...)
+
+
+def validate_predictions_file(fn,instances=None,verbose=True):
+    """
+    Validate the predictions.json file [fn].
+
+    Args:
+        fn (str): a .json file in predictions.json (SpeciesNet) format
+        instances (str or list, optional): a folder, instances.json file,
+            or dict loaded from an instances.json file. If supplied, this
+            function will verify that [fn] contains the same number of
+            images as [instances].
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: the contents of [fn]
+    """
+
+    with open(fn,'r') as f:
+        d = json.load(f)
+    predictions = d['predictions']
+
+    failures = []
+
+    for im in predictions:
+        if 'failures' in im:
+            failures.append(im)
+
+    if verbose:
+        print('Read detector results for {} images, with {} failure(s)'.format(
+            len(d['predictions']),len(failures)))
+
+    if instances is not None:
+
+        if isinstance(instances,str):
+            if os.path.isdir(instances):
+                instances = generate_instances_json_from_folder(folder=instances)
+            elif os.path.isfile(instances):
+                with open(instances,'r') as f:
+                    instances = json.load(f)
+            else:
+                raise ValueError('Could not find instances file/folder {}'.format(
+                    instances))
+        assert isinstance(instances,dict)
+        assert 'instances' in instances
+        instances = instances['instances']
+        if verbose:
+            print('Expected results for {} files'.format(len(instances)))
+        assert len(instances) == len(predictions), \
+            '{} instances expected, {} found'.format(
+                len(instances),len(predictions))
+
+        expected_files = set([instance['filepath'] for instance in instances])
+        found_files = set([prediction['filepath'] for prediction in predictions])
+        assert expected_files == found_files
+
+    # ...if a list of instances was supplied
+
+    return d
+
+# ...def validate_predictions_file(...)
+
+
+def find_geofence_adjustments(ensemble_json_file,use_latin_names=False):
+    """
+    Count the number of instances of each unique change made by the geofence.
+
+    Args:
+        ensemble_json_file (str): SpeciesNet-formatted .json file produced
+            by the full ensemble.
+        use_latin_names (bool, optional): return a mapping using binomial names
+            rather than common names.
+
+    Returns:
+        dict: maps strings that look like "puma,felidae family" to integers,
+        where that entry would indicate the number of times that "puma" was
+        predicted, but mapped to family level by the geofence. Sorted in
+        descending order by count.
+    """
+
+    ensemble_results = validate_predictions_file(ensemble_json_file)
+
+    assert isinstance(ensemble_results,dict)
+    predictions = ensemble_results['predictions']
+
+    # Maps comma-separated pairs of common names (or binomial names) to
+    # the number of times that transition (first --> second) happened
+    rollup_pair_to_count = defaultdict(int)
+
+    # prediction = predictions[0]
+    for prediction in tqdm(predictions):
+
+        if 'failures' in prediction and \
+           prediction['failures'] is not None and \
+           len(prediction['failures']) > 0:
+            continue
+
+        assert 'prediction_source' in prediction, \
+            'Prediction present without [prediction_source] field, are you sure this ' + \
+            'is an ensemble output file?'
+
+        if 'geofence' in prediction['prediction_source']:
+
+            classification_taxonomy_string = \
+                prediction['classifications']['classes'][0]
+            prediction_taxonomy_string = prediction['prediction']
+            assert is_valid_prediction_string(classification_taxonomy_string)
+            assert is_valid_prediction_string(prediction_taxonomy_string)
+
+            # Typical examples:
+            # '86f5b978-4f30-40cc-bd08-be9e3fba27a0;mammalia;rodentia;sciuridae;sciurus;carolinensis;eastern gray squirrel'
+            # 'e4d1e892-0e4b-475a-a8ac-b5c3502e0d55;mammalia;rodentia;sciuridae;;;sciuridae family'
+            classification_common_name = classification_taxonomy_string.split(';')[-1]
+            prediction_common_name = prediction_taxonomy_string.split(';')[-1]
+            classification_binomial_name = classification_taxonomy_string.split(';')[-2]
+            prediction_binomial_name = prediction_taxonomy_string.split(';')[-2]
+
+            input_name = classification_binomial_name if use_latin_names else \
+                classification_common_name
+            output_name = prediction_binomial_name if use_latin_names else \
+                prediction_common_name
+
+            rollup_pair = input_name.strip() + ',' + output_name.strip()
+            rollup_pair_to_count[rollup_pair] += 1
+
+        # ...if we made a geofencing change
+
+    # ...for each prediction
+
+    rollup_pair_to_count = sort_dictionary_by_value(rollup_pair_to_count,reverse=True)
+
+    return rollup_pair_to_count
+
+# ...def find_geofence_adjustments(...)
+
+
+#%% Module-level globals related to taxonomy mapping and geofencing
 
 # This maps a taxonomy string (e.g. mammalia;cetartiodactyla;cervidae;odocoileus;virginianus) to
 # a dict with keys taxon_id, common_name, kingdom, phylum, class, order, family, genus, species
 taxonomy_string_to_taxonomy_info = None
+
+# Maps a binomial name (possibly three tokens, if it's a subspecies) to the same dict
+# described above.
 binomial_name_to_taxonomy_info = None
+
+# Maps a common name to the same dict described above
 common_name_to_taxonomy_info = None
 
+# Dict mapping 5-token semicolon-delimited taxonomy strings to geofencing rules
+taxonomy_string_to_geofencing_rules = None
+
+# Maps lower-case country names to upper-case country codes
+country_to_country_code = None
+
+# Maps upper-case country codes to lower-case country names
+country_code_to_country = None
+
+
+#%% Functions related to geofencing and taxonomy mapping
+
 def taxonomy_info_to_taxonomy_string(taxonomy_info):
     """
     Convert a taxonomy record in dict format to a semicolon-delimited string
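Together with `validate_predictions_file`, the two helpers added above support a split/run/merge workflow: divide an instances.json file into chunks, run SpeciesNet on each chunk independently, then merge and sanity-check the results. A sketch under invented filenames (the model-invocation step itself is outside this module):

```python
# Hypothetical workflow sketch; all filenames are invented
from megadetector.utils.wi_utils import split_instances_into_n_batches
from megadetector.utils.wi_utils import merge_prediction_json_files
from megadetector.utils.wi_utils import validate_predictions_file

batch_files = split_instances_into_n_batches('instances.json', n_batches=4)

# ...run SpeciesNet separately on each batch, producing one predictions
# file per batch (e.g. predictions_batch_000.json)...
prediction_files = ['predictions_batch_{}.json'.format(str(i).zfill(3))
                    for i in range(4)]

merge_prediction_json_files(prediction_files, 'merged/predictions.json')

# Confirm that the merged file covers exactly the original instances
validate_predictions_file('merged/predictions.json', instances='instances.json')
```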
@@ -1258,12 +1559,16 @@ def initialize_taxonomy_info(taxonomy_file,force_init=False,encoding='cp1252'):
     [common_name_to_taxonomy_info].
 
     Args:
-        taxonomy_file (str): .json file containing WI taxonomy information
+        taxonomy_file (str): .json file containing mappings from the short taxonomy strings
+            to the longer strings with GUID and common name, see example below.
         force_init (bool, optional): if the output dicts already exist, should we
             re-initialize anyway?
         encoding (str, optional): character encoding to use when opening the .json file
     """
 
+    if encoding is None:
+        encoding = 'cp1252'
+
     global taxonomy_string_to_taxonomy_info
     global binomial_name_to_taxonomy_info
     global common_name_to_taxonomy_info
@@ -1326,22 +1631,159 @@ def initialize_taxonomy_info(taxonomy_file,force_init=False,encoding='cp1252'):
                 # print('Warning: no binomial name for {}'.format(taxonomy_string))
                 pass
             else:
+                # strip(), but don't remove spaces from the species name;
+                # subspecies are separated with a space, e.g. canis;lupus dingo
                 binomial_name = tokens[4].strip() + ' ' + tokens[5].strip()
                 binomial_name_to_taxonomy_info[binomial_name] = taxon_info
+
+    print('Created {} records in taxonomy_string_to_taxonomy_info'.format(
+        len(taxonomy_string_to_taxonomy_info)))
 
 # ...def initialize_taxonomy_info(...)
 
 
-#%% Geofencing functions
-
-# Dict mapping semicolon-delimited taxonomy strings to geofencing rules
-taxonomy_string_to_geofencing_rules = None
+def _parse_code_list(codes):
+    """
+    Turn a list of country or state codes in string, delimited string, or list format
+    into a list. Also does basic validity checking.
+    """
+
+    if not isinstance(codes,list):
+
+        assert isinstance(codes,str)
+
+        codes = codes.strip()
+
+        # This is just a single code
+        if ',' not in codes:
+            codes = [codes]
+        else:
+            codes = codes.split(',')
+            codes = [c.strip() for c in codes]
+
+    assert isinstance(codes,list)
+
+    codes = [c.upper().strip() for c in codes]
+
+    for c in codes:
+        assert len(c) in (2,3)
+
+    return codes
+
+
+def _generate_csv_rows_to_block_all_countries_except(
+        species_string,
+        block_except_list):
+    """
+    Generate rows in the format expected by geofence_fixes.csv, representing a list of
+    allow and block rules to block all countries currently allowed for this species
+    except [block_except_list], and add allow rules for those countries.
+    """
+
+    assert is_valid_taxonomy_string(species_string), \
+        '{} is not a valid taxonomy string'.format(species_string)
+
+    global taxonomy_string_to_taxonomy_info
+    global binomial_name_to_taxonomy_info
+    global common_name_to_taxonomy_info
+
+    assert taxonomy_string_to_geofencing_rules is not None, \
+        'Initialize geofencing prior to species lookup'
+    assert taxonomy_string_to_taxonomy_info is not None, \
+        'Initialize taxonomy lookup prior to species lookup'
+
+    geofencing_rules_this_species = \
+        taxonomy_string_to_geofencing_rules[species_string]
+
+    allowed_countries = []
+    if 'allow' in geofencing_rules_this_species:
+        allowed_countries.extend(geofencing_rules_this_species['allow'])
+
+    blocked_countries = []
+    if 'block' in geofencing_rules_this_species:
+        blocked_countries.extend(geofencing_rules_this_species['block'])
+
+    block_except_list = _parse_code_list(block_except_list)
+
+    countries_to_block = []
+    countries_to_allow = []
+
+    # country = allowed_countries[0]
+    for country in allowed_countries:
+        if country not in block_except_list and country not in blocked_countries:
+            countries_to_block.append(country)
+
+    for country in block_except_list:
+        if country in blocked_countries:
+            raise ValueError("I can't allow a country that has already been blocked")
+        if country not in allowed_countries:
+            countries_to_allow.append(country)
+
+    rows = generate_csv_rows_for_species(species_string,
+                                         allow_countries=countries_to_allow,
+                                         block_countries=countries_to_block)
+
+    return rows
+
+# ...def _generate_csv_rows_to_block_all_countries_except(...)
+
+
+def generate_csv_rows_for_species(species_string,
+                                  allow_countries=None,
+                                  block_countries=None,
+                                  allow_states=None,
+                                  block_states=None,
+                                  blockexcept_countries=None):
+    """
+    Generate rows in the format expected by geofence_fixes.csv, representing a list of
+    allow and/or block rules for the specified species and countries/states. Does not check
+    that the rules make sense; e.g. nothing will stop you in this function from both allowing
+    and blocking a country.
+
+    Args:
+        species_string (str): string in semicolon-delimited WI taxonomy format
+        allow_countries (optional, list or str): a three-letter country code, list of
+            country codes, or comma-separated list of country codes to allow
+        block_countries (optional, list or str): a three-letter country code, list of
+            country codes, or comma-separated list of country codes to block
+        allow_states (optional, list or str): a two-letter state code, list of
+            state codes, or comma-separated list of state codes to allow
+        block_states (optional, list or str): a two-letter state code, list of
+            state codes, or comma-separated list of state codes to block
+
+    Returns:
+        list of str: lines ready to be pasted into geofence_fixes.csv
+    """
+
+    assert is_valid_taxonomy_string(species_string), \
+        '{} is not a valid taxonomy string'.format(species_string)
+
+    lines = []
+
+    if allow_countries is not None:
+        allow_countries = _parse_code_list(allow_countries)
+        for country in allow_countries:
+            lines.append(species_string + ',allow,' + country + ',')
+
+    if block_countries is not None:
+        block_countries = _parse_code_list(block_countries)
+        for country in block_countries:
+            lines.append(species_string + ',block,' + country + ',')
+
+    if allow_states is not None:
+        allow_states = _parse_code_list(allow_states)
+        for state in allow_states:
+            lines.append(species_string + ',allow,USA,' + state)
+
+    if block_states is not None:
+        block_states = _parse_code_list(block_states)
+        for state in block_states:
+            lines.append(species_string + ',block,USA,' + state)
+
+    return lines
 
-# Maps lower-case country names to upper-case country codes
-country_to_country_code = None
+# ...def generate_csv_rows_for_species(...)
 
-# Maps upper-case country codes to lower-case country names
-country_code_to_country = None
 
 def initialize_geofencing(geofencing_file,country_code_file,force_init=False):
     """
@@ -1351,10 +1793,13 @@ def initialize_geofencing(geofencing_file,country_code_file,force_init=False):
 
     Args:
         geofencing_file (str): .json file with geofencing rules
-        country_code_file (str): .csv file with country code mappings
+        country_code_file (str): .csv file with country code mappings, in columns
+            called "name" and "alpha-3", e.g. from
+            https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
         force_init (bool, optional): if the output dicts already exist, should we
             re-initialize anyway?
     """
+
     global taxonomy_string_to_geofencing_rules
     global country_to_country_code
     global country_code_to_country
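To make the geofence_fixes.csv format concrete: each row emitted by `generate_csv_rows_for_species` (added above) is species_string,allow|block,country[,state]. A sketch using the cervid taxonomy string that appears elsewhere in this file; the country and state choices are illustrative:

```python
# Illustration only; expected output shown in the comments
from megadetector.utils.wi_utils import generate_csv_rows_for_species

species_string = 'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus'
rows = generate_csv_rows_for_species(species_string,
                                     allow_countries='USA, CAN',
                                     block_states='HI')

# rows is now:
# ['mammalia;cetartiodactyla;cervidae;odocoileus;virginianus,allow,USA,',
#  'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus,allow,CAN,',
#  'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus,block,USA,HI']
```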
@@ -1427,35 +1872,21 @@ def initialize_geofencing(geofencing_file,country_code_file,force_init=False):
 # ...def initialize_geofencing(...)
 
 
-def species_allowed_in_country(species,country,state=None,return_status=False):
+def _species_string_to_canonical_species_string(species):
+    """
+    Convert a string that may be a 5-token species string, a binomial name,
+    or a common name into a 5-token species string.
     """
-    Determines whether [species] is allowed in [country], according to
-    already-initialized geofencing rules.
 
-    Args:
-        species (str): can be a common name, a binomial name, or a species string
-        country (str): country name or three-letter code
-        state (str, optional): two-letter US state code
-        return_status (bool, optional): by default, this function returns a bool;
-            if you want to know *why* [species] is allowed/not allowed, settings
-            return_status to True will return additional information.
+    global taxonomy_string_to_taxonomy_info
+    global binomial_name_to_taxonomy_info
+    global common_name_to_taxonomy_info
 
-    Returns:
-        bool or str: typically returns True if [species] is allowed in [country], else
-        False. Returns a more detailed string if return_status is set.
-    """
-
     assert taxonomy_string_to_geofencing_rules is not None, \
         'Initialize geofencing prior to species lookup'
     assert taxonomy_string_to_taxonomy_info is not None, \
         'Initialize taxonomy lookup prior to species lookup'
-
-    # species = 'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus'
-    # species = 'didelphis marsupialis'
-    # country = 'Guatemala'
-
-    # species = 'common opossum'
-
+
     species = species.lower()
 
     # Turn "species" into a taxonomy string
@@ -1463,8 +1894,8 @@ def species_allowed_in_country(species,country,state=None,return_status=False):
     # If this is already a taxonomy string...
     if len(species.split(';')) == 5:
         pass
-    # If this is a binomial name...
-    elif len(species.split(' ')) == 2 and (species in binomial_name_to_taxonomy_info):
+    # If this is a binomial name (which may include a subspecies)...
+    elif (len(species.split(' ')) in (2,3)) and (species in binomial_name_to_taxonomy_info):
         taxonomy_info = binomial_name_to_taxonomy_info[species]
         taxonomy_string = taxonomy_info_to_taxonomy_string(taxonomy_info)
     # If this is a common name...
@@ -1474,6 +1905,37 @@ def species_allowed_in_country(species,country,state=None,return_status=False):
     else:
         raise ValueError('Could not find taxonomic information for {}'.format(species))
 
+    return taxonomy_string
+
+
+def species_allowed_in_country(species,country,state=None,return_status=False):
+    """
+    Determines whether [species] is allowed in [country], according to
+    already-initialized geofencing rules.
+
+    Args:
+        species (str): can be a common name, a binomial name, or a species string
+        country (str): country name or three-letter code
+        state (str, optional): two-letter US state code
+        return_status (bool, optional): by default, this function returns a bool;
+            if you want to know *why* [species] is allowed/not allowed, setting
+            return_status to True will return additional information.
+
+    Returns:
+        bool or str: typically returns True if [species] is allowed in [country], else
+        False. Returns a more detailed string if return_status is set.
+    """
+
+    global taxonomy_string_to_taxonomy_info
+    global binomial_name_to_taxonomy_info
+    global common_name_to_taxonomy_info
+
+    assert taxonomy_string_to_geofencing_rules is not None, \
+        'Initialize geofencing prior to species lookup'
+    assert taxonomy_string_to_taxonomy_info is not None, \
+        'Initialize taxonomy lookup prior to species lookup'
+
+    taxonomy_string = _species_string_to_canonical_species_string(species)
 
     # Normalize [state]
 
@@ -1515,6 +1977,8 @@ def species_allowed_in_country(species,country,state=None,return_status=False):
         blocked_countries = list(geofencing_rules_this_species['block'])
 
     status = None
+
+    # The convention is that block rules win over allow rules
     if country_code in blocked_countries:
         status = 'blocked'
     elif country_code in allowed_countries:
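The new comment documents the precedence the surrounding branches implement; distilled to just that rule (the real function also normalizes country names and handles state-level rules):

```python
# Distilled sketch of the precedence logic; not the full implementation
def _resolve(country_code, allowed_countries, blocked_countries):
    if country_code in blocked_countries:
        return 'blocked'
    elif country_code in allowed_countries:
        return 'allowed'
    return None  # no applicable rule

# Block rules win over allow rules when both match
assert _resolve('USA', ['USA'], ['USA']) == 'blocked'
assert _resolve('CAN', ['CAN'], []) == 'allowed'
```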
@@ -1565,12 +2029,459 @@ def species_allowed_in_country(species,country,state=None,return_status=False):
 # ...def species_allowed_in_country(...)
 
 
+def restrict_to_taxa_list(taxa_list,
+                          speciesnet_taxonomy_file,
+                          input_file,
+                          output_file,
+                          allow_walk_down=False):
+    """
+    Given a prediction file in MD .json format, likely without having had
+    a geofence applied, apply a custom taxa list.
+
+    Args:
+        taxa_list (str or list): list of latin names, or a text file containing
+            a list of latin names. Optionally may contain a second (comma-delimited)
+            column containing common names, used only for debugging. Latin names
+            must exist in the SpeciesNet taxonomy.
+        speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for model
+            release (with 7-token taxonomy entries)
+        input_file (str): MD-formatted .json file to which the taxa list should be applied
+        output_file (str): .json file to write, in MD format
+        allow_walk_down (bool, optional): should we walk down the taxonomy tree
+            when making mappings if a parent has only a single allowable child?
+            For example, if only a single felid species is allowed, should other
+            felid predictions be mapped to that species, as opposed to being mapped
+            to the family?
+    """
+
+    ##%% Read target taxa list
+
+    if isinstance(taxa_list,str):
+        assert os.path.isfile(taxa_list), \
+            'Could not find taxa list file {}'.format(taxa_list)
+        with open(taxa_list,'r') as f:
+            taxa_list = f.readlines()
+
+    taxa_list = [s.strip().lower() for s in taxa_list]
+    taxa_list = [s for s in taxa_list if len(s) > 0]
+
+    target_latin_to_common = {}
+    for s in taxa_list:
+        if s.strip().startswith('#'):
+            continue
+        tokens = s.split(',')
+        assert len(tokens) <= 2
+        binomial_name = tokens[0]
+        assert len(binomial_name.split(' ')) in (1,2,3), \
+            'Illegal binomial name in species list: {}'.format(binomial_name)
+        if len(tokens) > 1:
+            common_name = tokens[1].strip().lower()
+        else:
+            common_name = None
+        assert binomial_name not in target_latin_to_common
+        target_latin_to_common[binomial_name] = common_name
+
+
+    ##%% Read taxonomy file
+
+    with open(speciesnet_taxonomy_file,'r') as f:
+        speciesnet_taxonomy_list = f.readlines()
+    speciesnet_taxonomy_list = [s.strip() for s in \
+        speciesnet_taxonomy_list if len(s.strip()) > 0]
+
+    # Maps the latin name of every taxon to the corresponding full taxon string
+    #
+    # For species, the key is a binomial name
+    speciesnet_latin_name_to_taxon_string = {}
+    speciesnet_common_name_to_taxon_string = {}
+
+    def _insert_taxonomy_string(s):
+
+        tokens = s.split(';')
+        assert len(tokens) == 7
+
+        guid = tokens[0] # noqa
+        class_name = tokens[1]
+        order = tokens[2]
+        family = tokens[3]
+        genus = tokens[4]
+        species = tokens[5]
+        common_name = tokens[6]
+
+        if len(class_name) == 0:
+            assert common_name in ('animal','vehicle','blank')
+            return
+
+        if len(species) > 0:
+            assert all([len(s) > 0 for s in [genus,family,order]])
+            binomial_name = genus + ' ' + species
+            if binomial_name not in speciesnet_latin_name_to_taxon_string:
+                speciesnet_latin_name_to_taxon_string[binomial_name] = s
+        elif len(genus) > 0:
+            assert all([len(s) > 0 for s in [family,order]])
+            if genus not in speciesnet_latin_name_to_taxon_string:
+                speciesnet_latin_name_to_taxon_string[genus] = s
+        elif len(family) > 0:
+            assert len(order) > 0
+            if family not in speciesnet_latin_name_to_taxon_string:
+                speciesnet_latin_name_to_taxon_string[family] = s
+        elif len(order) > 0:
+            if order not in speciesnet_latin_name_to_taxon_string:
+                speciesnet_latin_name_to_taxon_string[order] = s
+        else:
+            if class_name not in speciesnet_latin_name_to_taxon_string:
+                speciesnet_latin_name_to_taxon_string[class_name] = s
+
+        if len(common_name) > 0:
+            if common_name not in speciesnet_common_name_to_taxon_string:
+                speciesnet_common_name_to_taxon_string[common_name] = s
+
+    for s in speciesnet_taxonomy_list:
+
+        _insert_taxonomy_string(s)
+
+
+    ##%% Make sure all parent taxa are represented in the taxonomy
+
+    # In theory any taxon that appears as the parent of another taxon should
+    # also be in the taxonomy, but this isn't always true, so we fix it here.
+
+    new_taxon_string_to_missing_tokens = defaultdict(list)
+
+    # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
+    for latin_name in speciesnet_latin_name_to_taxon_string.keys():
+
+        if 'no cv result' in latin_name:
+            continue
+
+        taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+        tokens = taxon_string.split(';')
+
+        # Don't process GUID, species, or common name
+        # i_token = 6
+        for i_token in range(1,len(tokens)-2):
+
+            test_token = tokens[i_token]
+            if len(test_token) == 0:
+                continue
+
+            # Do we need to make up a taxon for this token?
+            if test_token not in speciesnet_latin_name_to_taxon_string:
+
+                new_tokens = [''] * 7
+                new_tokens[0] = 'fake_guid'
+                for i_copy_token in range(1,i_token+1):
+                    new_tokens[i_copy_token] = tokens[i_copy_token]
+                new_tokens[-1] = test_token + ' species'
+                assert new_tokens[-2] == ''
+                new_taxon_string = ';'.join(new_tokens)
+                # assert new_taxon_string not in new_taxon_strings
+                new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
+
+        # ...for each token
+
+    # ...for each taxon
+
+    print('Found {} taxa that need to be inserted to make the taxonomy valid:\n'.format(
+        len(new_taxon_string_to_missing_tokens)))
+
+    new_taxon_string_to_missing_tokens = \
+        sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
+    for taxon_string in new_taxon_string_to_missing_tokens:
+        missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
+        print('{} ({})'.format(taxon_string,missing_taxa))
+
+    for new_taxon_string in new_taxon_string_to_missing_tokens:
+        _insert_taxonomy_string(new_taxon_string)
+
+
+    ##%% Make sure all species on the allow-list are in the taxonomy
+
+    n_failed_mappings = 0
+
+    for target_taxon_latin_name in target_latin_to_common.keys():
+        if target_taxon_latin_name not in speciesnet_latin_name_to_taxon_string:
+            common_name = target_latin_to_common[target_taxon_latin_name]
+            s = '{} ({}) not in speciesnet taxonomy'.format(
+                target_taxon_latin_name,common_name)
+            if common_name in speciesnet_common_name_to_taxon_string:
+                s += ' (common name maps to {})'.format(
+                    speciesnet_common_name_to_taxon_string[common_name])
+            print(s)
+            n_failed_mappings += 1
+
+    if n_failed_mappings > 0:
+        raise ValueError('Cannot continue with geofence generation')
+
+
+    ##%% For the allow-list, map each parent taxon to a set of allowable child taxa
+
+    # Maps parent names to all allowed child names, or None if this is the
+    # lowest-level allowable taxon on this path
+    allowed_parent_taxon_to_child_taxa = defaultdict(set)
+
+    # latin_name = next(iter(target_latin_to_common.keys()))
+    for latin_name in target_latin_to_common:
+
+        taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+        tokens = taxon_string.split(';')
+        assert len(tokens) == 7
+
+        # Remove GUID and common name
+        #
+        # This is now always class/order/family/genus/species
+        tokens = tokens[1:-1]
+
+        child_taxon = None
+
+        # If this is a species
+        if len(tokens[-1]) > 0:
+            binomial_name = tokens[-2] + ' ' + tokens[-1]
+            assert binomial_name == latin_name
+            allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
+            child_taxon = binomial_name
+
+        # The first candidate parent is the genus
+        parent_token_index = len(tokens) - 2
+
+        while(parent_token_index >= 0):
+
+            parent_taxon = tokens[parent_token_index]
+            allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
+            child_taxon = parent_taxon
+            parent_token_index -= 1
+
+    # ...for each allowed latin name
+
+    allowed_parent_taxon_to_child_taxa = \
+        sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
+
+
+    ##%% Map all predictions that exist in this dataset...
+
+    # ...to the prediction we should generate.
+
+    with open(input_file,'r') as f:
+        input_data = json.load(f)
+
+    input_category_id_to_common_name = input_data['classification_categories'] #noqa
+    input_category_id_to_taxonomy_string = \
+        input_data['classification_category_descriptions']
+
+    input_category_id_to_output_taxon_string = {}
+
+    # input_category_id = next(iter(input_category_id_to_taxonomy_string.keys()))
+    for input_category_id in input_category_id_to_taxonomy_string.keys():
+
+        input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
+        input_taxon_tokens = input_taxon_string.split(';')
+        assert len(input_taxon_tokens) == 7
+
+        # Don't mess with blank/no-cv-result/animal/human
+        if (input_taxon_string in non_taxonomic_prediction_strings) or \
+           (input_taxon_string == human_prediction_string):
+            input_category_id_to_output_taxon_string[input_category_id] = \
+                input_taxon_string
+            continue
+
+        # Remove GUID and common name
+        #
+        # This is now always class/order/family/genus/species
+        input_taxon_tokens = input_taxon_tokens[1:-1]
+
+        test_index = len(input_taxon_tokens) - 1
+        target_taxon = None
+
+        # Start at the species level, and see whether each taxon is allowed
+        while((test_index >= 0) and (target_taxon is None)):
+
+            # Species are represented as binomial names
+            if (test_index == (len(input_taxon_tokens) - 1)) and \
+               (len(input_taxon_tokens[-1]) > 0):
+                test_taxon_name = \
+                    input_taxon_tokens[-2] + ' ' + input_taxon_tokens[-1]
+            else:
+                test_taxon_name = input_taxon_tokens[test_index]
+
+            # If we haven't yet found the level at which this taxon is non-empty,
+            # keep going up
+            if len(test_taxon_name) == 0:
+                test_index -= 1
+                continue
+
+            assert test_taxon_name in speciesnet_latin_name_to_taxon_string
+
+            # Is this taxon allowed according to the custom species list?
+            if test_taxon_name in allowed_parent_taxon_to_child_taxa:
+
+                allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
+                assert allowed_child_taxa is not None
+
+                # If this is the lowest-level allowable token or there is not a
+                # unique child, don't walk any further, even if walking down
+                # is enabled.
+                if (None in allowed_child_taxa):
+                    assert len(allowed_child_taxa) == 1
+
+                if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
+                    target_taxon = test_taxon_name
+                elif not allow_walk_down:
+                    target_taxon = test_taxon_name
+                else:
+                    # If there's a unique child, walk back *down* the allowable
+                    # taxa until we run out of unique children
+                    while ((next(iter(allowed_child_taxa)) is not None) and \
+                           (len(allowed_child_taxa) == 1)):
+                        candidate_taxon = next(iter(allowed_child_taxa))
+                        assert candidate_taxon in allowed_parent_taxon_to_child_taxa
+                        assert candidate_taxon in speciesnet_latin_name_to_taxon_string
+                        allowed_child_taxa = \
+                            allowed_parent_taxon_to_child_taxa[candidate_taxon]
+                        target_taxon = candidate_taxon
+
+            # ...if this is an allowed taxon
+
+            test_index -= 1
+
+        # ...for each token
+
+        if target_taxon is None:
+            output_taxon_string = animal_prediction_string
+        else:
+            output_taxon_string = speciesnet_latin_name_to_taxon_string[target_taxon]
+        input_category_id_to_output_taxon_string[input_category_id] = output_taxon_string
+
+    # ...for each category
+
+
+    ##%% Build the new tables
+
+    input_category_id_to_output_category_id = {}
+    output_taxon_string_to_category_id = {}
+    output_category_id_to_common_name = {}
+
+    for input_category_id in input_category_id_to_output_taxon_string:
+
+        original_common_name = \
+            input_category_id_to_common_name[input_category_id]
+        original_taxon_string = \
+            input_category_id_to_taxonomy_string[input_category_id]
+        output_taxon_string = \
+            input_category_id_to_output_taxon_string[input_category_id]
+
+        output_common_name = output_taxon_string.split(';')[-1]
+
+        # Do we need to create a new output category?
+        if output_taxon_string not in output_taxon_string_to_category_id:
+            output_category_id = str(len(output_taxon_string_to_category_id))
+            output_taxon_string_to_category_id[output_taxon_string] = \
+                output_category_id
+            output_category_id_to_common_name[output_category_id] = \
+                output_common_name
+        else:
+            output_category_id = \
+                output_taxon_string_to_category_id[output_taxon_string]
+
+        input_category_id_to_output_category_id[input_category_id] = \
+            output_category_id
+
+        if False:
+            print('Mapping {} ({}) to:\n{} ({})\n'.format(
+                original_common_name,original_taxon_string,
+                output_common_name,output_taxon_string))
+        if False:
+            print('Mapping {} to {}'.format(
+                original_common_name,output_common_name))
+
+    # ...for each category
+
+
+    ##%% Remap all category labels
+
+    assert len(set(output_taxon_string_to_category_id.keys())) == \
+        len(set(output_taxon_string_to_category_id.values()))
+
+    output_category_id_to_taxon_string = \
+        invert_dictionary(output_taxon_string_to_category_id)
+
+    with open(input_file,'r') as f:
+        output_data = json.load(f)
+
+    for im in tqdm(output_data['images']):
+        if 'detections' in im and im['detections'] is not None:
+            for det in im['detections']:
+                if 'classifications' in det:
+                    for classification in det['classifications']:
+                        classification[0] = \
+                            input_category_id_to_output_category_id[classification[0]]
+
+    output_data['classification_categories'] = output_category_id_to_common_name
+    output_data['classification_category_descriptions'] = \
+        output_category_id_to_taxon_string
+
+
+    ##%% Write output
+
+    with open(output_file,'w') as f:
+        json.dump(output_data,f,indent=1)
+
+
 #%% Interactive driver(s)
 
 if False:
 
     pass
 
+    #%% Shared cell to initialize geofencing and taxonomy information
+
+    from megadetector.utils.wi_utils import species_allowed_in_country # noqa
+    from megadetector.utils.wi_utils import initialize_geofencing, initialize_taxonomy_info # noqa
+    from megadetector.utils.wi_utils import _species_string_to_canonical_species_string # noqa
+    from megadetector.utils.wi_utils import generate_csv_rows_for_species # noqa
+    from megadetector.utils.wi_utils import _generate_csv_rows_to_block_all_countries_except # noqa
+
+    from megadetector.utils.wi_utils import taxonomy_string_to_geofencing_rules # noqa
+    from megadetector.utils.wi_utils import taxonomy_string_to_taxonomy_info # noqa
+
+    geofencing_file = r'c:\git\cameratrapai\data\geofence_base.json'
+
+    country_code_file = r'g:\temp\country-codes.csv'
+    # encoding = 'cp1252'; taxonomy_file = r'g:\temp\taxonomy_mapping-' + encoding + '.json'
+    encoding = None; taxonomy_file = r'g:\temp\taxonomy_mapping.json'
+
+    initialize_geofencing(geofencing_file, country_code_file, force_init=True)
+    initialize_taxonomy_info(taxonomy_file, force_init=True, encoding=encoding)
+
+
+    #%% Test driver for geofence_fixes.csv function
+
+    block_except_list = 'AUS, PNG, THA, IDN, MYS'
+    species = 'dingo'
+    species_string = _species_string_to_canonical_species_string(species)
+    rows = _generate_csv_rows_to_block_all_countries_except(species_string,block_except_list)
+
+    import clipboard; clipboard.copy('\n'.join(rows))
+
+
+    #%%
+
+    generate_csv_rows_for_species(species_string=species_string,
+                                  allow_countries=None,
+                                  block_countries=None,
+                                  allow_states=None,
+                                  block_states=None,
+                                  blockexcept_countries=None)
+
+
+    _generate_csv_rows_to_block_all_countries_except(species_string,'AUS')
+
+
+    #%% Test the effects of geofence changes
+
+    species = 'canis lupus dingo'
+    country = 'guatemala'
+    species_allowed_in_country(species,country,state=None,return_status=False)
+
+
     #%% instances.json generation test
 
     from megadetector.utils.wi_utils import generate_instances_json_from_folder # noqa
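For the new `restrict_to_taxa_list` function, a hypothetical driver follows; every path is invented, and the two-column taxa-list format follows the docstring. With allow_walk_down=True, a felid-family prediction would be mapped down to 'puma concolor' if that were the only allowed felid; with the default False, it stays at the family level.

```python
# Hypothetical driver; all filenames are invented, and the listed species
# must exist in the supplied SpeciesNet taxonomy file
from megadetector.utils.wi_utils import restrict_to_taxa_list

# One latin name per line, with an optional comma-delimited common name
with open('taxa_list.txt', 'w') as f:
    f.write('puma concolor,puma\n')
    f.write('odocoileus virginianus,white-tailed deer\n')

restrict_to_taxa_list(taxa_list='taxa_list.txt',
                      speciesnet_taxonomy_file='speciesnet_taxonomy.txt',
                      input_file='md_results.json',
                      output_file='md_results_restricted.json',
                      allow_walk_down=False)
```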
@@ -1604,13 +2515,6 @@ if False:
 
     #%% Geofencing tests
 
-    geofencing_file = r'g:\temp\geofence_mapping.json'
-    country_code_file = r'G:/temp/country-codes.csv'
-    encoding = 'cp1252'; taxonomy_file = r'g:\temp\taxonomy_mapping-' + encoding + '.json'
-
-    initialize_taxonomy_info(taxonomy_file, force_init=True, encoding=encoding)
-    initialize_geofencing(geofencing_file, country_code_file, force_init=True)
-
     species = 'didelphis marsupialis'
     print(binomial_name_to_taxonomy_info[species])
     country = 'Guatemala'
@@ -1624,13 +2528,6 @@ if False:
 
     #%% Test several species
 
-    geofencing_file = r'g:\temp\geofence_mapping.json'
-    country_code_file = r'G:/temp/country-codes.csv'
-    encoding = 'cp1252'; taxonomy_file = r'g:\temp\taxonomy_mapping-' + encoding + '.json'
-
-    initialize_taxonomy_info(taxonomy_file, force_init=True, encoding=encoding)
-    initialize_geofencing(geofencing_file, country_code_file, force_init=True)
-
     if True:
 
         # Make sure some Guatemalan species are allowed in Guatemala