PyPI - megadetector - Versions diffs - 10.0.15__py3-none-any.whl - Mend

megadetector 10.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147) hide show

megadetector/__init__.py +0 -0
megadetector/api/__init__.py +0 -0
megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
megadetector/classification/__init__.py +0 -0
megadetector/classification/aggregate_classifier_probs.py +108 -0
megadetector/classification/analyze_failed_images.py +227 -0
megadetector/classification/cache_batchapi_outputs.py +198 -0
megadetector/classification/create_classification_dataset.py +626 -0
megadetector/classification/crop_detections.py +516 -0
megadetector/classification/csv_to_json.py +226 -0
megadetector/classification/detect_and_crop.py +853 -0
megadetector/classification/efficientnet/__init__.py +9 -0
megadetector/classification/efficientnet/model.py +415 -0
megadetector/classification/efficientnet/utils.py +608 -0
megadetector/classification/evaluate_model.py +520 -0
megadetector/classification/identify_mislabeled_candidates.py +152 -0
megadetector/classification/json_to_azcopy_list.py +63 -0
megadetector/classification/json_validator.py +696 -0
megadetector/classification/map_classification_categories.py +276 -0
megadetector/classification/merge_classification_detection_output.py +509 -0
megadetector/classification/prepare_classification_script.py +194 -0
megadetector/classification/prepare_classification_script_mc.py +228 -0
megadetector/classification/run_classifier.py +287 -0
megadetector/classification/save_mislabeled.py +110 -0
megadetector/classification/train_classifier.py +827 -0
megadetector/classification/train_classifier_tf.py +725 -0
megadetector/classification/train_utils.py +323 -0
megadetector/data_management/__init__.py +0 -0
megadetector/data_management/animl_to_md.py +161 -0
megadetector/data_management/annotations/__init__.py +0 -0
megadetector/data_management/annotations/annotation_constants.py +33 -0
megadetector/data_management/camtrap_dp_to_coco.py +270 -0
megadetector/data_management/cct_json_utils.py +566 -0
megadetector/data_management/cct_to_md.py +184 -0
megadetector/data_management/cct_to_wi.py +293 -0
megadetector/data_management/coco_to_labelme.py +284 -0
megadetector/data_management/coco_to_yolo.py +701 -0
megadetector/data_management/databases/__init__.py +0 -0
megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
megadetector/data_management/databases/integrity_check_json_db.py +563 -0
megadetector/data_management/databases/subset_json_db.py +195 -0
megadetector/data_management/generate_crops_from_cct.py +200 -0
megadetector/data_management/get_image_sizes.py +164 -0
megadetector/data_management/labelme_to_coco.py +559 -0
megadetector/data_management/labelme_to_yolo.py +349 -0
megadetector/data_management/lila/__init__.py +0 -0
megadetector/data_management/lila/create_lila_blank_set.py +556 -0
megadetector/data_management/lila/create_lila_test_set.py +192 -0
megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
megadetector/data_management/lila/download_lila_subset.py +182 -0
megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
megadetector/data_management/lila/get_lila_image_counts.py +112 -0
megadetector/data_management/lila/lila_common.py +319 -0
megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
megadetector/data_management/mewc_to_md.py +344 -0
megadetector/data_management/ocr_tools.py +873 -0
megadetector/data_management/read_exif.py +964 -0
megadetector/data_management/remap_coco_categories.py +195 -0
megadetector/data_management/remove_exif.py +156 -0
megadetector/data_management/rename_images.py +194 -0
megadetector/data_management/resize_coco_dataset.py +665 -0
megadetector/data_management/speciesnet_to_md.py +41 -0
megadetector/data_management/wi_download_csv_to_coco.py +247 -0
megadetector/data_management/yolo_output_to_md_output.py +594 -0
megadetector/data_management/yolo_to_coco.py +984 -0
megadetector/data_management/zamba_to_md.py +188 -0
megadetector/detection/__init__.py +0 -0
megadetector/detection/change_detection.py +840 -0
megadetector/detection/process_video.py +479 -0
megadetector/detection/pytorch_detector.py +1451 -0
megadetector/detection/run_detector.py +1267 -0
megadetector/detection/run_detector_batch.py +2172 -0
megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
megadetector/detection/run_md_and_speciesnet.py +1604 -0
megadetector/detection/run_tiled_inference.py +1044 -0
megadetector/detection/tf_detector.py +209 -0
megadetector/detection/video_utils.py +1379 -0
megadetector/postprocessing/__init__.py +0 -0
megadetector/postprocessing/add_max_conf.py +72 -0
megadetector/postprocessing/categorize_detections_by_size.py +166 -0
megadetector/postprocessing/classification_postprocessing.py +1943 -0
megadetector/postprocessing/combine_batch_outputs.py +249 -0
megadetector/postprocessing/compare_batch_results.py +2110 -0
megadetector/postprocessing/convert_output_format.py +403 -0
megadetector/postprocessing/create_crop_folder.py +629 -0
megadetector/postprocessing/detector_calibration.py +570 -0
megadetector/postprocessing/generate_csv_report.py +522 -0
megadetector/postprocessing/load_api_results.py +223 -0
megadetector/postprocessing/md_to_coco.py +428 -0
megadetector/postprocessing/md_to_labelme.py +351 -0
megadetector/postprocessing/md_to_wi.py +41 -0
megadetector/postprocessing/merge_detections.py +392 -0
megadetector/postprocessing/postprocess_batch_results.py +2140 -0
megadetector/postprocessing/remap_detection_categories.py +226 -0
megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
megadetector/postprocessing/separate_detections_into_folders.py +795 -0
megadetector/postprocessing/subset_json_detector_output.py +964 -0
megadetector/postprocessing/top_folders_to_bottom.py +238 -0
megadetector/postprocessing/validate_batch_results.py +332 -0
megadetector/taxonomy_mapping/__init__.py +0 -0
megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
megadetector/taxonomy_mapping/simple_image_download.py +231 -0
megadetector/taxonomy_mapping/species_lookup.py +1008 -0
megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
megadetector/tests/__init__.py +0 -0
megadetector/tests/test_nms_synthetic.py +335 -0
megadetector/utils/__init__.py +0 -0
megadetector/utils/ct_utils.py +1857 -0
megadetector/utils/directory_listing.py +199 -0
megadetector/utils/extract_frames_from_video.py +307 -0
megadetector/utils/gpu_test.py +125 -0
megadetector/utils/md_tests.py +2072 -0
megadetector/utils/path_utils.py +2872 -0
megadetector/utils/process_utils.py +172 -0
megadetector/utils/split_locations_into_train_val.py +237 -0
megadetector/utils/string_utils.py +234 -0
megadetector/utils/url_utils.py +825 -0
megadetector/utils/wi_platform_utils.py +968 -0
megadetector/utils/wi_taxonomy_utils.py +1766 -0
megadetector/utils/write_html_image_list.py +239 -0
megadetector/visualization/__init__.py +0 -0
megadetector/visualization/plot_utils.py +309 -0
megadetector/visualization/render_images_with_thumbnails.py +243 -0
megadetector/visualization/visualization_utils.py +1973 -0
megadetector/visualization/visualize_db.py +630 -0
megadetector/visualization/visualize_detector_output.py +498 -0
megadetector/visualization/visualize_video_output.py +705 -0
megadetector-10.0.15.dist-info/METADATA +115 -0
megadetector-10.0.15.dist-info/RECORD +147 -0
megadetector-10.0.15.dist-info/WHEEL +5 -0
megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
megadetector-10.0.15.dist-info/top_level.txt +1 -0

megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py ADDED Viewed

@@ -0,0 +1,491 @@
+"""
+map_lila_taxonomy_to_wi_taxonomy.py
+Loads the LILA category mapping (in which taxonomy information comes from an
+iNat taxonomy snapshot) and tries to map each class to the Wildlife Insights taxonomy.
+"""
+#%% Constants and imports
+import numpy as np
+import json
+import os
+from tqdm import tqdm
+from megadetector.data_management.lila.lila_common import \
+    read_lila_taxonomy_mapping, read_wildlife_insights_taxonomy_mapping
+#%% Prevent execution during infrastructural imports
+if False:
+    #%%
+    # Local working folder; all inputs other than the supplementary mapping file, and the
+    # main output file, live under this folder.
+    lila_local_base = os.path.expanduser('~/lila')
+    metadata_dir = os.path.join(lila_local_base, 'metadata')
+    os.makedirs(metadata_dir, exist_ok=True)
+    # Created by get_lila_category_list.py... contains counts for each category
+    category_list_dir = os.path.join(lila_local_base, 'lila_categories_list')
+    lila_dataset_to_categories_file = os.path.join(
+        category_list_dir, 'lila_dataset_to_categories.json')
+    # This is a manually-curated file used to store mappings that had to be made manually
+    lila_to_wi_supplementary_mapping_file = os.path.expanduser(
+        '~/git/MegaDetector/taxonomy_mapping/lila_to_wi_supplementary_mapping_file.csv')
+    assert os.path.isfile(lila_dataset_to_categories_file)
+    # This is the main output file from this whole process
+    wi_mapping_table_file = os.path.join(lila_local_base,'lila_wi_mapping_table.csv')
+    # Key in each WI taxonomy record that holds the record's ID; used below when skipping
+    # known-problematic IDs and when choosing preferred taxa.
+    id_column = 'uniqueIdentifier' # 'id'
+    #%% Load category and taxonomy files
+    # Dataset name --> list of category dicts (each with at least 'name' and 'count', which
+    # are the keys read when the mapping table is written)
+    with open(lila_dataset_to_categories_file, 'r') as f:
+        lila_dataset_to_categories = json.load(f)
+    lila_taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+    wi_taxonomy_df = read_wildlife_insights_taxonomy_mapping(metadata_dir)
+    #%% Pull everything out of pandas
+    # One dict per dataframe row
+    lila_taxonomy = lila_taxonomy_df.to_dict('records')
+    wi_taxonomy = wi_taxonomy_df.to_dict('records')
+    #%% Cache WI taxonomy lookups
+    def _is_empty_wi_item(v):
+        # Returns True if [v] is an empty string, None, or a float NaN, and False if [v] is a
+        # non-empty string.  Any other value fails the assertion below.
+        if isinstance(v, str):
+            return len(v) == 0
+        elif v is None:
+            return True
+        else:
+            # The only non-string, non-None value we expect is NaN
+            assert isinstance(v, float) and np.isnan(v), 'Invalid item: {}'.format(str(v))
+            return True
+    def _taxonomy_items_equal(a, b):
+        # Compares two taxonomy field values.  A string never equals a non-string, two
+        # strings are compared with ==, and two non-strings (both None, or both floats)
+        # are treated as equal without comparing their values.
+        if isinstance(a, str) and (not isinstance(b, str)):
+            return False
+        if isinstance(b, str) and (not isinstance(a, str)):
+            return False
+        # After the two checks above, a and b are either both strings or both non-strings
+        if (not isinstance(a, str)) or (not isinstance(b, str)):
+            # NOTE(review): one None and one float (e.g. NaN) fails this assertion rather
+            # than comparing equal; confirm that mix can't occur in the WI records.
+            assert (a is None and b is None) or (isinstance(a, float) and isinstance(b, float))
+            return True
+        return a == b
+    # Give every WI record a 'taxon_name' field up front; records skipped via ignore_taxa
+    # in the loop below keep the value None.
+    for taxon in wi_taxonomy:
+        taxon['taxon_name'] = None
+    from collections import defaultdict
+    # Lowercase taxon name --> list of WI records with that name (a name can appear more
+    # than once; duplicates are resolved in a later cell)
+    wi_taxon_name_to_taxa = defaultdict(list)
+    # This is just a handy lookup table that we'll use to debug mismatches; it is also the
+    # common-name fallback when mapping LILA categories later in this file.
+    wi_common_name_to_taxon = {}
+    # Special (non-taxonomic) entries, matched by lowercased English common name
+    blank_taxon_name = 'blank'
+    blank_taxon = None
+    animal_taxon_name = 'animal'
+    animal_taxon = None
+    unknown_taxon_name = 'unknown'
+    unknown_taxon = None
+    # Records with these common names are skipped entirely
+    ignore_taxa = set(['No CV Result', 'CV Needed', 'CV Failed'])
+    # Records with these IDs are exempt from the duplicate-consistency check below
+    known_problematic_taxon_ids = ['f94e6d97-59cf-4d38-a05a-a75efdd2863b']
+    # All records whose taxon name resolves to 'homo sapiens'
+    human_taxa = []
+    # taxon = wi_taxonomy[21653]; print(taxon)
+    for taxon in tqdm(wi_taxonomy):
+        taxon_name = None
+        assert taxon['taxonomyType'] == 'object' or taxon['taxonomyType'] == 'biological'
+        # Strip whitespace from every string field, in place
+        for k in taxon.keys():
+            v = taxon[k]
+            if isinstance(v,str):
+                taxon[k] = v.strip()
+        if taxon['commonNameEnglish'] in ignore_taxa:
+            continue
+        if isinstance(taxon['commonNameEnglish'], str):
+            # If several records share a common name, the last one wins in this table
+            wi_common_name_to_taxon[taxon['commonNameEnglish'].strip(
+            ).lower()] = taxon
+            special_taxon = False
+            # Look for keywords that don't refer to specific taxa: blank/animal/unknown
+            if taxon['commonNameEnglish'].strip().lower() == blank_taxon_name:
+                blank_taxon = taxon
+                special_taxon = True
+            elif taxon['commonNameEnglish'].strip().lower() == animal_taxon_name:
+                animal_taxon = taxon
+                special_taxon = True
+            elif taxon['commonNameEnglish'].strip().lower() == unknown_taxon_name:
+                unknown_taxon = taxon
+                special_taxon = True
+            if special_taxon:
+                # Special entries are named by their common name and skip the
+                # taxonomic-level logic below
+                taxon_name = taxon['commonNameEnglish'].strip().lower()
+                taxon['taxon_name'] = taxon_name
+                wi_taxon_name_to_taxa[taxon_name].append(taxon)
+                continue
+        # Use the most specific populated level as the taxon name, asserting along the way
+        # that the expected higher levels are populated too.
+        #
+        # Do we have a species name?
+        if not _is_empty_wi_item(taxon['species']):
+            # If 'species' is populated, 'genus' should always be populated; one item currently breaks
+            # this rule.
+            #
+            # NOTE(review): the assertion below enforces the rule unconditionally, so either the
+            # offending item has been fixed upstream or this cell fails on it; confirm.
+            assert not _is_empty_wi_item(taxon['genus'])
+            taxon_name = (taxon['genus'].strip() + ' ' +
+                        taxon['species'].strip()).strip().lower()
+            assert not _is_empty_wi_item(taxon['class']) and \
+                not _is_empty_wi_item(taxon['order']) and \
+                not _is_empty_wi_item(taxon['family'])
+        elif not _is_empty_wi_item(taxon['genus']):
+            assert not _is_empty_wi_item(taxon['class']) and \
+                not _is_empty_wi_item(taxon['order']) and \
+                not _is_empty_wi_item(taxon['family'])
+            taxon_name = taxon['genus'].strip().lower()
+        elif not _is_empty_wi_item(taxon['family']):
+            assert not _is_empty_wi_item(taxon['class']) and \
+                not _is_empty_wi_item(taxon['order'])
+            taxon_name = taxon['family'].strip().lower()
+        elif not _is_empty_wi_item(taxon['order']):
+            assert not _is_empty_wi_item(taxon['class'])
+            taxon_name = taxon['order'].strip().lower()
+        elif not _is_empty_wi_item(taxon['class']):
+            taxon_name = taxon['class'].strip().lower()
+        # A record with any populated taxonomic level must be 'biological'; a record with
+        # none must be 'object', and is named by its common name instead.
+        if taxon_name is not None:
+            assert taxon['taxonomyType'] == 'biological'
+        else:
+            assert taxon['taxonomyType'] == 'object'
+            taxon_name = taxon['commonNameEnglish'].strip().lower()
+        # If we've seen this name before, every taxonomic level has to agree with the
+        # previous record(s) of the same name
+        if taxon_name in wi_taxon_name_to_taxa:
+            if taxon[id_column] in known_problematic_taxon_ids:
+                # Only the consistency check is skipped; the record is still stored below
+                print('Skipping problematic taxon ID {}'.format(taxon[id_column]))
+            else:
+                previous_taxa = wi_taxon_name_to_taxa[taxon_name]
+                for previous_taxon in previous_taxa:
+                    for level in ['class', 'order', 'family', 'genus', 'species']:
+                        error_string = 'Error: taxon {} appeared previously in {} {} (as {}), now in {} {}'.format(
+                            taxon_name,
+                            level,previous_taxon[level],
+                            previous_taxon['taxon_name'],
+                            level,taxon[level])
+                        assert _taxonomy_items_equal(previous_taxon[level], taxon[level]), error_string
+        taxon['taxon_name'] = taxon_name
+        if taxon_name == 'homo sapiens':
+            human_taxa.append(taxon)
+        wi_taxon_name_to_taxa[taxon_name].append(taxon)
+    # ...for each taxon
+    # All three special entries must have been found
+    assert unknown_taxon is not None
+    assert animal_taxon is not None
+    assert blank_taxon is not None
+    #%% Find redundant taxa
+    taxon_names_with_multiple_entries = []
+    for wi_taxon_name in wi_taxon_name_to_taxa:
+        if len(wi_taxon_name_to_taxa[wi_taxon_name]) > 1:
+            taxon_names_with_multiple_entries.append(wi_taxon_name)
+    print('{} names have multiple entries\n:'.format(len(taxon_names_with_multiple_entries)))
+    for s in taxon_names_with_multiple_entries:
+        print(s)
+    if False:
+        pass
+        #%% Manual review of redundant taxa
+        s = taxon_names_with_multiple_entries[15]
+        taxa = wi_taxon_name_to_taxa[s]
+        for t in taxa:
+            for k in t.keys():
+                print('{}: {}'.format(k,t[k]))
+            print()
+            # print(t,end='\n\n')
+    #%% Clean up redundant taxa
+    # Taxon name --> ID of the single WI record we keep for that name.  Every name listed
+    # here is expected to currently have more than one WI record (asserted below).
+    taxon_name_to_preferred_taxon_id = {}
+    # "helmeted guineafowl" vs "domestic guineafowl"
+    taxon_name_to_preferred_taxon_id['numida meleagris'] = '83133617-8358-4910-82ee-4c23e40ba3dc' # 2005826
+    # "domestic turkey" vs. "wild turkey"
+    taxon_name_to_preferred_taxon_id['meleagris gallopavo'] = 'c10547c3-1748-48bf-a451-8066c820f22f' # 2021598
+    # multiple sensible human entries
+    taxon_name_to_preferred_taxon_id['homo sapiens'] = '990ae9dd-7a59-4344-afcb-1b7b21368000' # 2002045
+    # "domestic dog" and "dog-on-leash"
+    taxon_name_to_preferred_taxon_id['canis familiaris'] = '3d80f1d6-b1df-4966-9ff4-94053c7a902a' # 2021548
+    # "small mammal" vs. "mammal"
+    taxon_name_to_preferred_taxon_id['mammalia'] = 'f2d233e3-80e3-433d-9687-e29ecc7a467a' # 2021108
+    # "Hispaniolan Mango" vs. NaN
+    #
+    # This is the same ID that appears in known_problematic_taxon_ids
+    taxon_name_to_preferred_taxon_id['anthracothorax dominicus'] = 'f94e6d97-59cf-4d38-a05a-a75efdd2863b'
+    # "millipedes" vs. "Millipede"
+    taxon_name_to_preferred_taxon_id['diplopoda'] =  '065884eb-4e64-4233-84dc-de25bd06ffd2' # 2021760
+    # Different suborders: Squamata vs. Lacertilia
+    taxon_name_to_preferred_taxon_id['squamata'] = '710c4066-bd5d-4313-bcf4-0217c4c84da7' # 2021703
+    # Redundancy (both "beautiful firetail")
+    taxon_name_to_preferred_taxon_id['stagonopleura bella'] = '7fec8e7e-fd3b-4d7f-99fd-3ade6f3bbaa5' # 2021939
+    # "yellow wagtail" vs. "yellow crowned-wagtail"
+    taxon_name_to_preferred_taxon_id['motacilla flava'] = 'ac6669bc-9f9e-4473-b609-b9082f9bf50c' # 2016194
+    # "dremomys species" vs. "dremomys genus"
+    taxon_name_to_preferred_taxon_id['dremomys'] = '1507d153-af11-46f1-bfb8-77918d035ab3' # 2019370
+    # "elk" vs. "domestic elk"
+    taxon_name_to_preferred_taxon_id['cervus canadensis'] = 'c5ce946f-8f0d-4379-992b-cc0982381f5e'
+    # "American bison" vs. "domestic bison"
+    taxon_name_to_preferred_taxon_id['bison bison'] = '539ebd55-081b-429a-9ae6-5a6a0f6999d4' # 2021593
+    # "woodrat or rat or mouse species" vs. "mouse species"
+    taxon_name_to_preferred_taxon_id['muridae'] = 'e7503287-468c-45af-a1bd-a17821bb62f2' # 2021642
+    # both "southern sand frog"
+    taxon_name_to_preferred_taxon_id['tomopterna adiastola'] = 'a5dc63cb-41be-4090-84a7-b944b16dcee4' # 2021834
+    # sericornis species vs. scrubwren species
+    taxon_name_to_preferred_taxon_id['sericornis'] = 'ad82c0ac-df48-4028-bf71-d2b2f4bc4129' # 2021776
+    # taxon_name = list(taxon_name_to_preferred_taxon_id.keys())[0]
+    for taxon_name in taxon_name_to_preferred_taxon_id.keys():
+        candidate_taxa = wi_taxon_name_to_taxa[taxon_name]
+        # If we've gotten this far, we should be choosing from multiple taxa.
+        #
+        # This will become untrue if any of these are resolved later, at which point we should
+        # remove them from taxon_name_to_preferred_taxon_id
+        assert len(candidate_taxa) > 1, 'Only one taxon available for {}'.format(taxon_name)
+        # Choose the preferred taxon; exactly one candidate must carry the preferred ID
+        selected_taxa = [t for t in candidate_taxa if t[id_column] == \
+                        taxon_name_to_preferred_taxon_id[taxon_name]]
+        assert len(selected_taxa) == 1
+        wi_taxon_name_to_taxa[taxon_name] = selected_taxa
+    # Collapse the name --> list-of-records table into name --> record.  This fails if any
+    # name still has multiple records, i.e. if a redundant name is missing from the table above.
+    wi_taxon_name_to_taxon = {}
+    for taxon_name in wi_taxon_name_to_taxa.keys():
+        taxa = wi_taxon_name_to_taxa[taxon_name]
+        assert len(taxa) == 1
+        wi_taxon_name_to_taxon[taxon_name] = taxa[0]
+    #%% Read supplementary mappings
+    with open(lila_to_wi_supplementary_mapping_file, 'r') as f:
+        lines = f.readlines()
+    supplementary_lila_query_to_wi_query = {}
+    for line in lines:
+        # Each line is [lila query],[WI taxon name],[notes]
+        tokens = line.strip().split(',')
+        assert len(tokens) == 3
+        lila_query = tokens[0].strip().lower()
+        wi_taxon_name = tokens[1].strip().lower()
+        assert wi_taxon_name in wi_taxon_name_to_taxa
+        supplementary_lila_query_to_wi_query[lila_query] = wi_taxon_name
+    #%% Map LILA categories to WI categories
+    # For every LILA taxonomy row, pick a query string and look for a WI taxon, trying in
+    # order: the special unknown/blank/animal queries, an exact WI taxon name, the
+    # supplementary mapping table, and finally the English common name.
+    #
+    # Queries with no WI taxon at all
+    mismatches = set()
+    # Queries that only matched via common name
+    mismatches_with_common_mappings = set()
+    # Queries that matched via the supplementary mapping table
+    supplementary_mappings = set()
+    # Every populated taxonomic-level value seen in any LILA row (used only for reporting)
+    all_searches = set()
+    # Must be ordered from kingdom --> species
+    lila_taxonomy_levels = ['kingdom', 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
+                            'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
+                            'superfamily', 'family', 'subfamily', 'tribe', 'genus', 'species']
+    unknown_queries = set(
+        ['unidentifiable', 'other', 'unidentified', 'unknown', 'unclassifiable'])
+    blank_queries = set(['empty'])
+    animal_queries = set(['animalia'])
+    # '[dataset name]:[query]' --> WI taxon record (or None if nothing matched)
+    lila_dataset_category_to_wi_taxon = {}
+    # i_taxon = 0; taxon = lila_taxonomy[i_taxon]; print(taxon)
+    for i_taxon, lila_taxon in enumerate(lila_taxonomy):
+        query = None
+        lila_dataset_category = lila_taxon['dataset_name'] + ':' + lila_taxon['query']
+        # Go from kingdom --> species, choosing the lowest-level description as the query
+        #
+        # The loop doesn't break, so the last populated level wins
+        for level in lila_taxonomy_levels:
+            if isinstance(lila_taxon[level], str):
+                query = lila_taxon[level]
+                all_searches.add(query)
+        if query is None:
+            # E.g., 'car'
+            query = lila_taxon['query']
+        wi_taxon = None
+        if query in unknown_queries:
+            wi_taxon = unknown_taxon
+        elif query in blank_queries:
+            wi_taxon = blank_taxon
+        elif query in animal_queries:
+            wi_taxon = animal_taxon
+        elif query in wi_taxon_name_to_taxon:
+            wi_taxon = wi_taxon_name_to_taxon[query]
+        elif query in supplementary_lila_query_to_wi_query:
+            wi_taxon = wi_taxon_name_to_taxon[supplementary_lila_query_to_wi_query[query]]
+            supplementary_mappings.add(query)
+            # print('Made a supplementary mapping from {} to {}'.format(query,wi_taxon['taxon_name']))
+        else:
+            # print('No match for {}'.format(query))
+            #
+            # NOTE(review): wi_common_name_to_taxon is keyed by stripped, lowercased names,
+            # but the LILA common name is used as-is here; presumably it is already
+            # lowercase, verify against the LILA taxonomy file.
+            lila_common_name = lila_taxon['common_name']
+            if lila_common_name in wi_common_name_to_taxon:
+                wi_taxon = wi_common_name_to_taxon[lila_common_name]
+                wi_common_name = wi_taxon['commonNameEnglish']
+                wi_taxon_name = wi_taxon['taxon_name']
+                # Debug output, disabled by default
+                if False:
+                    print('LILA common name {} maps to WI taxon {} ({})'.format(lila_common_name,
+                                                                                wi_taxon_name,
+                                                                                wi_common_name))
+                mismatches_with_common_mappings.add(query)
+            else:
+                mismatches.add(query)
+        # This stores None for true mismatches, but the assertion after the loop prevents
+        # us from continuing in that case
+        lila_dataset_category_to_wi_taxon[lila_dataset_category] = wi_taxon
+    # ...for each LILA taxon
+    print('Of {} entities, there are {} mismatches ({} mapped by common name) ({} mapped by supplementary mapping file)'.format(
+        len(all_searches), len(mismatches), len(mismatches_with_common_mappings), len(supplementary_mappings)))
+    # Everything has to map to something before we write the output table
+    assert len(mismatches) == 0
+    #%% Manual mapping
+    if not os.path.isfile(lila_to_wi_supplementary_mapping_file):
+        print('Creating mapping file {}'.format(
+            lila_to_wi_supplementary_mapping_file))
+        with open(lila_to_wi_supplementary_mapping_file, 'w') as f:
+            for query in mismatches:
+                f.write(query + ',' + '\n')
+    else:
+        print('{} exists, not re-writing'.format(lila_to_wi_supplementary_mapping_file))
+    #%% Build a dictionary from LILA dataset names and categories to LILA taxa
+    # Keys are '[dataset name]:[query]'; each key must be unique across the LILA taxonomy
+    lila_dataset_category_to_lila_taxon = {}
+    # i_d = 0; d = lila_taxonomy[i_d]
+    for i_d,d in enumerate(lila_taxonomy):
+        lila_dataset_category = d['dataset_name'] + ':' + d['query']
+        assert lila_dataset_category not in lila_dataset_category_to_lila_taxon
+        lila_dataset_category_to_lila_taxon[lila_dataset_category] = d
+    #%% Map LILA datasets to WI taxa, and count the number of each taxon available in each dataset
+    with open(wi_mapping_table_file,'w') as f:
+        f.write('lila_dataset_name,lila_category_name,wi_guid,wi_taxon_name,wi_common,count\n')
+        # dataset_name = list(lila_dataset_to_categories.keys())[0]
+        for dataset_name in lila_dataset_to_categories.keys():
+            if '_bbox' in dataset_name:
+                continue
+            dataset_categories = lila_dataset_to_categories[dataset_name]
+            # dataset_category = dataset_categories[0]
+            for category in dataset_categories:
+                lila_dataset_category = dataset_name + ':' + category['name'].strip().lower()
+                if '#' in lila_dataset_category:
+                    continue
+                assert lila_dataset_category in lila_dataset_category_to_lila_taxon
+                assert lila_dataset_category in lila_dataset_category_to_wi_taxon
+                assert 'count' in category
+                wi_taxon = lila_dataset_category_to_wi_taxon[lila_dataset_category]
+                # Write out the dataset name, category name, WI GUID, WI scientific name, WI common name,
+                # and count
+                s = f"{dataset_name},{category['name']},{wi_taxon['uniqueIdentifier']},"+\
+                    f"{wi_taxon['taxon_name']},{wi_taxon['commonNameEnglish']},{category['count']}\n"
+                f.write(s)
+            # ...for each category in this dataset
+        # ...for each dataset
+    # ...with open()

megadetector/taxonomy_mapping/map_new_lila_datasets.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""
+map_new_lila_datasets.py
+Given a subset of LILA datasets, find all the categories, and start the taxonomy
+mapping process.
+"""
+#%% Constants and imports
+import os
+import json
+# Created by get_lila_category_list.py
+input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
+# The initial mapping table is written here; this file must not already exist when the
+# "write output rows" cell runs
+output_file = os.path.expanduser('~/lila/lila_additions_2025.11.17.csv')
+# Names of the LILA datasets whose categories we want to map; each one has to be present
+# in the category list file
+datasets_to_map = [
+    'Biome Health Project Maasai Mara 2018'
+    ]
+#%% Initialize taxonomic lookup
+# Takes ~2 mins
+from megadetector.taxonomy_mapping.species_lookup import \
+    initialize_taxonomy_lookup, get_preferred_taxonomic_match
+initialize_taxonomy_lookup(force_init=False)
+#%% Read the list of datasets
+# Dataset name --> list of category dicts
+with open(input_lila_category_list_file,'r') as f:
+    input_lila_categories = json.load(f)
+# Set of dataset names, with any '_bbox' substring removed
+lila_datasets = set()
+for dataset_name in input_lila_categories.keys():
+    # The script that generates this dictionary creates a separate entry for bounding box
+    # metadata files, but those don't represent new dataset names, so we ignore them here.
+    lila_datasets.add(dataset_name.replace('_bbox',''))
+# Make sure every dataset we were asked to map is one we know about
+for s in datasets_to_map:
+    assert s in lila_datasets
+#%% Find all categories
+# List of '[dataset name]:[category name]' strings, one per category in each dataset we're
+# mapping; these are split back apart on ':' in the matching cell.
+category_mappings = []
+# dataset_name = datasets_to_map[0]
+for dataset_name in datasets_to_map:
+    # NOTE(review): dataset names were validated against the '_bbox'-stripped name set, but
+    # this lookup uses the raw keys; a dataset that only has a '_bbox' entry would raise
+    # KeyError here.  Confirm that can't happen.
+    ds_categories = input_lila_categories[dataset_name]
+    for category in ds_categories:
+        category_name = category['name']
+        # ':' is the separator in the mapping string, so it can't appear in a category name
+        assert ':' not in category_name
+        mapping_name = dataset_name + ':' + category_name
+        category_mappings.append(mapping_name)
+print('Need to create {} mappings'.format(len(category_mappings)))
+#%% Match every query against our taxonomies
+# One dict per mapping string, with the same keys as the columns of the output .csv file
+output_rows = []
+taxonomy_preference = 'inat'
+# If this is True, a match from a source other than [taxonomy_preference] is still written
+# out; if it's False, such a row gets empty taxonomy fields instead.
+allow_non_preferred_matches = True
+# mapping_string = category_mappings[1]; print(mapping_string)
+for mapping_string in category_mappings:
+    # Mapping strings are '[dataset name]:[category name]'; this also requires that dataset
+    # names contain no ':'
+    tokens = mapping_string.split(':')
+    assert len(tokens) == 2
+    dataset_name = tokens[0]
+    query = tokens[1]
+    taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
+    # NOTE(review): the manual lookup cell at the end of this file guards against a None
+    # result from get_preferred_taxonomic_match; a None here would fail on .source.
+    # Confirm whether None is a possible return value.
+    if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:
+        output_row = {
+            'dataset_name': dataset_name,
+            'query': query,
+            'source': taxonomic_match.source,
+            'taxonomy_level': taxonomic_match.taxonomic_level,
+            'scientific_name': taxonomic_match.scientific_name,
+            'common_name': taxonomic_match.common_name,
+            'taxonomy_string': taxonomic_match.taxonomy_string
+        }
+    else:
+        # Leave the taxonomy fields blank, to be filled in later
+        output_row = {
+            'dataset_name': dataset_name,
+            'query': query,
+            'source': '',
+            'taxonomy_level': '',
+            'scientific_name': '',
+            'common_name': '',
+            'taxonomy_string': ''
+        }
+    output_rows.append(output_row)
+# ...for each mapping
+#%% Write output rows
+# os is already imported at the top of this file; it's re-imported here so this cell can
+# be run on its own
+import os
+import pandas as pd
+# Refuse to overwrite an existing output file
+assert not os.path.isfile(output_file), 'Delete the output file before re-generating'
+output_df = pd.DataFrame(data=output_rows, columns=[
+    'dataset_name', 'query', 'source', 'taxonomy_level',
+    'scientific_name', 'common_name', 'taxonomy_string'])
+# index=None is falsy, so the dataframe index is not written
+output_df.to_csv(output_file, index=None, header=True)
+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+#%% Remap missing entries in the .csv file
+# ...typically because I made a change to the mapping code.
+#
+# Re-reads the output file, retries the lookup for every row whose 'source' is empty, and
+# writes the result to a separate "remapped" file next to the original.
+from megadetector.utils.path_utils import insert_before_extension
+from megadetector.utils.ct_utils import is_empty
+remapped_file = insert_before_extension(output_file,'remapped')
+df = pd.read_csv(output_file)
+for i_row,row in df.iterrows():
+    # Do we need to map this row?
+    if is_empty(row['source']):
+        query = row['query']
+        print('Mapping {}'.format(query))
+        taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
+        # Unlike the initial matching cell, this only accepts matches from the preferred
+        # source; other rows are left empty
+        if (taxonomic_match.source == taxonomy_preference):
+            source = taxonomic_match.source
+            taxonomy_level = taxonomic_match.taxonomic_level
+            scientific_name = taxonomic_match.scientific_name
+            common_name  = taxonomic_match.common_name
+            taxonomy_string = taxonomic_match.taxonomy_string
+            # Write source, taxonomy_level, scientific_name, common_name, and taxonomy_string
+            # to the corresponding columns in the current row in df
+            df.loc[i_row, 'source'] = source
+            df.loc[i_row, 'taxonomy_level'] = taxonomy_level
+            df.loc[i_row, 'scientific_name'] = scientific_name
+            df.loc[i_row, 'common_name'] = common_name
+            df.loc[i_row, 'taxonomy_string'] = taxonomy_string
+        # ...if we found a match
+    # ...do we need to map this row?
+# ...for each row
+df.to_csv(remapped_file, index=None, header=True)
+#%% Manual lookup
+if False:
+    #%% You probably want to open the .csv file first
+    from megadetector.utils.path_utils import open_file
+    open_file(output_file)
+    #%%
+    from megadetector.taxonomy_mapping.species_lookup import pop_levels
+    # Use this when an iNat match includes an empty subgenus with the same name as the genus
+    n_levels_to_pop = 0
+    q = 'hirundinidae'
+    taxonomy_preference = 'inat'
+    m = get_preferred_taxonomic_match(q,taxonomy_preference)
+    if n_levels_to_pop > 0:
+        m = pop_levels(m,n_levels_to_pop)
+    # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+    # common_name = eval(m.__dict__['taxonomy_string'])[0][-1][0]; print(common_name); clipboard.copy(common_name)
+    if (m is None) or (len(m.taxonomy_string) == 0):
+        print('No match')
+    else:
+        if m.source != taxonomy_preference:
+            print('\n*** non-preferred match ***\n')
+            # raise ValueError('')
+        print(m.source)
+        print(m.taxonomy_string)
+        import clipboard; clipboard.copy(m.taxonomy_string)