megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (197)
  1. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  2. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  3. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  4. megadetector/classification/aggregate_classifier_probs.py +3 -3
  5. megadetector/classification/analyze_failed_images.py +5 -5
  6. megadetector/classification/cache_batchapi_outputs.py +5 -5
  7. megadetector/classification/create_classification_dataset.py +11 -12
  8. megadetector/classification/crop_detections.py +10 -10
  9. megadetector/classification/csv_to_json.py +8 -8
  10. megadetector/classification/detect_and_crop.py +13 -15
  11. megadetector/classification/efficientnet/model.py +8 -8
  12. megadetector/classification/efficientnet/utils.py +6 -5
  13. megadetector/classification/evaluate_model.py +7 -7
  14. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  15. megadetector/classification/json_to_azcopy_list.py +1 -1
  16. megadetector/classification/json_validator.py +29 -32
  17. megadetector/classification/map_classification_categories.py +9 -9
  18. megadetector/classification/merge_classification_detection_output.py +12 -9
  19. megadetector/classification/prepare_classification_script.py +19 -19
  20. megadetector/classification/prepare_classification_script_mc.py +26 -26
  21. megadetector/classification/run_classifier.py +4 -4
  22. megadetector/classification/save_mislabeled.py +6 -6
  23. megadetector/classification/train_classifier.py +1 -1
  24. megadetector/classification/train_classifier_tf.py +9 -9
  25. megadetector/classification/train_utils.py +10 -10
  26. megadetector/data_management/annotations/annotation_constants.py +1 -2
  27. megadetector/data_management/camtrap_dp_to_coco.py +79 -46
  28. megadetector/data_management/cct_json_utils.py +103 -103
  29. megadetector/data_management/cct_to_md.py +49 -49
  30. megadetector/data_management/cct_to_wi.py +33 -33
  31. megadetector/data_management/coco_to_labelme.py +75 -75
  32. megadetector/data_management/coco_to_yolo.py +210 -193
  33. megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
  34. megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
  35. megadetector/data_management/databases/integrity_check_json_db.py +228 -200
  36. megadetector/data_management/databases/subset_json_db.py +33 -33
  37. megadetector/data_management/generate_crops_from_cct.py +88 -39
  38. megadetector/data_management/get_image_sizes.py +54 -49
  39. megadetector/data_management/labelme_to_coco.py +133 -125
  40. megadetector/data_management/labelme_to_yolo.py +159 -73
  41. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  42. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  43. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  44. megadetector/data_management/lila/download_lila_subset.py +21 -24
  45. megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
  46. megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
  47. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  48. megadetector/data_management/lila/lila_common.py +73 -70
  49. megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
  50. megadetector/data_management/mewc_to_md.py +344 -340
  51. megadetector/data_management/ocr_tools.py +262 -255
  52. megadetector/data_management/read_exif.py +249 -227
  53. megadetector/data_management/remap_coco_categories.py +90 -28
  54. megadetector/data_management/remove_exif.py +81 -21
  55. megadetector/data_management/rename_images.py +187 -187
  56. megadetector/data_management/resize_coco_dataset.py +588 -120
  57. megadetector/data_management/speciesnet_to_md.py +41 -41
  58. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  59. megadetector/data_management/yolo_output_to_md_output.py +248 -122
  60. megadetector/data_management/yolo_to_coco.py +333 -191
  61. megadetector/detection/change_detection.py +832 -0
  62. megadetector/detection/process_video.py +340 -337
  63. megadetector/detection/pytorch_detector.py +358 -278
  64. megadetector/detection/run_detector.py +399 -186
  65. megadetector/detection/run_detector_batch.py +404 -377
  66. megadetector/detection/run_inference_with_yolov5_val.py +340 -327
  67. megadetector/detection/run_tiled_inference.py +257 -249
  68. megadetector/detection/tf_detector.py +24 -24
  69. megadetector/detection/video_utils.py +332 -295
  70. megadetector/postprocessing/add_max_conf.py +19 -11
  71. megadetector/postprocessing/categorize_detections_by_size.py +45 -45
  72. megadetector/postprocessing/classification_postprocessing.py +468 -433
  73. megadetector/postprocessing/combine_batch_outputs.py +23 -23
  74. megadetector/postprocessing/compare_batch_results.py +590 -525
  75. megadetector/postprocessing/convert_output_format.py +106 -102
  76. megadetector/postprocessing/create_crop_folder.py +347 -147
  77. megadetector/postprocessing/detector_calibration.py +173 -168
  78. megadetector/postprocessing/generate_csv_report.py +508 -499
  79. megadetector/postprocessing/load_api_results.py +48 -27
  80. megadetector/postprocessing/md_to_coco.py +133 -102
  81. megadetector/postprocessing/md_to_labelme.py +107 -90
  82. megadetector/postprocessing/md_to_wi.py +40 -40
  83. megadetector/postprocessing/merge_detections.py +92 -114
  84. megadetector/postprocessing/postprocess_batch_results.py +319 -301
  85. megadetector/postprocessing/remap_detection_categories.py +91 -38
  86. megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
  87. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  88. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  89. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
  90. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  91. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  92. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  93. megadetector/postprocessing/validate_batch_results.py +70 -70
  94. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  95. megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
  96. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
  97. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
  98. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  99. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  100. megadetector/taxonomy_mapping/species_lookup.py +156 -74
  101. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  102. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  103. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  104. megadetector/utils/ct_utils.py +1049 -211
  105. megadetector/utils/directory_listing.py +21 -77
  106. megadetector/utils/gpu_test.py +22 -22
  107. megadetector/utils/md_tests.py +632 -529
  108. megadetector/utils/path_utils.py +1520 -431
  109. megadetector/utils/process_utils.py +41 -41
  110. megadetector/utils/split_locations_into_train_val.py +62 -62
  111. megadetector/utils/string_utils.py +148 -27
  112. megadetector/utils/url_utils.py +489 -176
  113. megadetector/utils/wi_utils.py +2658 -2526
  114. megadetector/utils/write_html_image_list.py +137 -137
  115. megadetector/visualization/plot_utils.py +34 -30
  116. megadetector/visualization/render_images_with_thumbnails.py +39 -74
  117. megadetector/visualization/visualization_utils.py +487 -435
  118. megadetector/visualization/visualize_db.py +232 -198
  119. megadetector/visualization/visualize_detector_output.py +82 -76
  120. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
  121. megadetector-10.0.0.dist-info/RECORD +139 -0
  122. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
  123. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  124. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  125. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
  126. megadetector/api/batch_processing/api_core/server.py +0 -294
  127. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  128. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  129. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  130. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  131. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  132. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  133. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  134. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  135. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  136. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  137. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  138. megadetector/api/synchronous/__init__.py +0 -0
  139. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  140. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  141. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  142. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  143. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  144. megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
  145. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  146. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  147. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  148. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  149. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  150. megadetector/data_management/importers/awc_to_json.py +0 -191
  151. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  152. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  153. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  154. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  155. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  156. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  157. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  158. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  159. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  160. megadetector/data_management/importers/ena24_to_json.py +0 -276
  161. megadetector/data_management/importers/filenames_to_json.py +0 -386
  162. megadetector/data_management/importers/helena_to_cct.py +0 -283
  163. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  164. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  165. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  166. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  167. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  168. megadetector/data_management/importers/missouri_to_json.py +0 -490
  169. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  170. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  171. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  172. megadetector/data_management/importers/pc_to_json.py +0 -365
  173. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  174. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  175. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  176. megadetector/data_management/importers/rspb_to_json.py +0 -356
  177. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  178. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  179. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  180. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  181. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  182. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  183. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  184. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  185. megadetector/data_management/importers/ubc_to_json.py +0 -399
  186. megadetector/data_management/importers/umn_to_json.py +0 -507
  187. megadetector/data_management/importers/wellington_to_json.py +0 -263
  188. megadetector/data_management/importers/wi_to_json.py +0 -442
  189. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  190. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  191. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  192. megadetector/utils/azure_utils.py +0 -178
  193. megadetector/utils/sas_blob_utils.py +0 -509
  194. megadetector-5.0.28.dist-info/RECORD +0 -209
  195. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  196. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  197. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -32,27 +32,18 @@ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
 
  taxonomy_urls = {
  'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
- 'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
+ 'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
  }
 
  files_to_unzip = {
- # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
- # 12.2023, this is no longer the case.
- # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
  'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
- 'iNaturalist': ['taxa.csv']
+ 'iNaturalist': ['taxa.csv','VernacularNames-english.csv']
  }
 
- # As of 2020.05.12:
+ # As of 2025.06.24:
  #
- # GBIF: ~777MB zipped, ~1.6GB taxonomy
- # iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
-
- # As of 2023.12.29:
- #
- # GBIF: ~948MB zipped, ~2.2GB taxonomy
- # iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
+ # GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+ # iNat: 71MB zipped, 415MB of relevant content unzipped
 
  os.makedirs(taxonomy_download_dir, exist_ok=True)
  for taxonomy_name in taxonomy_urls:
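
For orientation, a minimal sketch of the download-and-extract step these two dictionaries drive (the cache folder and file names follow the constants above; the use of urllib.request and zipfile here is an illustration, not necessarily how the module itself fetches the archives):

    import os
    import urllib.request
    import zipfile

    taxonomy_download_dir = os.path.expanduser('~/taxonomy')

    taxonomy_urls = {
        'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
        'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
    }

    files_to_unzip = {
        'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
        'iNaturalist': ['taxa.csv', 'VernacularNames-english.csv']
    }

    for taxonomy_name, zip_url in taxonomy_urls.items():
        # Cache each archive in its own subfolder, e.g. ~/taxonomy/GBIF/backbone.zip
        target_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
        os.makedirs(target_dir, exist_ok=True)
        zipfile_path = os.path.join(target_dir, zip_url.split('/')[-1])
        if not os.path.isfile(zipfile_path):
            urllib.request.urlretrieve(zip_url, zipfile_path)
        # Extract only the files the lookup code actually reads
        with zipfile.ZipFile(zipfile_path) as z:
            for fn in files_to_unzip[taxonomy_name]:
                z.extract(fn, path=target_dir)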
@@ -83,7 +74,7 @@ gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
 
  # Initialization function
 
- def initialize_taxonomy_lookup(force_init=False) -> None:
+ def initialize_taxonomy_lookup(force_init=False):
  """
  Initialize this module by doing the following:
 
@@ -92,8 +83,14 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  * Builds a bunch of dictionaries and tables to facilitate lookup
  * Serializes those tables via pickle
  * Skips all of the above if the serialized pickle file already exists
+
+ Args:
+ force_init (bool, optional): force re-download and parsing of the source .zip files,
+ even if the cached .p file already exists
  """
 
+ #%%
+
  global inat_taxonomy,\
  gbif_taxonomy,\
  gbif_common_mapping,\
@@ -109,12 +106,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_scientific_to_taxon_id
 
 
- ## Load serialized taxonomy info if we've already saved it
+ #%% Load serialized taxonomy info if we've already saved it
 
  if (not force_init) and (inat_taxonomy is not None):
  print('Skipping taxonomy re-init')
  return
-
+
  if (not force_init) and (os.path.isfile(serialized_structures_file)):
 
  print(f'De-serializing taxonomy data from {serialized_structures_file}')
@@ -135,18 +132,17 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_vernacular_to_taxon_id,\
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id = structures_to_serialize
-
+
  return
 
 
- ## If we don't have serialized taxonomy info, create it from scratch.
+ #%% Download and unzip taxonomy files
 
- # Download and unzip taxonomy files
  # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
  for taxonomy_name, zip_url in taxonomy_urls.items():
 
  need_to_download = False
-
+
  if force_init:
  need_to_download = True
 
@@ -189,21 +185,44 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
  # ...for each file that we need from this zipfile
 
- # Remove the zipfile
- # os.remove(zipfile_path)
-
  # ...for each taxonomy
 
 
- # Create dataframes from each of the taxonomy files, and the GBIF common
- # name file
+ #%% Create dataframes from each of the taxonomy/vernacular files
 
  # Load iNat taxonomy
  inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
  print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
  inat_taxonomy = pd.read_csv(inat_taxonomy_file)
  inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
- inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
+
+ # Delete columns we won't use. The "taxonID" column is a non-int version of "ID"
+ inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+ # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+ #
+ # https://www.inaturalist.org/taxa/71262
+ #
+ # Convert this column to be integer-valued, using only the last token of the URL
+ inat_taxonomy['parentNameUsageID'] = \
+ inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+ # Rename the "id" column to "taxonID"
+ inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+ assert 'id' not in inat_taxonomy.columns
+ assert 'taxonID' in inat_taxonomy.columns
+
+ # Load iNat common name mapping
+ inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+ inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+ inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+ inat_common_mapping = inat_common_mapping.drop(['language','locality','countryCode',
+ 'source','lexicon','contributor','created'], axis=1)
+ assert 'id' in inat_common_mapping.columns
+ assert 'taxonID' not in inat_common_mapping.columns
+ assert 'vernacularName' in inat_common_mapping.columns
 
  # Load GBIF taxonomy
  gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
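
The parentNameUsageID handling added above is easy to sanity-check in isolation; a small sketch with a made-up two-row frame (the sample URL is the one quoted in the comment, and the None row stands in for a root taxon with no parent):

    import pandas as pd

    # Hypothetical two-row excerpt of the iNat taxa.csv parentNameUsageID column
    df = pd.DataFrame({'parentNameUsageID':
                       ['https://www.inaturalist.org/taxa/71262', None]})

    # Keep only the last token of the URL, treat missing parents as 0,
    # and make the whole column integer-valued
    df['parentNameUsageID'] = \
        df['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)

    print(df['parentNameUsageID'].tolist())  # [71262, 0]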
@@ -211,12 +230,20 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8',on_bad_lines='warn')
  gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
  gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+ gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
 
  # Remove questionable rows from the GBIF taxonomy
  gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
  gbif_taxonomy = gbif_taxonomy.reset_index()
 
- # Load GBIF vernacular name mapping
+ gbif_taxonomy = gbif_taxonomy.drop(['datasetID','acceptedNameUsageID','originalNameUsageID',
+ 'scientificNameAuthorship','nameAccordingTo','namePublishedIn',
+ 'taxonomicStatus','nomenclaturalStatus','taxonRemarks'], axis=1)
+
+ assert 'taxonID' in gbif_taxonomy.columns
+ assert 'scientificName' in gbif_taxonomy.columns
+
+ # Load GBIF common name mapping
  gbif_common_mapping = pd.read_csv(os.path.join(
  taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
  gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
@@ -225,6 +252,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
  gbif_common_mapping = gbif_common_mapping.reset_index()
 
+ gbif_common_mapping = gbif_common_mapping.drop(['language','country','countryCode','sex',
+ 'lifeStage','source'],axis=1)
+
+ assert 'taxonID' in gbif_common_mapping.columns
+ assert 'vernacularName' in gbif_common_mapping.columns
+
 
  # Convert everything to lowercase
 
@@ -235,23 +268,28 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
  gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
  gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+ inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
 
 
- # For each taxonomy table, create a mapping from taxon IDs to rows
+ ##%% For each taxonomy table, create a mapping from taxon IDs to rows
 
  inat_taxon_id_to_row = {}
  gbif_taxon_id_to_row = {}
 
  print('Building iNat taxonID --> row table')
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
- inat_taxon_id_to_row[row['taxonID']] = i_row
+ taxon_id = row['taxonID']
+ assert isinstance(taxon_id, int)
+ inat_taxon_id_to_row[taxon_id] = i_row
 
  print('Building GBIF taxonID --> row table')
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
- gbif_taxon_id_to_row[row['taxonID']] = i_row
+ taxon_id = row['taxonID']
+ assert isinstance(taxon_id, int)
+ gbif_taxon_id_to_row[taxon_id] = i_row
 
 
- # Create name mapping dictionaries
+ ##%% Create name mapping dictionaries
 
  inat_taxon_id_to_vernacular = defaultdict(set)
  inat_vernacular_to_taxon_id = defaultdict(set)
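
Each of these lookup tables follows the same defaultdict(set) pattern, which tolerates duplicate rows and names that legitimately map to more than one taxon ID; a toy illustration with made-up IDs and names:

    from collections import defaultdict

    # Hypothetical (taxonID, scientificName) pairs in the shape iterated below
    rows = [(1001, 'puma concolor'), (1001, 'puma concolor'), (2002, 'canis lupus')]

    taxon_id_to_scientific = defaultdict(set)
    scientific_to_taxon_id = defaultdict(set)

    for taxon_id, scientific_name in rows:
        # add() on a set makes repeated rows harmless, and a name that appears
        # under several taxon IDs simply accumulates all of them
        taxon_id_to_scientific[taxon_id].add(scientific_name)
        scientific_to_taxon_id[scientific_name].add(taxon_id)

    print(scientific_to_taxon_id['puma concolor'])  # {1001}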
@@ -267,32 +305,61 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  # Build iNat dictionaries
 
  print('Building lookup dictionaries for iNat taxonomy')
-
+
+ # iNat Scientific name mapping
+
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
  taxon_id = row['taxonID']
- vernacular_name = row['vernacularName']
- scientific_name = row['scientificName']
-
- if len(vernacular_name) > 0:
- inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
- inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+ assert isinstance(taxon_id,int)
 
+ scientific_name = row['scientificName']
  assert len(scientific_name) > 0
+
  inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
  inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+ # iNat common name mapping
+
+ inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+ for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
+
+ taxon_id = row['id']
+ assert isinstance(taxon_id,int)
+
+ # This should never happen; we will assert() this at the end of the loop
+ if taxon_id not in inat_taxon_id_to_scientific:
+ inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+ continue
+
+ vernacular_name = row['vernacularName']
+
+ assert len(vernacular_name) > 0
+ inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+ inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+ assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
 
- # Build GBIF dictionaries
+
+ ##%% Build GBIF dictionaries
 
  print('Building lookup dictionaries for GBIF taxonomy')
-
+
+ # GBIF scientific name mapping
+
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
  taxon_id = row['taxonID']
+ assert isinstance(taxon_id,int)
 
- # The canonical name is the Latin name; the "scientific name"
- # include the taxonomy name.
+ # The "canonical name" is the Latin name; the "scientific name"
+ # column includes other information. For example:
+ #
+ # "scientificName": Schizophoria impressa (Hall, 1843)
+ # "canonicalName": Schizophoria impressa
+ #
+ # Also see:
  #
  # http://globalnames.org/docs/glossary/
 
@@ -307,12 +374,18 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
  gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+ # GBIF common name mapping
+
+ gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
  for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
 
  taxon_id = row['taxonID']
+ assert isinstance(taxon_id,int)
 
  # Don't include taxon IDs that were removed from the master table
  if taxon_id not in gbif_taxon_id_to_scientific:
+ gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
  continue
 
  vernacular_name = row['vernacularName']
@@ -321,8 +394,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
  gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
 
+ print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+ len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+ len(gbif_common_mapping)
+ ))
 
- # Save everything to file
+
+ ##%% Save everything to file
 
  structures_to_serialize = [
  inat_taxonomy,
@@ -344,7 +422,10 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  if not os.path.isfile(serialized_structures_file):
  with open(serialized_structures_file, 'wb') as p:
  pickle.dump(structures_to_serialize, p)
- print(' done')
+ print('...done')
+
+
+ #%%
 
  # ...def initialize_taxonomy_lookup(...)
 
@@ -412,7 +493,8 @@ def traverse_taxonomy(matching_rownums: Sequence[int],
  while True:
 
  taxon_id = current_row['taxonID']
- vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
+ # sort for determinism
+ vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
  match_details.append((taxon_id, current_row['taxonRank'],
  get_scientific_name_from_row(current_row),
  vernacular_names))
@@ -596,21 +678,21 @@ class TaxonomicMatch:
 
 
  hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
- 'ruffed', 'browed', 'eating', 'striped', 'shanked',
+ 'ruffed', 'browed', 'eating', 'striped', 'shanked',
  'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
  'necked']
 
  def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
  """
- Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+ Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
  and preferences that are specific to our scenario.
-
+
  Args:
  query (str): The common or scientific name we want to look up
  taxonomy_preference (str, optional): 'inat' or 'gbif'
- retry (bool, optional): if the initial lookup fails, should we try heuristic
+ retry (bool, optional): if the initial lookup fails, should we try heuristic
  substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
  Returns:
  TaxonomicMatch: the best taxonomic match, or None
  """
@@ -618,31 +700,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
  m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
  if (len(m.scientific_name) > 0) or (not retry):
  return m
-
+
  for s in hyphenated_terms:
  query = query.replace(' ' + s,'-' + s)
  m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
  return m
-
-
+
+
  def validate_and_convert(data):
  """
  Recursively validates that all elements in the nested structure are only
  tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
  Args:
  data: The nested structure to validate and convert
-
+
  Returns:
  The validated and converted structure
-
+
  Raises:
  TypeError: If an invalid type is encountered
  """
-
- if isinstance(data, np.int64):
+
+ if isinstance(data, np.int64):
  return int(data)
- elif isinstance(data, int) or isinstance(data, str):
+ elif isinstance(data, int) or isinstance(data, str):
  return data
  elif isinstance(data, (list, tuple)):
  # Process lists and tuples recursively
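
A quick illustration of the behavior the validate_and_convert docstring describes; the nested input is made up, and the import path simply mirrors where the function lives in this package:

    import numpy as np

    from megadetector.taxonomy_mapping.species_lookup import validate_and_convert

    # Nested structure mixing lists, tuples, strings, ints, and np.int64
    data = [(np.int64(71262), 'genus'), [np.int64(1), 2, 'species']]

    # Every np.int64 comes back as a plain Python int, so the result can be
    # serialized without numpy-specific types; an unsupported type (e.g. a dict)
    # would raise TypeError instead
    converted = validate_and_convert(data)
    print(converted)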
@@ -654,17 +736,17 @@ def validate_and_convert(data):
 
  # ...def validate_and_convert(...)
 
-
+
  def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
  query = query.lower().strip().replace('_', ' ')
  query = query.replace('unidentified','')
  query = query.replace('unknown','')
  if query.endswith(' sp'):
  query = query.replace(' sp','')
  if query.endswith(' group'):
- query = query.replace(' group','')
-
+ query = query.replace(' group','')
+
  query = query.strip()
 
  # query = 'person'
@@ -686,17 +768,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
  n_inat_matches = len(inat_matches)
  n_gbif_matches = len(gbif_matches)
-
+
  selected_matches = None
-
+
  assert taxonomy_preference in ['gbif','inat'],\
  'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
  if n_inat_matches > 0 and taxonomy_preference == 'inat':
  selected_matches = 'inat'
  elif n_gbif_matches > 0:
  selected_matches = 'gbif'
-
+
  if selected_matches == 'inat':
 
  i_match = 0
@@ -802,7 +884,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
  # Convert np.int64's to ints
  if match is not None:
  match = validate_and_convert(match)
-
+
  taxonomy_string = str(match)
 
  return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
@@ -828,15 +910,15 @@ if False:
  # print(matches)
 
  print_taxonomy_matches(matches,verbose=True)
-
+
  print('\n\n')
-
+
  # Print the taxonomy in the taxonomy spreadsheet format
  assert matches[1]['source'] == 'inat'
  t = str(matches[1]['taxonomy'])
  print(t)
  import clipboard; clipboard.copy(t)
-
+
 
  #%% Directly access the taxonomy tables
 
@@ -848,12 +930,12 @@ if False:
 
  #%% Command-line driver
 
- def main():
+ def main(): # noqa
 
  # Read command line inputs (absolute path)
  parser = argparse.ArgumentParser()
  parser.add_argument('input_file')
-
+
  if len(sys.argv[1:]) == 0:
  parser.print_help()
  parser.exit()
@@ -36,7 +36,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  """
  See module docstring.
  """
-
+
  taxonomy_df = pd.read_csv(csv_path)
 
  graph = nx.DiGraph()
@@ -46,12 +46,12 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_scientific_name_errors = 0
 
  for i_row, row in taxonomy_df.iterrows():
-
+
  ds = row['dataset_name']
  ds_label = row['query']
  scientific_name = row['scientific_name']
  level = row['taxonomy_level']
-
+
  # This used to represent the source of the mapping: iNat, gbif, or manual. We've
  # stopped tracking this, so this is now vestigial.
  id_source = 0 # row['source']
@@ -95,8 +95,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_scientific_name_errors += 1
 
  taxon_child = node
-
- # ...for each row in the taxonomy file
+
+ # ...for each row in the taxonomy file
 
  assert nx.is_directed_acyclic_graph(graph)
 
 
@@ -124,36 +124,36 @@ def check_taxonomy_csv(csv_path: str) -> None:
  print(f'At least one node has unresolved ambiguous parents: {e}')
 
  print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
-
+
  print('num taxon level errors:', num_taxon_level_errors)
  print('num scientific name errors:', num_scientific_name_errors)
 
 
  #%% Command-line driver
-
+
  if __name__ == '__main__':
-
+
  parser = argparse.ArgumentParser()
  parser.add_argument(
  'taxonomy_csv_path',
  help='path to taxonomy CSV file')
-
+
  if len(sys.argv[1:]) == 0:
  parser.print_help()
  parser.exit()
-
+
  args = parser.parse_args()
 
  check_taxonomy_csv(args.taxonomy_csv_path)
 
 
  #%% Interactive driver
-
+
  if False:
-
+
  #%%
-
+
  import os
  csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
  check_taxonomy_csv(csv_path)
-
+
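
For context on the nx.is_directed_acyclic_graph() assertion that check_taxonomy_csv relies on, a minimal sketch of the same check on a toy parent-to-child graph (the (level, name) node naming here is illustrative, not necessarily the checker's actual node type):

    import networkx as nx

    # Toy taxonomy edges, parent -> child
    graph = nx.DiGraph()
    graph.add_edge(('family', 'felidae'), ('genus', 'puma'))
    graph.add_edge(('genus', 'puma'), ('species', 'puma concolor'))

    # The checker asserts this after ingesting every CSV row; a cycle would mean
    # some taxon ends up as its own ancestor
    assert nx.is_directed_acyclic_graph(graph)

    # Introducing a cycle makes the same check fail
    graph.add_edge(('species', 'puma concolor'), ('family', 'felidae'))
    assert not nx.is_directed_acyclic_graph(graph)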
@@ -1,4 +1,4 @@
- """
+ r"""
 
  taxonomy_graph.py
 
@@ -69,7 +69,7 @@ class TaxonNode:
  By default, we support multiple parents for each TaxonNode. See discussion
  in module docstring above.
  """
-
+
  # class variables
  single_parent_only: ClassVar[bool] = False
 
@@ -82,7 +82,7 @@
 
  def __init__(self, level: str, name: str,
  graph: Optional[nx.DiGraph] = None):
-
+
  self.level = level
  self.name = name
  self.graph = graph
@@ -131,7 +131,7 @@ class TaxonNode:
  Args:
  parent: TaxonNode, must be higher in the taxonomical hierarchy
  """
-
+
  assert self.graph is not None
  parents = self.parents
  if TaxonNode.single_parent_only and len(parents) > 0:
@@ -150,7 +150,7 @@
  Args:
  child: TaxonNode, must be lower in the taxonomical hierarchy
  """
-
+
  assert self.graph is not None
  self.graph.add_edge(self, child)
 
@@ -160,7 +160,7 @@
  ds: str, name of dataset
  ds_label: str, name of label used by that dataset
  """
-
+
  self.dataset_labels.add((ds, ds_label))
 
  def get_dataset_labels(self,
@@ -176,7 +176,7 @@
 
  Returns: set of (ds, ds_label) tuples
  """
-
+
  result = self.dataset_labels
  if include_datasets is not None:
  result = set(tup for tup in result if tup[0] in include_datasets)
@@ -199,7 +199,7 @@
 
  Returns: TaxonNode, the LCA if it exists, or None if no LCA exists
  """
-
+
  paths = []
  for node in nodes:
  # get path to root
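
The docstring and the paths list above suggest the general approach: collect each node's path to the root, then take the deepest node shared by all of those paths. A standalone sketch of that idea using a plain child-to-parent dict (an illustration only, not the TaxonNode implementation):

    def path_to_root(parent, node):
        # [node, parent, grandparent, ..., root] using a child -> parent dict
        path = [node]
        while node in parent:
            node = parent[node]
            path.append(node)
        return path

    def lowest_common_ancestor(parent, nodes):
        # Deepest node shared by every node's path to the root, or None
        paths = [path_to_root(parent, n) for n in nodes]
        common = set(paths[0]).intersection(*(set(p) for p in paths[1:]))
        for n in paths[0]:
            if n in common:
                return n
        return None

    parent = {'puma concolor': 'puma', 'puma yagouaroundi': 'puma', 'puma': 'felidae'}
    print(lowest_common_ancestor(parent, ['puma concolor', 'puma yagouaroundi']))  # puma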
@@ -242,7 +242,7 @@ def build_taxonomy_graph(taxonomy_df: pd.DataFrame
  TaxonNode node in the tree that contains the label,
  keys are all lowercase
  """
-
+
  graph = nx.DiGraph()
  taxon_to_node = {} # maps (taxon_level, taxon_name) to a TaxonNode
  label_to_node = {} # maps (dataset_name, dataset_label) to a TaxonNode
@@ -308,7 +308,7 @@ def dag_to_tree(graph: nx.DiGraph,
 
  Returns: nx.DiGraph, a tree-structured graph
  """
-
+
  tree = nx.DiGraph()
  for node in graph.nodes:
  tree.add_node(node)