megadetector 5.0.10-py3-none-any.whl → 5.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
- {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
- megadetector-5.0.11.dist-info/RECORD +5 -0
- megadetector-5.0.11.dist-info/top_level.txt +1 -0
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -439
- api/batch_processing/api_core/server.py +0 -294
- api/batch_processing/api_core/server_api_config.py +0 -98
- api/batch_processing/api_core/server_app_config.py +0 -55
- api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- api/batch_processing/api_core/server_job_status_table.py +0 -152
- api/batch_processing/api_core/server_orchestration.py +0 -360
- api/batch_processing/api_core/server_utils.py +0 -92
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
- api/batch_processing/data_preparation/manage_video_batch.py +0 -327
- api/batch_processing/integration/digiKam/setup.py +0 -6
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
- api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
- api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +0 -64
- api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
- api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
- api/batch_processing/postprocessing/compare_batch_results.py +0 -958
- api/batch_processing/postprocessing/convert_output_format.py +0 -397
- api/batch_processing/postprocessing/load_api_results.py +0 -195
- api/batch_processing/postprocessing/md_to_coco.py +0 -310
- api/batch_processing/postprocessing/md_to_labelme.py +0 -330
- api/batch_processing/postprocessing/merge_detections.py +0 -401
- api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
- api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
- api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
- api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
- api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
- api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
- api/synchronous/api_core/animal_detection_api/config.py +0 -35
- api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
- api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
- api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
- api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
- api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
- api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
- api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
- api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
- api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
- api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
- api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
- api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
- api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
- api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
- api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
- api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
- api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
- api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
- api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
- api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
- api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
- api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +0 -110
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +0 -108
- classification/analyze_failed_images.py +0 -227
- classification/cache_batchapi_outputs.py +0 -198
- classification/create_classification_dataset.py +0 -627
- classification/crop_detections.py +0 -516
- classification/csv_to_json.py +0 -226
- classification/detect_and_crop.py +0 -855
- classification/efficientnet/__init__.py +0 -9
- classification/efficientnet/model.py +0 -415
- classification/efficientnet/utils.py +0 -610
- classification/evaluate_model.py +0 -520
- classification/identify_mislabeled_candidates.py +0 -152
- classification/json_to_azcopy_list.py +0 -63
- classification/json_validator.py +0 -695
- classification/map_classification_categories.py +0 -276
- classification/merge_classification_detection_output.py +0 -506
- classification/prepare_classification_script.py +0 -194
- classification/prepare_classification_script_mc.py +0 -228
- classification/run_classifier.py +0 -286
- classification/save_mislabeled.py +0 -110
- classification/train_classifier.py +0 -825
- classification/train_classifier_tf.py +0 -724
- classification/train_utils.py +0 -322
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +0 -34
- data_management/camtrap_dp_to_coco.py +0 -238
- data_management/cct_json_utils.py +0 -395
- data_management/cct_to_md.py +0 -176
- data_management/cct_to_wi.py +0 -289
- data_management/coco_to_labelme.py +0 -272
- data_management/coco_to_yolo.py +0 -662
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +0 -33
- data_management/databases/combine_coco_camera_traps_files.py +0 -206
- data_management/databases/integrity_check_json_db.py +0 -477
- data_management/databases/subset_json_db.py +0 -115
- data_management/generate_crops_from_cct.py +0 -149
- data_management/get_image_sizes.py +0 -188
- data_management/importers/add_nacti_sizes.py +0 -52
- data_management/importers/add_timestamps_to_icct.py +0 -79
- data_management/importers/animl_results_to_md_results.py +0 -158
- data_management/importers/auckland_doc_test_to_json.py +0 -372
- data_management/importers/auckland_doc_to_json.py +0 -200
- data_management/importers/awc_to_json.py +0 -189
- data_management/importers/bellevue_to_json.py +0 -273
- data_management/importers/cacophony-thermal-importer.py +0 -796
- data_management/importers/carrizo_shrubfree_2018.py +0 -268
- data_management/importers/carrizo_trail_cam_2017.py +0 -287
- data_management/importers/cct_field_adjustments.py +0 -57
- data_management/importers/channel_islands_to_cct.py +0 -913
- data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- data_management/importers/eMammal/eMammal_helpers.py +0 -249
- data_management/importers/eMammal/make_eMammal_json.py +0 -223
- data_management/importers/ena24_to_json.py +0 -275
- data_management/importers/filenames_to_json.py +0 -385
- data_management/importers/helena_to_cct.py +0 -282
- data_management/importers/idaho-camera-traps.py +0 -1407
- data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- data_management/importers/jb_csv_to_json.py +0 -150
- data_management/importers/mcgill_to_json.py +0 -250
- data_management/importers/missouri_to_json.py +0 -489
- data_management/importers/nacti_fieldname_adjustments.py +0 -79
- data_management/importers/noaa_seals_2019.py +0 -181
- data_management/importers/pc_to_json.py +0 -365
- data_management/importers/plot_wni_giraffes.py +0 -123
- data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
- data_management/importers/prepare_zsl_imerit.py +0 -131
- data_management/importers/rspb_to_json.py +0 -356
- data_management/importers/save_the_elephants_survey_A.py +0 -320
- data_management/importers/save_the_elephants_survey_B.py +0 -332
- data_management/importers/snapshot_safari_importer.py +0 -758
- data_management/importers/snapshot_safari_importer_reprise.py +0 -665
- data_management/importers/snapshot_serengeti_lila.py +0 -1067
- data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- data_management/importers/sulross_get_exif.py +0 -65
- data_management/importers/timelapse_csv_set_to_json.py +0 -490
- data_management/importers/ubc_to_json.py +0 -399
- data_management/importers/umn_to_json.py +0 -507
- data_management/importers/wellington_to_json.py +0 -263
- data_management/importers/wi_to_json.py +0 -441
- data_management/importers/zamba_results_to_md_results.py +0 -181
- data_management/labelme_to_coco.py +0 -548
- data_management/labelme_to_yolo.py +0 -272
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +0 -97
- data_management/lila/add_locations_to_nacti.py +0 -147
- data_management/lila/create_lila_blank_set.py +0 -557
- data_management/lila/create_lila_test_set.py +0 -151
- data_management/lila/create_links_to_md_results_files.py +0 -106
- data_management/lila/download_lila_subset.py +0 -177
- data_management/lila/generate_lila_per_image_labels.py +0 -515
- data_management/lila/get_lila_annotation_counts.py +0 -170
- data_management/lila/get_lila_image_counts.py +0 -111
- data_management/lila/lila_common.py +0 -300
- data_management/lila/test_lila_metadata_urls.py +0 -132
- data_management/ocr_tools.py +0 -874
- data_management/read_exif.py +0 -681
- data_management/remap_coco_categories.py +0 -84
- data_management/remove_exif.py +0 -66
- data_management/resize_coco_dataset.py +0 -189
- data_management/wi_download_csv_to_coco.py +0 -246
- data_management/yolo_output_to_md_output.py +0 -441
- data_management/yolo_to_coco.py +0 -676
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/detector_training/model_main_tf2.py +0 -114
- detection/process_video.py +0 -703
- detection/pytorch_detector.py +0 -337
- detection/run_detector.py +0 -779
- detection/run_detector_batch.py +0 -1219
- detection/run_inference_with_yolov5_val.py +0 -917
- detection/run_tiled_inference.py +0 -935
- detection/tf_detector.py +0 -188
- detection/video_utils.py +0 -606
- docs/source/conf.py +0 -43
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +0 -174
- md_utils/ct_utils.py +0 -612
- md_utils/directory_listing.py +0 -246
- md_utils/md_tests.py +0 -968
- md_utils/path_utils.py +0 -1044
- md_utils/process_utils.py +0 -157
- md_utils/sas_blob_utils.py +0 -509
- md_utils/split_locations_into_train_val.py +0 -228
- md_utils/string_utils.py +0 -92
- md_utils/url_utils.py +0 -323
- md_utils/write_html_image_list.py +0 -225
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +0 -293
- md_visualization/render_images_with_thumbnails.py +0 -275
- md_visualization/visualization_utils.py +0 -1537
- md_visualization/visualize_db.py +0 -551
- md_visualization/visualize_detector_output.py +0 -406
- megadetector-5.0.10.dist-info/RECORD +0 -224
- megadetector-5.0.10.dist-info/top_level.txt +0 -8
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
- taxonomy_mapping/map_new_lila_datasets.py +0 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
- taxonomy_mapping/preview_lila_taxonomy.py +0 -591
- taxonomy_mapping/retrieve_sample_image.py +0 -71
- taxonomy_mapping/simple_image_download.py +0 -218
- taxonomy_mapping/species_lookup.py +0 -834
- taxonomy_mapping/taxonomy_csv_checker.py +0 -159
- taxonomy_mapping/taxonomy_graph.py +0 -346
- taxonomy_mapping/validate_lila_category_mappings.py +0 -83
- {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
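Every Python module shipped in 5.0.10 is deleted in this diff, and the new RECORD lists only five files, so the 5.0.11 wheel appears to contain packaging metadata only. The single hunk rendered below is the removed taxonomy_mapping/species_lookup.py (834 lines, matching the manifest entry above).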
@@ -1,834 +0,0 @@
-"""
-
-species_lookup.py
-
-Look up species names (common or scientific) in the GBIF and iNaturalist
-taxonomies.
-
-Run initialize_taxonomy_lookup() before calling any other function.
-
-"""
-
-#%% Constants and imports
-
-import argparse
-import pickle
-import shutil
-import zipfile
-import sys
-import os
-
-from collections import defaultdict
-from itertools import compress
-from tqdm import tqdm
-from typing import Any, Dict, List, Mapping, Sequence, Set
-
-import pandas as pd
-import numpy as np
-
-from md_utils import url_utils
-
-taxonomy_download_dir = os.path.expanduser('~/taxonomy')
-
-taxonomy_urls = {
-    'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
-    'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
-}
-
-files_to_unzip = {
-    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
-    # 12.2023, this is no longer the case.
-    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
-    'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
-    'iNaturalist': ['taxa.csv']
-}
-
-# As of 2020.05.12:
-#
-# GBIF: ~777MB zipped, ~1.6GB taxonomy
-# iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
-
-# As of 2023.12.29:
-#
-# GBIF: ~948MB zipped, ~2.2GB taxonomy
-# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
-
-os.makedirs(taxonomy_download_dir, exist_ok=True)
-for taxonomy_name in taxonomy_urls:
-    taxonomy_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
-    os.makedirs(taxonomy_dir, exist_ok=True)
-
-serialized_structures_file = os.path.join(taxonomy_download_dir,
-                                          'serialized_taxonomies.p')
-
-# These are un-initialized globals that must be initialized by
-# the initialize_taxonomy_lookup() function below.
-inat_taxonomy = None  # : pd.DataFrame
-gbif_taxonomy = None  # : pd.DataFrame
-gbif_common_mapping = None  # : pd.DataFrame
-inat_taxon_id_to_row = None  # : Dict[np.int64, int]
-gbif_taxon_id_to_row = None  # : Dict[np.int64, int]
-inat_taxon_id_to_vernacular = None  # : Dict[np.int64, Set[str]]
-inat_vernacular_to_taxon_id = None  # : Dict[str, np.int64]
-inat_taxon_id_to_scientific = None  # : Dict[np.int64, Set[str]]
-inat_scientific_to_taxon_id = None  # : Dict[str, np.int64]
-gbif_taxon_id_to_vernacular = None  # : Dict[np.int64, Set[str]]
-gbif_vernacular_to_taxon_id = None  # : Dict[str, np.int64]
-gbif_taxon_id_to_scientific = None  # : Dict[np.int64, Set[str]]
-gbif_scientific_to_taxon_id = None  # : Dict[str, np.int64]
-
-
-#%% Functions
-
-# Initialization function
-
-def initialize_taxonomy_lookup(force_init=False) -> None:
-    """
-    Initialize this module by doing the following:
-
-    * Downloads and unzips the current GBIF and iNat taxonomies if necessary
-      (only unzips what's necessary, but does not delete the original zipfiles)
-    * Builds a bunch of dictionaries and tables to facilitate lookup
-    * Serializes those tables via pickle
-    * Skips all of the above if the serialized pickle file already exists
-    """
-
-    global inat_taxonomy,\
-        gbif_taxonomy,\
-        gbif_common_mapping,\
-        inat_taxon_id_to_row,\
-        gbif_taxon_id_to_row,\
-        inat_taxon_id_to_vernacular,\
-        inat_vernacular_to_taxon_id,\
-        inat_taxon_id_to_scientific,\
-        inat_scientific_to_taxon_id,\
-        gbif_taxon_id_to_vernacular,\
-        gbif_vernacular_to_taxon_id,\
-        gbif_taxon_id_to_scientific,\
-        gbif_scientific_to_taxon_id
-
-
-    ## Load serialized taxonomy info if we've already saved it
-
-    if (not force_init) and (inat_taxonomy is not None):
-        print('Skipping taxonomy re-init')
-        return
-
-    if (not force_init) and (os.path.isfile(serialized_structures_file)):
-
-        print(f'De-serializing taxonomy data from {serialized_structures_file}')
-
-        with open(serialized_structures_file, 'rb') as f:
-            structures_to_serialize = pickle.load(f)
-
-        inat_taxonomy,\
-        gbif_taxonomy,\
-        gbif_common_mapping,\
-        inat_taxon_id_to_row,\
-        gbif_taxon_id_to_row,\
-        inat_taxon_id_to_vernacular,\
-        inat_vernacular_to_taxon_id,\
-        inat_taxon_id_to_scientific,\
-        inat_scientific_to_taxon_id,\
-        gbif_taxon_id_to_vernacular,\
-        gbif_vernacular_to_taxon_id,\
-        gbif_taxon_id_to_scientific,\
-        gbif_scientific_to_taxon_id = structures_to_serialize
-
-        return
-
-
-    ## If we don't have serialized taxonomy info, create it from scratch.
-
-    # Download and unzip taxonomy files
-    # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
-    for taxonomy_name, zip_url in taxonomy_urls.items():
-
-        need_to_download = False
-
-        if force_init:
-            need_to_download = True
-
-        # Don't download the zipfile if we've already unzipped what we need
-        for fn in files_to_unzip[taxonomy_name]:
-            target_file = os.path.join(
-                taxonomy_download_dir, taxonomy_name, fn)
-            if not os.path.isfile(target_file):
-                need_to_download = True
-                break
-        if not need_to_download:
-            print(f'Bypassing download of {taxonomy_name}, all files available')
-            continue
-
-        zipfile_path = os.path.join(
-            taxonomy_download_dir, zip_url.split('/')[-1])
-
-        # Bypasses download if the file exists already (unless force_init is set)
-        url_utils.download_url(
-            zip_url, os.path.join(zipfile_path),
-            progress_updater=url_utils.DownloadProgressBar(),
-            verbose=True,force_download=force_init)
-
-        # Unzip the files we need
-        files_we_need = files_to_unzip[taxonomy_name]
-
-        with zipfile.ZipFile(zipfile_path, 'r') as zipH:
-
-            for fn in files_we_need:
-                print('Unzipping {}'.format(fn))
-                target_file = os.path.join(
-                    taxonomy_download_dir, taxonomy_name, os.path.basename(fn))
-
-                if (not force_init) and (os.path.isfile(target_file)):
-                    print(f'Bypassing unzip of {target_file}, file exists')
-                else:
-                    os.makedirs(os.path.basename(target_file),exist_ok=True)
-                    with zipH.open(fn) as zf, open(target_file, 'wb') as f:
-                        shutil.copyfileobj(zf, f)
-
-            # ...for each file that we need from this zipfile
-
-        # Remove the zipfile
-        # os.remove(zipfile_path)
-
-    # ...for each taxonomy
-
-
-    # Create dataframes from each of the taxonomy files, and the GBIF common
-    # name file
-
-    # Load iNat taxonomy
-    inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
-    print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
-    inat_taxonomy = pd.read_csv(inat_taxonomy_file)
-    inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
-    inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
-
-    # Load GBIF taxonomy
-    gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
-    print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
-    gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
-    gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
-    gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
-
-    # Remove questionable rows from the GBIF taxonomy
-    gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
-    gbif_taxonomy = gbif_taxonomy.reset_index()
-
-    # Load GBIF vernacular name mapping
-    gbif_common_mapping = pd.read_csv(os.path.join(
-        taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
-    gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
-
-    # Only keep English mappings
-    gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
-    gbif_common_mapping = gbif_common_mapping.reset_index()
-
-
-    # Convert everything to lowercase
-
-    def convert_df_to_lowercase(df):
-        df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
-        return df
-
-    inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
-    gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
-    gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
-
-
-    # For each taxonomy table, create a mapping from taxon IDs to rows
-
-    inat_taxon_id_to_row = {}
-    gbif_taxon_id_to_row = {}
-
-    print('Building iNat taxonID --> row table')
-    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-        inat_taxon_id_to_row[row['taxonID']] = i_row
-
-    print('Building GBIF taxonID --> row table')
-    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-        gbif_taxon_id_to_row[row['taxonID']] = i_row
-
-
-    # Create name mapping dictionaries
-
-    inat_taxon_id_to_vernacular = defaultdict(set)
-    inat_vernacular_to_taxon_id = defaultdict(set)
-    inat_taxon_id_to_scientific = defaultdict(set)
-    inat_scientific_to_taxon_id = defaultdict(set)
-
-    gbif_taxon_id_to_vernacular = defaultdict(set)
-    gbif_vernacular_to_taxon_id = defaultdict(set)
-    gbif_taxon_id_to_scientific = defaultdict(set)
-    gbif_scientific_to_taxon_id = defaultdict(set)
-
-
-    # Build iNat dictionaries
-
-    print('Building lookup dictionaries for iNat taxonomy')
-
-    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-
-        taxon_id = row['taxonID']
-        vernacular_name = row['vernacularName']
-        scientific_name = row['scientificName']
-
-        if len(vernacular_name) > 0:
-            inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-            inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
-
-        assert len(scientific_name) > 0
-        inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
-        inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
-
-
-    # Build GBIF dictionaries
-
-    print('Building lookup dictionaries for GBIF taxonomy')
-
-    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-
-        taxon_id = row['taxonID']
-
-        # The canonical name is the Latin name; the "scientific name"
-        # include the taxonomy name.
-        #
-        # http://globalnames.org/docs/glossary/
-
-        scientific_name = row['canonicalName']
-
-        # This only seems to happen for really esoteric species that aren't
-        # likely to apply to our problems, but doing this for completeness.
-        if len(scientific_name) == 0:
-            scientific_name = row['scientificName']
-
-        assert len(scientific_name) > 0
-        gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
-        gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
-
-    for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
-
-        taxon_id = row['taxonID']
-
-        # Don't include taxon IDs that were removed from the master table
-        if taxon_id not in gbif_taxon_id_to_scientific:
-            continue
-
-        vernacular_name = row['vernacularName']
-
-        assert len(vernacular_name) > 0
-        gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-        gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
-
-
-    # Save everything to file
-
-    structures_to_serialize = [
-        inat_taxonomy,
-        gbif_taxonomy,
-        gbif_common_mapping,
-        inat_taxon_id_to_row,
-        gbif_taxon_id_to_row,
-        inat_taxon_id_to_vernacular,
-        inat_vernacular_to_taxon_id,
-        inat_taxon_id_to_scientific,
-        inat_scientific_to_taxon_id,
-        gbif_taxon_id_to_vernacular,
-        gbif_vernacular_to_taxon_id,
-        gbif_taxon_id_to_scientific,
-        gbif_scientific_to_taxon_id
-    ]
-
-    print('Serializing to {}...'.format(serialized_structures_file), end='')
-    if not os.path.isfile(serialized_structures_file):
-        with open(serialized_structures_file, 'wb') as p:
-            pickle.dump(structures_to_serialize, p)
-    print(' done')
-
-# ...def initialize_taxonomy_lookup(...)
-
-
-def get_scientific_name_from_row(r):
-    """
-    r: a dataframe that's really a row in one of our taxonomy tables
-    """
-
-    if 'canonicalName' in r and len(r['canonicalName']) > 0:
-        scientific_name = r['canonicalName']
-    else:
-        scientific_name = r['scientificName']
-    return scientific_name
-
-
-def taxonomy_row_to_string(r):
-    """
-    r: a dataframe that's really a row in one of our taxonomy tables
-    """
-
-    if 'vernacularName' in r:
-        common_string = ' (' + r['vernacularName'] + ')'
-    else:
-        common_string = ''
-    scientific_name = get_scientific_name_from_row(r)
-
-    return r['taxonRank'] + ' ' + scientific_name + common_string
-
-
-def traverse_taxonomy(matching_rownums: Sequence[int],
-                      taxon_id_to_row: Mapping[str, int],
-                      taxon_id_to_vernacular: Mapping[str, Set[str]],
-                      taxonomy: pd.DataFrame,
-                      source_name: str,
-                      query: str) -> List[Dict[str, Any]]:
-    """
-    Given a data frame that's a set of rows from one of our taxonomy tables,
-    walks the taxonomy hierarchy from each row to put together a full taxonomy
-    tree, then prunes redundant trees (e.g. if we had separate hits for a
-    species and the genus that contains that species.)
-
-    Returns a list of dicts:
-    [
-      {
-        'source': 'inat' or 'gbif',
-        'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names])]
-      },
-      ...
-    ]
-    """
-
-    # list of dicts: {'source': source_name, 'taxonomy': match_details}
-    matching_trees: List[Dict[str, Any]] = []
-
-    # i_match = 0
-    for i_match in matching_rownums:
-
-        # list of (taxon_id, taxonRank, scientific name, [vernacular names])
-        # corresponding to an exact match and its parents
-        match_details = []
-        current_row = taxonomy.iloc[i_match]
-
-        # Walk taxonomy hierarchy
-        while True:
-
-            taxon_id = current_row['taxonID']
-            vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
-            match_details.append((taxon_id, current_row['taxonRank'],
-                                  get_scientific_name_from_row(current_row),
-                                  vernacular_names))
-
-            if np.isnan(current_row['parentNameUsageID']):
-                break
-            parent_taxon_id = current_row['parentNameUsageID'].astype('int64')
-            if parent_taxon_id not in taxon_id_to_row:
-                # This can happen because we remove questionable rows from the
-                # GBIF taxonomy
-                # print(f'Warning: no row exists for parent_taxon_id {parent_taxon_id},' + \
-                #       f'child taxon_id: {taxon_id}, query: {query}')
-                break
-            i_parent_row = taxon_id_to_row[parent_taxon_id]
-            current_row = taxonomy.iloc[i_parent_row]
-
-            # The GBIF taxonomy contains unranked entries
-            if current_row['taxonRank'] == 'unranked':
-                break
-
-        # ...while there is taxonomy left to walk
-
-        matching_trees.append({'source': source_name,
-                               'taxonomy': match_details})
-
-    # ...for each match
-
-    # Remove redundant matches
-    b_valid_tree = [True] * len(matching_rownums)
-    # i_tree_a = 0; tree_a = matching_trees[i_tree_a]
-    for i_tree_a, tree_a in enumerate(matching_trees):
-
-        tree_a_primary_taxon_id = tree_a['taxonomy'][0][0]
-
-        # i_tree_b = 1; tree_b = matching_trees[i_tree_b]
-        for i_tree_b, tree_b in enumerate(matching_trees):
-
-            if i_tree_a == i_tree_b:
-                continue
-
-            # If tree a's primary taxon ID is inside tree b, discard tree a
-            #
-            # taxonomy_level_b = tree_b['taxonomy'][0]
-            for taxonomy_level_b in tree_b['taxonomy']:
-                if tree_a_primary_taxon_id == taxonomy_level_b[0]:
-                    b_valid_tree[i_tree_a] = False
-                    break
-
-            # ...for each level in taxonomy B
-
-        # ...for each tree (inner)
-
-    # ...for each tree (outer)
-
-    matching_trees = list(compress(matching_trees, b_valid_tree))
-    return matching_trees
-
-# ...def traverse_taxonomy()
-
-
-def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
-    """
-    Main entry point: get taxonomic matches from both taxonomies for [query],
-    which may be a scientific or common name.
-    """
-    query = query.strip().lower()
-    # print("Finding taxonomy information for: {0}".format(query))
-
-    inat_taxon_ids = set()
-    if query in inat_scientific_to_taxon_id:
-        inat_taxon_ids |= inat_scientific_to_taxon_id[query]
-    if query in inat_vernacular_to_taxon_id:
-        inat_taxon_ids |= inat_vernacular_to_taxon_id[query]
-
-    # In GBIF, some queries hit for both common and scientific, make sure we end
-    # up with unique inputs
-    gbif_taxon_ids = set()
-    if query in gbif_scientific_to_taxon_id:
-        gbif_taxon_ids |= gbif_scientific_to_taxon_id[query]
-    if query in gbif_vernacular_to_taxon_id:
-        gbif_taxon_ids |= gbif_vernacular_to_taxon_id[query]
-
-    # If the species is not found in either taxonomy, return None
-    if (len(inat_taxon_ids) == 0) and (len(gbif_taxon_ids) == 0):
-        return []
-
-    # Both GBIF and iNat have a 1-to-1 mapping between taxon_id and row number
-    inat_row_indices = [inat_taxon_id_to_row[i] for i in inat_taxon_ids]
-    gbif_row_indices = [gbif_taxon_id_to_row[i] for i in gbif_taxon_ids]
-
-    # Walk both taxonomies
-    inat_matching_trees = traverse_taxonomy(
-        inat_row_indices, inat_taxon_id_to_row, inat_taxon_id_to_vernacular,
-        inat_taxonomy, 'inat', query)
-    gbif_matching_trees = traverse_taxonomy(
-        gbif_row_indices, gbif_taxon_id_to_row, gbif_taxon_id_to_vernacular,
-        gbif_taxonomy, 'gbif', query)
-
-    return gbif_matching_trees + inat_matching_trees
-
-# ...def get_taxonomic_info()
-
-
-def print_taxonomy_matches(matches, verbose=False):
-    """
-    Console-friendly printing function to make nicely-indentend trees
-    """
-
-    # m = matches[0]
-    for m in matches:
-
-        source = m['source']
-
-        # For example: [(9761484, 'species', 'anas platyrhynchos')]
-        for i_taxonomy_level in range(0, len(m['taxonomy'])):
-            taxonomy_level_info = m['taxonomy'][i_taxonomy_level]
-            taxonomy_level = taxonomy_level_info[1]
-            name = taxonomy_level_info[2]
-            common = taxonomy_level_info[3]
-
-            if i_taxonomy_level > 0:
-                print('\t',end='')
-
-            print('{} {} ({})'.format(taxonomy_level, name, common), end='')
-
-            if i_taxonomy_level == 0:
-                print(' ({})'.format(source))
-            else:
-                print('')
-
-            if not verbose:
-                break
-
-        # ...for each taxonomy level
-
-    # ...for each match
-
-# ...def print_taxonomy_matches()
-
-
-#%% Taxonomy functions that make subjective judgements
-
-import unicodedata
-import re
-
-def slugify(value: Any, allow_unicode: bool = False) -> str:
-    """
-    From:
-    https://github.com/django/django/blob/master/django/utils/text.py
-
-    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
-    Remove characters that aren't alphanumerics, underscores, or hyphens.
-    Convert to lowercase. Also strip leading and trailing whitespace.
-    """
-
-    value = str(value)
-    value = unicodedata.normalize('NFKC', value)
-    if not allow_unicode:
-        value = value.encode('ascii', 'ignore').decode('ascii')
-    value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
-    return re.sub(r'[-\s]+', '-', value)
-
-
-class TaxonomicMatch:
-
-    def __init__(self, scientific_name, common_name, taxonomic_level, source,
-                 taxonomy_string, match):
-        self.scientific_name = scientific_name
-        self.common_name = common_name
-        self.taxonomic_level = taxonomic_level
-        self.source = source
-        self.taxonomy_string = taxonomy_string
-        self.match = match
-
-    def __repr__(self):
-        return ('TaxonomicMatch('
-                f'scientific_name={self.scientific_name}, '
-                f'common_name={self.common_name}, '
-                f'taxonomic_level={self.taxonomic_level}, '
-                f'source={self.source})')
-
-
-hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
-                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
-                    'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
-                    'necked']
-
-def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
-    """
-    Wrapper for species_lookup.py, but expressing a variety of heuristics and
-    preferences that are specific to our scenario.
-    """
-
-    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
-    if (len(m.scientific_name) > 0) or (not retry):
-        return m
-
-    for s in hyphenated_terms:
-        query = query.replace(' ' + s,'-' + s)
-    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
-    return m
-
-
-def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
-    query = query.lower().strip().replace('_', ' ')
-    query = query.replace('unidentified','')
-    query = query.replace('unknown','')
-    if query.endswith(' sp'):
-        query = query.replace(' sp','')
-    if query.endswith(' group'):
-        query = query.replace(' group','')
-
-    query = query.strip()
-
-    # query = 'person'
-    matches = get_taxonomic_info(query)
-
-    # Do we have an iNat match?
-    inat_matches = [m for m in matches if m['source'] == 'inat']
-    gbif_matches = [m for m in matches if m['source'] == 'gbif']
-
-    # print_taxonomy_matches(inat_matches, verbose=True)
-    # print_taxonomy_matches(gbif_matches, verbose=True)
-
-    scientific_name = ''
-    common_name = ''
-    taxonomic_level = ''
-    match = ''
-    source = ''
-    taxonomy_string = ''
-
-    n_inat_matches = len(inat_matches)
-    n_gbif_matches = len(gbif_matches)
-
-    selected_matches = None
-
-    assert taxonomy_preference in ['gbif','inat'],\
-        'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
-    if n_inat_matches > 0 and taxonomy_preference == 'inat':
-        selected_matches = 'inat'
-    elif n_gbif_matches > 0:
-        selected_matches = 'gbif'
-
-    if selected_matches == 'inat':
-
-        i_match = 0
-
-        if len(inat_matches) > 1:
-            # print('Warning: multiple iNat matches for {}'.format(query))
-
-            # Prefer chordates... most of the names that aren't what we want
-            # are esoteric insects, like a moth called "cheetah"
-            #
-            # If we can't find a chordate, just take the first match.
-            #
-            # i_test_match = 0
-            for i_test_match, match in enumerate(inat_matches):
-                found_vertebrate = False
-                taxonomy = match['taxonomy']
-                for taxonomy_level in taxonomy:
-                    taxon_rank = taxonomy_level[1]
-                    scientific_name = taxonomy_level[2]
-                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
-                        i_match = i_test_match
-                        found_vertebrate = True
-                        break
-                if found_vertebrate:
-                    break
-
-        match = inat_matches[i_match]['taxonomy']
-
-        # This is (taxonID, taxonLevel, scientific, [list of common])
-        lowest_level = match[0]
-        taxonomic_level = lowest_level[1]
-        scientific_name = lowest_level[2]
-        assert len(scientific_name) > 0
-        common_names = lowest_level[3]
-        if len(common_names) > 1:
-            # print(f'Warning: multiple iNat common names for {query}')
-            # Default to returning the query
-            if query in common_names:
-                common_name = query
-            else:
-                common_name = common_names[0]
-        elif len(common_names) > 0:
-            common_name = common_names[0]
-
-        # print(f'Matched iNat {query} to {scientific_name},{common_name}')
-        source = 'inat'
-
-    # ...if we had iNat matches
-
-    # If we either prefer GBIF or didn't have iNat matches
-    #
-    # Code is deliberately redundant here; I'm expecting some subtleties in how
-    # handle GBIF and iNat.
-    elif selected_matches == 'gbif':
-
-        i_match = 0
-
-        if len(gbif_matches) > 1:
-            # print('Warning: multiple GBIF matches for {}'.format(query))
-
-            # Prefer chordates... most of the names that aren't what we want
-            # are esoteric insects, like a moth called "cheetah"
-            #
-            # If we can't find a chordate, just take the first match.
-            #
-            # i_test_match = 0
-            for i_test_match, match in enumerate(gbif_matches):
-                found_vertebrate = False
-                taxonomy = match['taxonomy']
-                for taxonomy_level in taxonomy:
-                    taxon_rank = taxonomy_level[1]
-                    scientific_name = taxonomy_level[2]
-                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
-                        i_match = i_test_match
-                        found_vertebrate = True
-                        break
-                if found_vertebrate:
-                    break
-
-        match = gbif_matches[i_match]['taxonomy']
-
-        # This is (taxonID, taxonLevel, scientific, [list of common])
-        lowest_level = match[0]
-        taxonomic_level = lowest_level[1]
-        scientific_name = lowest_level[2]
-        assert len(scientific_name) > 0
-
-        common_names = lowest_level[3]
-        if len(common_names) > 1:
-            # print(f'Warning: multiple GBIF common names for {query}')
-            # Default to returning the query
-            if query in common_names:
-                common_name = query
-            else:
-                common_name = common_names[0]
-        elif len(common_names) > 0:
-            common_name = common_names[0]
-
-        source = 'gbif'
-
-    # ...if we needed to look in the GBIF taxonomy
-
-    taxonomy_string = str(match)
-
-    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
-                          taxonomy_string, match),query
-
-# ...def _get_preferred_taxonomic_match()
-
-
-#%% Interactive drivers and debug
-
-if False:
-
-    #%% Initialization
-
-    initialize_taxonomy_lookup()
-
-
-    #%% Taxonomic lookup
-
-    # query = 'lion'
-    query = 'xenoperdix'
-    matches = get_taxonomic_info(query)
-    # print(matches)
-
-    print_taxonomy_matches(matches,verbose=True)
-
-    print('\n\n')
-
-    # Print the taxonomy in the taxonomy spreadsheet format
-    assert matches[1]['source'] == 'inat'
-    t = str(matches[1]['taxonomy'])
-    print(t)
-    import clipboard; clipboard.copy(t)
-
-
-    #%% Directly access the taxonomy tables
-
-    taxon_ids = gbif_vernacular_to_taxon_id['lion']
-    for taxon_id in taxon_ids:
-        i_row = gbif_taxon_id_to_row[taxon_id]
-        print(taxonomy_row_to_string(gbif_taxonomy.iloc[i_row]))
-
-
-#%% Command-line driver
-
-def main():
-
-    # Read command line inputs (absolute path)
-    parser = argparse.ArgumentParser()
-    parser.add_argument('input_file')
-
-    if len(sys.argv[1:]) == 0:
-        parser.print_help()
-        parser.exit()
-
-    args = parser.parse_args()
-    input_file = args.input_file
-
-    initialize_taxonomy_lookup()
-
-    # Read the tokens from the input text file
-    with open(input_file, 'r') as f:
-        tokens = f.readlines()
-
-    # Loop through each token and get scientific name
-    for token in tokens:
-        token = token.strip().lower()
-        matches = get_taxonomic_info(token)
-        print_taxonomy_matches(matches)
-
-if __name__ == '__main__':
-    main()
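For context on what this removal takes away: the module's docstring says to run initialize_taxonomy_lookup() before calling any other function. Below is a minimal usage sketch assembled from that docstring and the module's own interactive driver; it assumes a megadetector==5.0.10 environment, where taxonomy_mapping was still a top-level package, and the query string is an arbitrary illustration, not taken from the diff.

# Minimal usage sketch for the removed species_lookup module
# (assumes megadetector==5.0.10; 'mallard' is a hypothetical example query)
from taxonomy_mapping.species_lookup import (
    initialize_taxonomy_lookup,
    get_taxonomic_info,
    print_taxonomy_matches,
)

# Required first: downloads and unzips the GBIF and iNat taxonomies into
# ~/taxonomy on first use, then pickles the lookup tables for later runs
initialize_taxonomy_lookup()

# Accepts a common or scientific name; returns a list of dicts of the form
# {'source': 'inat' or 'gbif',
#  'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names]), ...]}
matches = get_taxonomic_info('mallard')
print_taxonomy_matches(matches, verbose=True)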