megadetector 10.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/__init__.py +0 -0
- megadetector/api/__init__.py +0 -0
- megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
- megadetector/classification/__init__.py +0 -0
- megadetector/classification/aggregate_classifier_probs.py +108 -0
- megadetector/classification/analyze_failed_images.py +227 -0
- megadetector/classification/cache_batchapi_outputs.py +198 -0
- megadetector/classification/create_classification_dataset.py +626 -0
- megadetector/classification/crop_detections.py +516 -0
- megadetector/classification/csv_to_json.py +226 -0
- megadetector/classification/detect_and_crop.py +853 -0
- megadetector/classification/efficientnet/__init__.py +9 -0
- megadetector/classification/efficientnet/model.py +415 -0
- megadetector/classification/efficientnet/utils.py +608 -0
- megadetector/classification/evaluate_model.py +520 -0
- megadetector/classification/identify_mislabeled_candidates.py +152 -0
- megadetector/classification/json_to_azcopy_list.py +63 -0
- megadetector/classification/json_validator.py +696 -0
- megadetector/classification/map_classification_categories.py +276 -0
- megadetector/classification/merge_classification_detection_output.py +509 -0
- megadetector/classification/prepare_classification_script.py +194 -0
- megadetector/classification/prepare_classification_script_mc.py +228 -0
- megadetector/classification/run_classifier.py +287 -0
- megadetector/classification/save_mislabeled.py +110 -0
- megadetector/classification/train_classifier.py +827 -0
- megadetector/classification/train_classifier_tf.py +725 -0
- megadetector/classification/train_utils.py +323 -0
- megadetector/data_management/__init__.py +0 -0
- megadetector/data_management/animl_to_md.py +161 -0
- megadetector/data_management/annotations/__init__.py +0 -0
- megadetector/data_management/annotations/annotation_constants.py +33 -0
- megadetector/data_management/camtrap_dp_to_coco.py +270 -0
- megadetector/data_management/cct_json_utils.py +566 -0
- megadetector/data_management/cct_to_md.py +184 -0
- megadetector/data_management/cct_to_wi.py +293 -0
- megadetector/data_management/coco_to_labelme.py +284 -0
- megadetector/data_management/coco_to_yolo.py +701 -0
- megadetector/data_management/databases/__init__.py +0 -0
- megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
- megadetector/data_management/databases/integrity_check_json_db.py +563 -0
- megadetector/data_management/databases/subset_json_db.py +195 -0
- megadetector/data_management/generate_crops_from_cct.py +200 -0
- megadetector/data_management/get_image_sizes.py +164 -0
- megadetector/data_management/labelme_to_coco.py +559 -0
- megadetector/data_management/labelme_to_yolo.py +349 -0
- megadetector/data_management/lila/__init__.py +0 -0
- megadetector/data_management/lila/create_lila_blank_set.py +556 -0
- megadetector/data_management/lila/create_lila_test_set.py +192 -0
- megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
- megadetector/data_management/lila/download_lila_subset.py +182 -0
- megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
- megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
- megadetector/data_management/lila/get_lila_image_counts.py +112 -0
- megadetector/data_management/lila/lila_common.py +319 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
- megadetector/data_management/mewc_to_md.py +344 -0
- megadetector/data_management/ocr_tools.py +873 -0
- megadetector/data_management/read_exif.py +964 -0
- megadetector/data_management/remap_coco_categories.py +195 -0
- megadetector/data_management/remove_exif.py +156 -0
- megadetector/data_management/rename_images.py +194 -0
- megadetector/data_management/resize_coco_dataset.py +665 -0
- megadetector/data_management/speciesnet_to_md.py +41 -0
- megadetector/data_management/wi_download_csv_to_coco.py +247 -0
- megadetector/data_management/yolo_output_to_md_output.py +594 -0
- megadetector/data_management/yolo_to_coco.py +984 -0
- megadetector/data_management/zamba_to_md.py +188 -0
- megadetector/detection/__init__.py +0 -0
- megadetector/detection/change_detection.py +840 -0
- megadetector/detection/process_video.py +479 -0
- megadetector/detection/pytorch_detector.py +1451 -0
- megadetector/detection/run_detector.py +1267 -0
- megadetector/detection/run_detector_batch.py +2172 -0
- megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
- megadetector/detection/run_md_and_speciesnet.py +1604 -0
- megadetector/detection/run_tiled_inference.py +1044 -0
- megadetector/detection/tf_detector.py +209 -0
- megadetector/detection/video_utils.py +1379 -0
- megadetector/postprocessing/__init__.py +0 -0
- megadetector/postprocessing/add_max_conf.py +72 -0
- megadetector/postprocessing/categorize_detections_by_size.py +166 -0
- megadetector/postprocessing/classification_postprocessing.py +1943 -0
- megadetector/postprocessing/combine_batch_outputs.py +249 -0
- megadetector/postprocessing/compare_batch_results.py +2110 -0
- megadetector/postprocessing/convert_output_format.py +403 -0
- megadetector/postprocessing/create_crop_folder.py +629 -0
- megadetector/postprocessing/detector_calibration.py +570 -0
- megadetector/postprocessing/generate_csv_report.py +522 -0
- megadetector/postprocessing/load_api_results.py +223 -0
- megadetector/postprocessing/md_to_coco.py +428 -0
- megadetector/postprocessing/md_to_labelme.py +351 -0
- megadetector/postprocessing/md_to_wi.py +41 -0
- megadetector/postprocessing/merge_detections.py +392 -0
- megadetector/postprocessing/postprocess_batch_results.py +2140 -0
- megadetector/postprocessing/remap_detection_categories.py +226 -0
- megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
- megadetector/postprocessing/separate_detections_into_folders.py +795 -0
- megadetector/postprocessing/subset_json_detector_output.py +964 -0
- megadetector/postprocessing/top_folders_to_bottom.py +238 -0
- megadetector/postprocessing/validate_batch_results.py +332 -0
- megadetector/taxonomy_mapping/__init__.py +0 -0
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
- megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
- megadetector/taxonomy_mapping/simple_image_download.py +231 -0
- megadetector/taxonomy_mapping/species_lookup.py +1008 -0
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
- megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
- megadetector/tests/__init__.py +0 -0
- megadetector/tests/test_nms_synthetic.py +335 -0
- megadetector/utils/__init__.py +0 -0
- megadetector/utils/ct_utils.py +1857 -0
- megadetector/utils/directory_listing.py +199 -0
- megadetector/utils/extract_frames_from_video.py +307 -0
- megadetector/utils/gpu_test.py +125 -0
- megadetector/utils/md_tests.py +2072 -0
- megadetector/utils/path_utils.py +2872 -0
- megadetector/utils/process_utils.py +172 -0
- megadetector/utils/split_locations_into_train_val.py +237 -0
- megadetector/utils/string_utils.py +234 -0
- megadetector/utils/url_utils.py +825 -0
- megadetector/utils/wi_platform_utils.py +968 -0
- megadetector/utils/wi_taxonomy_utils.py +1766 -0
- megadetector/utils/write_html_image_list.py +239 -0
- megadetector/visualization/__init__.py +0 -0
- megadetector/visualization/plot_utils.py +309 -0
- megadetector/visualization/render_images_with_thumbnails.py +243 -0
- megadetector/visualization/visualization_utils.py +1973 -0
- megadetector/visualization/visualize_db.py +630 -0
- megadetector/visualization/visualize_detector_output.py +498 -0
- megadetector/visualization/visualize_video_output.py +705 -0
- megadetector-10.0.15.dist-info/METADATA +115 -0
- megadetector-10.0.15.dist-info/RECORD +147 -0
- megadetector-10.0.15.dist-info/WHEEL +5 -0
- megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
- megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1008 @@
"""

species_lookup.py

Look up species names (common or scientific) in the GBIF and iNaturalist
taxonomies.

Run initialize_taxonomy_lookup() before calling any other function.

"""

#%% Constants and imports

import argparse
import pickle
import shutil
import zipfile
import sys
import os

from collections import defaultdict
from itertools import compress
from tqdm import tqdm
from typing import Any, Dict, List, Mapping, Sequence, Set, Tuple

import pandas as pd
import numpy as np

from megadetector.utils import url_utils

taxonomy_download_dir = os.path.expanduser('~/taxonomy')

taxonomy_urls = {
    'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
    'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
}

files_to_unzip = {
    'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
    'iNaturalist': ['taxa.csv', 'VernacularNames-english.csv']
}

# As of 2025.06.24:
#
# GBIF: 950MB zipped, 2.3GB of relevant content unzipped
# iNat: 71MB zipped, 415MB of relevant content unzipped

os.makedirs(taxonomy_download_dir, exist_ok=True)
for taxonomy_name in taxonomy_urls:
    taxonomy_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
    os.makedirs(taxonomy_dir, exist_ok=True)

serialized_structures_file = os.path.join(taxonomy_download_dir,
                                          'serialized_taxonomies.p')

# These are uninitialized globals that are populated by
# initialize_taxonomy_lookup() below.
inat_taxonomy = None                 # : pd.DataFrame
gbif_taxonomy = None                 # : pd.DataFrame
gbif_common_mapping = None           # : pd.DataFrame
inat_taxon_id_to_row = None          # : Dict[np.int64, int]
gbif_taxon_id_to_row = None          # : Dict[np.int64, int]
inat_taxon_id_to_vernacular = None   # : Dict[np.int64, Set[str]]
inat_vernacular_to_taxon_id = None   # : Dict[str, np.int64]
inat_taxon_id_to_scientific = None   # : Dict[np.int64, Set[str]]
inat_scientific_to_taxon_id = None   # : Dict[str, np.int64]
gbif_taxon_id_to_vernacular = None   # : Dict[np.int64, Set[str]]
gbif_vernacular_to_taxon_id = None   # : Dict[str, np.int64]
gbif_taxon_id_to_scientific = None   # : Dict[np.int64, Set[str]]
gbif_scientific_to_taxon_id = None   # : Dict[str, np.int64]


#%% Functions

# Initialization function

def initialize_taxonomy_lookup(force_init=False):
    """
    Initialize this module by doing the following:

    * Downloads and unzips the current GBIF and iNat taxonomies if necessary
      (only unzips what's necessary, but does not delete the original zipfiles)
    * Builds a bunch of dictionaries and tables to facilitate lookup
    * Serializes those tables via pickle
    * Skips all of the above if the serialized pickle file already exists

    Args:
        force_init (bool, optional): force re-download and parsing of the source .zip files,
            even if the cached .p file already exists
    """

    #%%

    global inat_taxonomy,\
        gbif_taxonomy,\
        gbif_common_mapping,\
        inat_taxon_id_to_row,\
        gbif_taxon_id_to_row,\
        inat_taxon_id_to_vernacular,\
        inat_vernacular_to_taxon_id,\
        inat_taxon_id_to_scientific,\
        inat_scientific_to_taxon_id,\
        gbif_taxon_id_to_vernacular,\
        gbif_vernacular_to_taxon_id,\
        gbif_taxon_id_to_scientific,\
        gbif_scientific_to_taxon_id


    #%% Load serialized taxonomy info if we've already saved it

    if (not force_init) and (inat_taxonomy is not None):
        print('Skipping taxonomy re-init')
        return

    if (not force_init) and (os.path.isfile(serialized_structures_file)):

        print(f'De-serializing taxonomy data from {serialized_structures_file}')

        with open(serialized_structures_file, 'rb') as f:
            structures_to_serialize = pickle.load(f)

        inat_taxonomy,\
        gbif_taxonomy,\
        gbif_common_mapping,\
        inat_taxon_id_to_row,\
        gbif_taxon_id_to_row,\
        inat_taxon_id_to_vernacular,\
        inat_vernacular_to_taxon_id,\
        inat_taxon_id_to_scientific,\
        inat_scientific_to_taxon_id,\
        gbif_taxon_id_to_vernacular,\
        gbif_vernacular_to_taxon_id,\
        gbif_taxon_id_to_scientific,\
        gbif_scientific_to_taxon_id = structures_to_serialize

        return


    #%% Download and unzip taxonomy files

    # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
    for taxonomy_name, zip_url in taxonomy_urls.items():

        need_to_download = False

        if force_init:
            need_to_download = True

        # Don't download the zipfile if we've already unzipped what we need
        for fn in files_to_unzip[taxonomy_name]:
            target_file = os.path.join(
                taxonomy_download_dir, taxonomy_name, fn)
            if not os.path.isfile(target_file):
                need_to_download = True
                break
        if not need_to_download:
            print(f'Bypassing download of {taxonomy_name}, all files available')
            continue

        zipfile_path = os.path.join(
            taxonomy_download_dir, zip_url.split('/')[-1])

        # Bypasses download if the file exists already (unless force_init is set)
        url_utils.download_url(
            zip_url, zipfile_path,
            progress_updater=url_utils.DownloadProgressBar(),
            verbose=True, force_download=force_init)

        # Unzip the files we need
        files_we_need = files_to_unzip[taxonomy_name]

        with zipfile.ZipFile(zipfile_path, 'r') as zipH:

            for fn in files_we_need:
                print('Unzipping {}'.format(fn))
                target_file = os.path.join(
                    taxonomy_download_dir, taxonomy_name, os.path.basename(fn))

                if (not force_init) and (os.path.isfile(target_file)):
                    print(f'Bypassing unzip of {target_file}, file exists')
                else:
                    # Make sure the containing folder exists before extracting
                    os.makedirs(os.path.dirname(target_file), exist_ok=True)
                    with zipH.open(fn) as zf, open(target_file, 'wb') as f:
                        shutil.copyfileobj(zf, f)

            # ...for each file that we need from this zipfile

    # ...for each taxonomy


    #%% Create dataframes from each of the taxonomy/vernacular files

    # Load iNat taxonomy
    inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
    print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
    inat_taxonomy = pd.read_csv(inat_taxonomy_file)
    inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()

    # Delete columns we won't use. The "taxonID" column is a non-int version of "id".
    inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)

    # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
    #
    # https://www.inaturalist.org/taxa/71262
    #
    # Convert this column to be integer-valued, using only the last token of the URL
    inat_taxonomy['parentNameUsageID'] = \
        inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)

    # Rename the "id" column to "taxonID"
    inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})

    assert 'id' not in inat_taxonomy.columns
    assert 'taxonID' in inat_taxonomy.columns

    # Load iNat common name mapping
    inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
    inat_common_mapping = pd.read_csv(inat_common_mapping_file)
    inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()

    inat_common_mapping = inat_common_mapping.drop(['language', 'locality', 'countryCode',
                                                    'source', 'lexicon', 'contributor', 'created'], axis=1)
    assert 'id' in inat_common_mapping.columns
    assert 'taxonID' not in inat_common_mapping.columns
    assert 'vernacularName' in inat_common_mapping.columns

    # Load GBIF taxonomy
    gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
    print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
    gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8', on_bad_lines='warn')
    gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
    gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
    gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)

    # Remove questionable rows from the GBIF taxonomy
    gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
    gbif_taxonomy = gbif_taxonomy.reset_index()

    gbif_taxonomy = gbif_taxonomy.drop(['datasetID', 'acceptedNameUsageID', 'originalNameUsageID',
                                        'scientificNameAuthorship', 'nameAccordingTo', 'namePublishedIn',
                                        'taxonomicStatus', 'nomenclaturalStatus', 'taxonRemarks'], axis=1)

    assert 'taxonID' in gbif_taxonomy.columns
    assert 'scientificName' in gbif_taxonomy.columns

    # Load GBIF common name mapping
    gbif_common_mapping = pd.read_csv(os.path.join(
        taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
    gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()

    # Only keep English mappings
    gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
    gbif_common_mapping = gbif_common_mapping.reset_index()

    gbif_common_mapping = gbif_common_mapping.drop(['language', 'country', 'countryCode', 'sex',
                                                    'lifeStage', 'source'], axis=1)

    assert 'taxonID' in gbif_common_mapping.columns
    assert 'vernacularName' in gbif_common_mapping.columns


    # Convert everything to lowercase

    def convert_df_to_lowercase(df):
        df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
        return df

    inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
    gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
    gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
    inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)


    ##%% For each taxonomy table, create a mapping from taxon IDs to rows

    inat_taxon_id_to_row = {}
    gbif_taxon_id_to_row = {}

    print('Building iNat taxonID --> row table')
    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)
        inat_taxon_id_to_row[taxon_id] = i_row

    print('Building GBIF taxonID --> row table')
    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)
        gbif_taxon_id_to_row[taxon_id] = i_row


    ##%% Create name mapping dictionaries

    inat_taxon_id_to_vernacular = defaultdict(set)
    inat_vernacular_to_taxon_id = defaultdict(set)
    inat_taxon_id_to_scientific = defaultdict(set)
    inat_scientific_to_taxon_id = defaultdict(set)

    gbif_taxon_id_to_vernacular = defaultdict(set)
    gbif_vernacular_to_taxon_id = defaultdict(set)
    gbif_taxon_id_to_scientific = defaultdict(set)
    gbif_scientific_to_taxon_id = defaultdict(set)


    # Build iNat dictionaries

    print('Building lookup dictionaries for iNat taxonomy')

    # iNat scientific name mapping

    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):

        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)

        scientific_name = row['scientificName']
        assert len(scientific_name) > 0

        inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
        inat_scientific_to_taxon_id[scientific_name].add(taxon_id)

    # iNat common name mapping

    inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()

    for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):

        taxon_id = row['id']
        assert isinstance(taxon_id, int)

        # This should never happen; we assert() this at the end of the loop
        if taxon_id not in inat_taxon_id_to_scientific:
            inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
            continue

        vernacular_name = row['vernacularName']

        assert len(vernacular_name) > 0
        inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
        inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)

    assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0


    ##%% Build GBIF dictionaries

    print('Building lookup dictionaries for GBIF taxonomy')

    # GBIF scientific name mapping

    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):

        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)

        # The "canonical name" is the Latin name; the "scientific name"
        # column includes other information. For example:
        #
        # "scientificName": Schizophoria impressa (Hall, 1843)
        # "canonicalName": Schizophoria impressa
        #
        # Also see:
        #
        # http://globalnames.org/docs/glossary/

        scientific_name = row['canonicalName']

        # This only seems to happen for really esoteric species that aren't
        # likely to apply to our problems, but doing this for completeness.
        if len(scientific_name) == 0:
            scientific_name = row['scientificName']

        assert len(scientific_name) > 0
        gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
        gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)

    # GBIF common name mapping

    gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()

    for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):

        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)

        # Don't include taxon IDs that were removed from the master table
        if taxon_id not in gbif_taxon_id_to_scientific:
            gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
            continue

        vernacular_name = row['vernacularName']

        assert len(vernacular_name) > 0
        gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
        gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)

    print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
        len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
        len(gbif_common_mapping)
    ))


    ##%% Save everything to file

    structures_to_serialize = [
        inat_taxonomy,
        gbif_taxonomy,
        gbif_common_mapping,
        inat_taxon_id_to_row,
        gbif_taxon_id_to_row,
        inat_taxon_id_to_vernacular,
        inat_vernacular_to_taxon_id,
        inat_taxon_id_to_scientific,
        inat_scientific_to_taxon_id,
        gbif_taxon_id_to_vernacular,
        gbif_vernacular_to_taxon_id,
        gbif_taxon_id_to_scientific,
        gbif_scientific_to_taxon_id
    ]

    print('Serializing to {}...'.format(serialized_structures_file), end='')
    if force_init or (not os.path.isfile(serialized_structures_file)):
        with open(serialized_structures_file, 'wb') as p:
            pickle.dump(structures_to_serialize, p)
    print('...done')


    #%%

# ...def initialize_taxonomy_lookup(...)
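
# Cache behavior in practice (a sketch; the first call needs network access
# and a few GB of disk under ~/taxonomy, per the size notes above):
#
#   initialize_taxonomy_lookup()                 # download + parse + pickle (slow, once)
#   initialize_taxonomy_lookup()                 # no-op, tables already in memory
#   initialize_taxonomy_lookup(force_init=True)  # re-download and rebuild everything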


def get_scientific_name_from_row(r):
    """
    r: a pandas Series, i.e. one row from one of our taxonomy tables
    """

    if 'canonicalName' in r and len(r['canonicalName']) > 0:
        scientific_name = r['canonicalName']
    else:
        scientific_name = r['scientificName']
    return scientific_name


def taxonomy_row_to_string(r):
    """
    r: a pandas Series, i.e. one row from one of our taxonomy tables
    """

    if 'vernacularName' in r:
        common_string = ' (' + r['vernacularName'] + ')'
    else:
        common_string = ''
    scientific_name = get_scientific_name_from_row(r)

    return r['taxonRank'] + ' ' + scientific_name + common_string


def traverse_taxonomy(matching_rownums: Sequence[int],
                      taxon_id_to_row: Mapping[int, int],
                      taxon_id_to_vernacular: Mapping[int, Set[str]],
                      taxonomy: pd.DataFrame,
                      source_name: str,
                      query: str) -> List[Dict[str, Any]]:
    """
    Given a list of row indices into one of our taxonomy tables, walks the
    taxonomy hierarchy from each row to put together a full taxonomy tree,
    then prunes redundant trees (e.g. if we had separate hits for a species
    and the genus that contains that species).

    Returns a list of dicts:
    [
      {
        'source': 'inat' or 'gbif',
        'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names])]
      },
      ...
    ]
    """

    # list of dicts: {'source': source_name, 'taxonomy': match_details}
    matching_trees: List[Dict[str, Any]] = []

    # i_match = 0
    for i_match in matching_rownums:

        # list of (taxon_id, taxonRank, scientific name, [vernacular names])
        # corresponding to an exact match and its parents
        match_details = []
        current_row = taxonomy.iloc[i_match]

        # Walk taxonomy hierarchy
        while True:

            taxon_id = current_row['taxonID']
            # sort for determinism
            vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
            match_details.append((taxon_id, current_row['taxonRank'],
                                  get_scientific_name_from_row(current_row),
                                  vernacular_names))

            if np.isnan(current_row['parentNameUsageID']):
                break
            parent_taxon_id = current_row['parentNameUsageID'].astype('int64')
            if parent_taxon_id not in taxon_id_to_row:
                # This can happen because we remove questionable rows from the
                # GBIF taxonomy
                # print(f'Warning: no row exists for parent_taxon_id {parent_taxon_id},' + \
                #       f'child taxon_id: {taxon_id}, query: {query}')
                break
            i_parent_row = taxon_id_to_row[parent_taxon_id]
            current_row = taxonomy.iloc[i_parent_row]

            # The GBIF taxonomy contains unranked entries
            if current_row['taxonRank'] == 'unranked':
                break

        # ...while there is taxonomy left to walk

        matching_trees.append({'source': source_name,
                               'taxonomy': match_details})

    # ...for each match

    # Remove redundant matches
    b_valid_tree = [True] * len(matching_rownums)
    # i_tree_a = 0; tree_a = matching_trees[i_tree_a]
    for i_tree_a, tree_a in enumerate(matching_trees):

        tree_a_primary_taxon_id = tree_a['taxonomy'][0][0]

        # i_tree_b = 1; tree_b = matching_trees[i_tree_b]
        for i_tree_b, tree_b in enumerate(matching_trees):

            if i_tree_a == i_tree_b:
                continue

            # If tree a's primary taxon ID is inside tree b, discard tree a
            #
            # taxonomy_level_b = tree_b['taxonomy'][0]
            for taxonomy_level_b in tree_b['taxonomy']:
                if tree_a_primary_taxon_id == taxonomy_level_b[0]:
                    b_valid_tree[i_tree_a] = False
                    break

            # ...for each level in taxonomy B

        # ...for each tree (inner)

    # ...for each tree (outer)

    matching_trees = list(compress(matching_trees, b_valid_tree))
    return matching_trees

# ...def traverse_taxonomy()
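
# Shape of the return value, illustrated with a hypothetical two-level walk
# (the IDs and common names below are made up for illustration, not taken
# from a real taxonomy release):
#
#   [
#     {
#       'source': 'inat',
#       'taxonomy': [
#           (12345, 'species', 'anas platyrhynchos', ['mallard']),
#           (12340, 'genus', 'anas', ['dabbling ducks'])
#       ]
#     }
#   ]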


def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
    """
    Main entry point: get taxonomic matches from both taxonomies for [query],
    which may be a scientific or common name.
    """

    query = query.strip().lower()
    # print("Finding taxonomy information for: {0}".format(query))

    inat_taxon_ids = set()
    if query in inat_scientific_to_taxon_id:
        inat_taxon_ids |= inat_scientific_to_taxon_id[query]
    if query in inat_vernacular_to_taxon_id:
        inat_taxon_ids |= inat_vernacular_to_taxon_id[query]

    # In GBIF, some queries hit for both common and scientific names; make
    # sure we end up with unique inputs
    gbif_taxon_ids = set()
    if query in gbif_scientific_to_taxon_id:
        gbif_taxon_ids |= gbif_scientific_to_taxon_id[query]
    if query in gbif_vernacular_to_taxon_id:
        gbif_taxon_ids |= gbif_vernacular_to_taxon_id[query]

    # If the query is not found in either taxonomy, return an empty list
    if (len(inat_taxon_ids) == 0) and (len(gbif_taxon_ids) == 0):
        return []

    # Both GBIF and iNat have a 1-to-1 mapping between taxon_id and row number
    inat_row_indices = [inat_taxon_id_to_row[i] for i in inat_taxon_ids]
    gbif_row_indices = [gbif_taxon_id_to_row[i] for i in gbif_taxon_ids]

    # Walk both taxonomies
    inat_matching_trees = traverse_taxonomy(
        inat_row_indices, inat_taxon_id_to_row, inat_taxon_id_to_vernacular,
        inat_taxonomy, 'inat', query)
    gbif_matching_trees = traverse_taxonomy(
        gbif_row_indices, gbif_taxon_id_to_row, gbif_taxon_id_to_vernacular,
        gbif_taxonomy, 'gbif', query)

    return gbif_matching_trees + inat_matching_trees

# ...def get_taxonomic_info()
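
# Typical call pattern (a sketch; assumes initialize_taxonomy_lookup() has
# already run). A query that misses both taxonomies yields an empty list,
# so callers can branch on truthiness:
#
#   matches = get_taxonomic_info('mallard')
#   if not matches:
#       print('no match')
#   else:
#       print_taxonomy_matches(matches)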


def print_taxonomy_matches(matches, verbose=False):
    """
    Console-friendly printing function to make nicely-indented trees
    """

    # m = matches[0]
    for m in matches:

        source = m['source']

        # For example: [(9761484, 'species', 'anas platyrhynchos')]
        for i_taxonomy_level in range(0, len(m['taxonomy'])):
            taxonomy_level_info = m['taxonomy'][i_taxonomy_level]
            taxonomy_level = taxonomy_level_info[1]
            name = taxonomy_level_info[2]
            common = taxonomy_level_info[3]

            if i_taxonomy_level > 0:
                print('\t', end='')

            print('{} {} ({})'.format(taxonomy_level, name, common), end='')

            if i_taxonomy_level == 0:
                print(' ({})'.format(source))
            else:
                print('')

            if not verbose:
                break

        # ...for each taxonomy level

    # ...for each match

# ...def print_taxonomy_matches()


#%% Taxonomy functions that make subjective judgements

import unicodedata
import re

def slugify(value: Any, allow_unicode: bool = False) -> str:
    """
    From:
    https://github.com/django/django/blob/master/django/utils/text.py

    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """

    value = str(value)
    value = unicodedata.normalize('NFKC', value)
    if not allow_unicode:
        value = value.encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
    return re.sub(r'[-\s]+', '-', value)
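
# Behavior sketch; both results follow directly from the two regex
# substitutions above:
#
#   slugify('Black-backed Jackal')  ->  'black-backed-jackal'
#   slugify("Grevy's zebra")        ->  'grevys-zebra'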


class TaxonomicMatch:

    def __init__(self, scientific_name, common_name, taxonomic_level, source,
                 taxonomy_string, match):
        self.scientific_name = scientific_name
        self.common_name = common_name
        self.taxonomic_level = taxonomic_level
        self.source = source
        self.taxonomy_string = taxonomy_string
        self.match = match

    def __repr__(self):
        return ('TaxonomicMatch('
                f'scientific_name={self.scientific_name}, '
                f'common_name={self.common_name}, '
                f'taxonomic_level={self.taxonomic_level}, '
                f'source={self.source})')


hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
                    'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
                    'necked']

def pop_levels(m, n_levels=1):
    """
    Remove [n_levels] levels from the bottom of the TaxonomicMatch object m,
    typically used to remove silly subgenera.
    """

    v = eval(m.taxonomy_string)
    assert v[0][1] == m.taxonomic_level
    assert v[0][2] == m.scientific_name
    popped_v = v[n_levels:]
    taxonomic_level = popped_v[0][1]
    scientific_name = popped_v[0][2]
    common_name = popped_v[0][3]
    if len(common_name) == 0:
        common_name = ''
    else:
        common_name = common_name[0]
    taxonomy_string = str(popped_v)
    source = m.source
    return TaxonomicMatch(scientific_name=scientific_name,
                          common_name=common_name,
                          taxonomic_level=taxonomic_level,
                          source=source,
                          taxonomy_string=taxonomy_string,
                          match=None)

# ...def pop_levels(...)


def get_preferred_taxonomic_match(query: str, taxonomy_preference='inat', retry=True) -> TaxonomicMatch:
    """
    Wrapper for _get_preferred_taxonomic_match(), expressing a variety of
    heuristics and preferences that are specific to our scenario.

    Args:
        query (str): the common or scientific name we want to look up
        taxonomy_preference (str, optional): 'inat' or 'gbif'
        retry (bool, optional): if the initial lookup fails, should we try heuristic
            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?

    Returns:
        TaxonomicMatch: the best taxonomic match; if nothing matched, the
        match's fields are empty strings rather than None
    """

    m, query = _get_preferred_taxonomic_match(query=query, taxonomy_preference=taxonomy_preference)
    if (len(m.scientific_name) > 0) or (not retry):
        return m

    for s in hyphenated_terms:
        query = query.replace(' ' + s, '-' + s)
    m, query = _get_preferred_taxonomic_match(query=query, taxonomy_preference=taxonomy_preference)

    if (len(m.scientific_name) > 0) or (not retry):
        return m

    query = query.replace(' species', '')
    query = query.replace(' order', '')
    query = query.replace(' genus', '')
    query = query.replace(' family', '')
    query = query.replace(' subfamily', '')
    m, query = _get_preferred_taxonomic_match(query=query, taxonomy_preference=taxonomy_preference)

    return m
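
# Retry heuristics in action (hypothetical queries; whether they actually
# match depends on the taxonomy versions you downloaded):
#
#   get_preferred_taxonomic_match('white throated monitor')
#       # retried as 'white-throated monitor' via hyphenated_terms
#   get_preferred_taxonomic_match('canidae family')
#       # retried as 'canidae' after stripping the rank word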


def validate_and_convert(data):
    """
    Recursively validates that all elements in the nested structure are only
    tuples, lists, ints, strings, or np.int64, and converts np.int64 to int.

    Args:
        data: the nested structure to validate and convert

    Returns:
        The validated and converted structure

    Raises:
        TypeError: if an invalid type is encountered
    """

    if isinstance(data, np.int64):
        return int(data)
    elif isinstance(data, (int, str)):
        return data
    elif isinstance(data, (list, tuple)):
        # Process lists and tuples recursively
        container_type = type(data)
        return container_type(validate_and_convert(item) for item in data)
    else:
        raise TypeError(f'Invalid type encountered: {type(data).__name__}. '
                        'Only int, str, np.int64, list, and tuple are allowed.')

# ...def validate_and_convert(...)
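
# Round-trip sketch: container types are preserved and np.int64 leaves become
# plain ints, so str() of the result is stable across numpy versions:
#
#   validate_and_convert((np.int64(42), 'species', ['mallard']))
#       # -> (42, 'species', ['mallard'])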


def _get_preferred_taxonomic_match(query: str, taxonomy_preference='inat') -> Tuple[TaxonomicMatch, str]:

    query = query.lower().strip().replace('_', ' ')
    query = query.replace('unidentified', '')
    query = query.replace('unknown', '')
    if query.endswith(' sp'):
        query = query.replace(' sp', '')
    if query.endswith(' group'):
        query = query.replace(' group', '')

    query = query.strip()

    # query = 'person'
    matches = get_taxonomic_info(query)

    # Do we have an iNat match?
    inat_matches = [m for m in matches if m['source'] == 'inat']
    gbif_matches = [m for m in matches if m['source'] == 'gbif']

    # print_taxonomy_matches(inat_matches, verbose=True)
    # print_taxonomy_matches(gbif_matches, verbose=True)

    scientific_name = ''
    common_name = ''
    taxonomic_level = ''
    match = ''
    source = ''
    taxonomy_string = ''

    n_inat_matches = len(inat_matches)
    n_gbif_matches = len(gbif_matches)

    selected_matches = None

    assert taxonomy_preference in ['gbif', 'inat'],\
        'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)

    if n_inat_matches > 0 and taxonomy_preference == 'inat':
        selected_matches = 'inat'
    elif n_gbif_matches > 0:
        selected_matches = 'gbif'

    if selected_matches == 'inat':

        i_match = 0

        if len(inat_matches) > 1:
            # print('Warning: multiple iNat matches for {}'.format(query))

            # Prefer chordates... most of the names that aren't what we want
            # are esoteric insects, like a moth called "cheetah".
            #
            # If we can't find a chordate, just take the first match.
            #
            # i_test_match = 0
            for i_test_match, match in enumerate(inat_matches):
                found_vertebrate = False
                taxonomy = match['taxonomy']
                for taxonomy_level in taxonomy:
                    taxon_rank = taxonomy_level[1]
                    scientific_name = taxonomy_level[2]
                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
                        i_match = i_test_match
                        found_vertebrate = True
                        break
                if found_vertebrate:
                    break

        match = inat_matches[i_match]['taxonomy']

        # This is (taxonID, taxonLevel, scientific, [list of common])
        lowest_level = match[0]
        taxonomic_level = lowest_level[1]
        scientific_name = lowest_level[2]
        assert len(scientific_name) > 0
        common_names = lowest_level[3]
        if len(common_names) > 1:
            # print(f'Warning: multiple iNat common names for {query}')
            # Default to returning the query
            if query in common_names:
                common_name = query
            else:
                common_name = common_names[0]
        elif len(common_names) > 0:
            common_name = common_names[0]

        # print(f'Matched iNat {query} to {scientific_name},{common_name}')
        source = 'inat'

    # ...if we had iNat matches

    # If we either prefer GBIF or didn't have iNat matches
    #
    # Code is deliberately redundant here; I'm expecting some subtleties in
    # how we handle GBIF and iNat.
    elif selected_matches == 'gbif':

        i_match = 0

        if len(gbif_matches) > 1:
            # print('Warning: multiple GBIF matches for {}'.format(query))

            # Prefer chordates... most of the names that aren't what we want
            # are esoteric insects, like a moth called "cheetah".
            #
            # If we can't find a chordate, just take the first match.
            #
            # i_test_match = 0
            for i_test_match, match in enumerate(gbif_matches):
                found_vertebrate = False
                taxonomy = match['taxonomy']
                for taxonomy_level in taxonomy:
                    taxon_rank = taxonomy_level[1]
                    scientific_name = taxonomy_level[2]
                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
                        i_match = i_test_match
                        found_vertebrate = True
                        break
                if found_vertebrate:
                    break

        match = gbif_matches[i_match]['taxonomy']

        # This is (taxonID, taxonLevel, scientific, [list of common])
        lowest_level = match[0]
        taxonomic_level = lowest_level[1]
        scientific_name = lowest_level[2]
        assert len(scientific_name) > 0

        common_names = lowest_level[3]
        if len(common_names) > 1:
            # print(f'Warning: multiple GBIF common names for {query}')
            # Default to returning the query
            if query in common_names:
                common_name = query
            else:
                common_name = common_names[0]
        elif len(common_names) > 0:
            common_name = common_names[0]

        source = 'gbif'

    # ...if we needed to look in the GBIF taxonomy

    # Convert np.int64's to ints
    if match is not None:
        match = validate_and_convert(match)

    taxonomy_string = str(match)

    m = TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
                       taxonomy_string, match)

    if (m.taxonomic_level == 'subgenus' and
            match[1][1] == 'genus' and
            match[1][2] == m.scientific_name):
        print('Removing redundant subgenus {}'.format(scientific_name))
        m = pop_levels(m, 1)

    return m, query

# ...def _get_preferred_taxonomic_match()


#%% Interactive drivers and debug

if False:

    #%% Initialization

    initialize_taxonomy_lookup()


    #%% Taxonomic lookup

    # query = 'lion'
    query = 'xenoperdix'
    matches = get_taxonomic_info(query)
    # print(matches)

    print_taxonomy_matches(matches, verbose=True)

    print('\n\n')

    # Print the taxonomy in the taxonomy spreadsheet format
    assert matches[1]['source'] == 'inat'
    t = str(matches[1]['taxonomy'])
    print(t)
    import clipboard; clipboard.copy(t)


    #%% Directly access the taxonomy tables

    taxon_ids = gbif_vernacular_to_taxon_id['lion']
    for taxon_id in taxon_ids:
        i_row = gbif_taxon_id_to_row[taxon_id]
        print(taxonomy_row_to_string(gbif_taxonomy.iloc[i_row]))


#%% Command-line driver

def main():  # noqa

    # Read command-line inputs (absolute path)
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file')

    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()
    input_file = args.input_file

    initialize_taxonomy_lookup()

    # Read the tokens from the input text file
    with open(input_file, 'r') as f:
        tokens = f.readlines()

    # Loop through each token and get the scientific name
    for token in tokens:
        token = token.strip().lower()
        matches = get_taxonomic_info(token)
        print_taxonomy_matches(matches)

if __name__ == '__main__':
    main()