megadetector 10.0.13 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +702 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +528 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +187 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +663 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +876 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2159 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1494 -0
  81. megadetector/detection/run_tiled_inference.py +1038 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1752 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2077 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +224 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2832 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1759 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1940 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +479 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.13.dist-info/METADATA +134 -0
  144. megadetector-10.0.13.dist-info/RECORD +147 -0
  145. megadetector-10.0.13.dist-info/WHEEL +5 -0
  146. megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.13.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1008 @@
+ """
+
+ species_lookup.py
+
+ Look up species names (common or scientific) in the GBIF and iNaturalist
+ taxonomies.
+
+ Run initialize_taxonomy_lookup() before calling any other function.
+
+ """
+
+ #%% Constants and imports
+
+ import argparse
+ import pickle
+ import shutil
+ import zipfile
+ import sys
+ import os
+
+ from collections import defaultdict
+ from itertools import compress
+ from tqdm import tqdm
+ from typing import Any, Dict, List, Mapping, Sequence, Set
+
+ import pandas as pd
+ import numpy as np
+
+ from megadetector.utils import url_utils
+
+ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
+
+ taxonomy_urls = {
+     'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
+     'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
+ }
+
+ files_to_unzip = {
+     'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
+     'iNaturalist': ['taxa.csv', 'VernacularNames-english.csv']
+ }
+
+ # As of 2025.06.24:
+ #
+ # GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+ # iNat: 71MB zipped, 415MB of relevant content unzipped
+
+ os.makedirs(taxonomy_download_dir, exist_ok=True)
+ for taxonomy_name in taxonomy_urls:
+     taxonomy_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
+     os.makedirs(taxonomy_dir, exist_ok=True)
+
+ serialized_structures_file = os.path.join(taxonomy_download_dir,
+                                           'serialized_taxonomies.p')
+
+ # These are un-initialized globals that must be initialized by
+ # the initialize_taxonomy_lookup() function below.
+ inat_taxonomy = None # : pd.DataFrame
+ gbif_taxonomy = None # : pd.DataFrame
+ gbif_common_mapping = None # : pd.DataFrame
+ inat_taxon_id_to_row = None # : Dict[np.int64, int]
+ gbif_taxon_id_to_row = None # : Dict[np.int64, int]
+ inat_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
+ inat_vernacular_to_taxon_id = None # : Dict[str, Set[np.int64]]
+ inat_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
+ inat_scientific_to_taxon_id = None # : Dict[str, Set[np.int64]]
+ gbif_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
+ gbif_vernacular_to_taxon_id = None # : Dict[str, Set[np.int64]]
+ gbif_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
+ gbif_scientific_to_taxon_id = None # : Dict[str, Set[np.int64]]
+
+
+ #%% Functions
+
+ # Initialization function
+
+ def initialize_taxonomy_lookup(force_init=False):
+     """
+     Initialize this module by doing the following:
+
+     * Downloads and unzips the current GBIF and iNat taxonomies if necessary
+       (only unzips what's necessary, but does not delete the original zipfiles)
+     * Builds a bunch of dictionaries and tables to facilitate lookup
+     * Serializes those tables via pickle
+     * Skips all of the above if the serialized pickle file already exists
+
+     Args:
+         force_init (bool, optional): force re-download and parsing of the source .zip files,
+             even if the cached .p file already exists
+     """
+
+     #%%
+
+     global inat_taxonomy,\
+         gbif_taxonomy,\
+         gbif_common_mapping,\
+         inat_taxon_id_to_row,\
+         gbif_taxon_id_to_row,\
+         inat_taxon_id_to_vernacular,\
+         inat_vernacular_to_taxon_id,\
+         inat_taxon_id_to_scientific,\
+         inat_scientific_to_taxon_id,\
+         gbif_taxon_id_to_vernacular,\
+         gbif_vernacular_to_taxon_id,\
+         gbif_taxon_id_to_scientific,\
+         gbif_scientific_to_taxon_id
+
+
+     #%% Load serialized taxonomy info if we've already saved it
+
+     if (not force_init) and (inat_taxonomy is not None):
+         print('Skipping taxonomy re-init')
+         return
+
+     if (not force_init) and (os.path.isfile(serialized_structures_file)):
+
+         print(f'De-serializing taxonomy data from {serialized_structures_file}')
+
+         with open(serialized_structures_file, 'rb') as f:
+             structures_to_serialize = pickle.load(f)
+
+         inat_taxonomy,\
+             gbif_taxonomy,\
+             gbif_common_mapping,\
+             inat_taxon_id_to_row,\
+             gbif_taxon_id_to_row,\
+             inat_taxon_id_to_vernacular,\
+             inat_vernacular_to_taxon_id,\
+             inat_taxon_id_to_scientific,\
+             inat_scientific_to_taxon_id,\
+             gbif_taxon_id_to_vernacular,\
+             gbif_vernacular_to_taxon_id,\
+             gbif_taxon_id_to_scientific,\
+             gbif_scientific_to_taxon_id = structures_to_serialize
+
+         return
+
+
+     #%% Download and unzip taxonomy files
+
+     # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
+     for taxonomy_name, zip_url in taxonomy_urls.items():
+
+         need_to_download = False
+
+         if force_init:
+             need_to_download = True
+
+         # Don't download the zipfile if we've already unzipped what we need
+         for fn in files_to_unzip[taxonomy_name]:
+             target_file = os.path.join(
+                 taxonomy_download_dir, taxonomy_name, fn)
+             if not os.path.isfile(target_file):
+                 need_to_download = True
+                 break
+         if not need_to_download:
+             print(f'Bypassing download of {taxonomy_name}, all files available')
+             continue
+
+         zipfile_path = os.path.join(
+             taxonomy_download_dir, zip_url.split('/')[-1])
+
+         # Bypasses download if the file exists already (unless force_init is set)
+         url_utils.download_url(
+             zip_url, zipfile_path,
+             progress_updater=url_utils.DownloadProgressBar(),
+             verbose=True, force_download=force_init)
+
+         # Unzip the files we need
+         files_we_need = files_to_unzip[taxonomy_name]
+
+         with zipfile.ZipFile(zipfile_path, 'r') as zipH:
+
+             for fn in files_we_need:
+                 print('Unzipping {}'.format(fn))
+                 target_file = os.path.join(
+                     taxonomy_download_dir, taxonomy_name, os.path.basename(fn))
+
+                 if (not force_init) and (os.path.isfile(target_file)):
+                     print(f'Bypassing unzip of {target_file}, file exists')
+                 else:
+                     # os.path.dirname here, not os.path.basename, which would
+                     # create a folder named after the file itself
+                     os.makedirs(os.path.dirname(target_file), exist_ok=True)
+                     with zipH.open(fn) as zf, open(target_file, 'wb') as f:
+                         shutil.copyfileobj(zf, f)
+
+             # ...for each file that we need from this zipfile
+
+     # ...for each taxonomy
+
+
+     #%% Create dataframes from each of the taxonomy/vernacular files
+
+     # Load iNat taxonomy
+     inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
+     print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
+     inat_taxonomy = pd.read_csv(inat_taxonomy_file)
+     inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
+
+     # Delete columns we won't use. The "taxonID" column is a non-int version of "id".
+     inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+     # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+     #
+     # https://www.inaturalist.org/taxa/71262
+     #
+     # Convert this column to be integer-valued, using only the last token of the URL
+     inat_taxonomy['parentNameUsageID'] = \
+         inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+     # Rename the "id" column to "taxonID"
+     inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+     assert 'id' not in inat_taxonomy.columns
+     assert 'taxonID' in inat_taxonomy.columns
+
+     # Load iNat common name mapping
+     inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+     inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+     inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+     inat_common_mapping = inat_common_mapping.drop(['language', 'locality', 'countryCode',
+                                                     'source', 'lexicon', 'contributor', 'created'], axis=1)
+     assert 'id' in inat_common_mapping.columns
+     assert 'taxonID' not in inat_common_mapping.columns
+     assert 'vernacularName' in inat_common_mapping.columns
+
+     # Load GBIF taxonomy
+     gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
+     print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
+     gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8', on_bad_lines='warn')
+     gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
+     gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+     gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
+
+     # Remove questionable rows from the GBIF taxonomy
+     gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
+     gbif_taxonomy = gbif_taxonomy.reset_index()
+
+     gbif_taxonomy = gbif_taxonomy.drop(['datasetID', 'acceptedNameUsageID', 'originalNameUsageID',
+                                         'scientificNameAuthorship', 'nameAccordingTo', 'namePublishedIn',
+                                         'taxonomicStatus', 'nomenclaturalStatus', 'taxonRemarks'], axis=1)
+
+     assert 'taxonID' in gbif_taxonomy.columns
+     assert 'scientificName' in gbif_taxonomy.columns
+
+     # Load GBIF common name mapping
+     gbif_common_mapping = pd.read_csv(os.path.join(
+         taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
+     gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
+
+     # Only keep English mappings
+     gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
+     gbif_common_mapping = gbif_common_mapping.reset_index()
+
+     gbif_common_mapping = gbif_common_mapping.drop(['language', 'country', 'countryCode', 'sex',
+                                                     'lifeStage', 'source'], axis=1)
+
+     assert 'taxonID' in gbif_common_mapping.columns
+     assert 'vernacularName' in gbif_common_mapping.columns
+
+
+     # Convert everything to lowercase
+
+     def convert_df_to_lowercase(df):
+         df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
+         return df
+
+     inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
+     gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
+     gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+     inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
+
+
+     ##%% For each taxonomy table, create a mapping from taxon IDs to rows
+
+     inat_taxon_id_to_row = {}
+     gbif_taxon_id_to_row = {}
+
+     print('Building iNat taxonID --> row table')
+     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
+         taxon_id = row['taxonID']
+         assert isinstance(taxon_id, int)
+         inat_taxon_id_to_row[taxon_id] = i_row
+
+     print('Building GBIF taxonID --> row table')
+     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
+         taxon_id = row['taxonID']
+         assert isinstance(taxon_id, int)
+         gbif_taxon_id_to_row[taxon_id] = i_row
+
+
+     ##%% Create name mapping dictionaries
+
+     inat_taxon_id_to_vernacular = defaultdict(set)
+     inat_vernacular_to_taxon_id = defaultdict(set)
+     inat_taxon_id_to_scientific = defaultdict(set)
+     inat_scientific_to_taxon_id = defaultdict(set)
+
+     gbif_taxon_id_to_vernacular = defaultdict(set)
+     gbif_vernacular_to_taxon_id = defaultdict(set)
+     gbif_taxon_id_to_scientific = defaultdict(set)
+     gbif_scientific_to_taxon_id = defaultdict(set)
+
+
+     # Build iNat dictionaries
+
+     print('Building lookup dictionaries for iNat taxonomy')
+
+     # iNat scientific name mapping
+
+     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
+
+         taxon_id = row['taxonID']
+         assert isinstance(taxon_id, int)
+
+         scientific_name = row['scientificName']
+         assert len(scientific_name) > 0
+
+         inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
+         inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
+
+     # iNat common name mapping
+
+     inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+     for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
+
+         taxon_id = row['id']
+         assert isinstance(taxon_id, int)
+
+         # This should never happen; we will assert() this at the end of the loop
+         if taxon_id not in inat_taxon_id_to_scientific:
+             inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+             continue
+
+         vernacular_name = row['vernacularName']
+
+         assert len(vernacular_name) > 0
+         inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+         inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+     assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
+
+
+     ##%% Build GBIF dictionaries
+
+     print('Building lookup dictionaries for GBIF taxonomy')
+
+     # GBIF scientific name mapping
+
+     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
+
+         taxon_id = row['taxonID']
+         assert isinstance(taxon_id, int)
+
+         # The "canonical name" is the Latin name; the "scientific name"
+         # column includes other information. For example:
+         #
+         # "scientificName": Schizophoria impressa (Hall, 1843)
+         # "canonicalName": Schizophoria impressa
+         #
+         # Also see:
+         #
+         # http://globalnames.org/docs/glossary/
+
+         scientific_name = row['canonicalName']
+
+         # This only seems to happen for really esoteric species that aren't
+         # likely to apply to our problems, but doing this for completeness.
+         if len(scientific_name) == 0:
+             scientific_name = row['scientificName']
+
+         assert len(scientific_name) > 0
+         gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
+         gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
+
+     # GBIF common name mapping
+
+     gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+     for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
+
+         taxon_id = row['taxonID']
+         assert isinstance(taxon_id, int)
+
+         # Don't include taxon IDs that were removed from the master table
+         if taxon_id not in gbif_taxon_id_to_scientific:
+             gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+             continue
+
+         vernacular_name = row['vernacularName']
+
+         assert len(vernacular_name) > 0
+         gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+         gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+     print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+         len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+         len(gbif_common_mapping)))
+
+
+     ##%% Save everything to file
+
+     structures_to_serialize = [
+         inat_taxonomy,
+         gbif_taxonomy,
+         gbif_common_mapping,
+         inat_taxon_id_to_row,
+         gbif_taxon_id_to_row,
+         inat_taxon_id_to_vernacular,
+         inat_vernacular_to_taxon_id,
+         inat_taxon_id_to_scientific,
+         inat_scientific_to_taxon_id,
+         gbif_taxon_id_to_vernacular,
+         gbif_vernacular_to_taxon_id,
+         gbif_taxon_id_to_scientific,
+         gbif_scientific_to_taxon_id
+     ]
+
+     # Write (or re-write, if force_init is set) the serialized tables
+     if force_init or (not os.path.isfile(serialized_structures_file)):
+         print('Serializing to {}...'.format(serialized_structures_file), end='')
+         with open(serialized_structures_file, 'wb') as p:
+             pickle.dump(structures_to_serialize, p)
+         print('done')
+
+
+     #%%
+
+ # ...def initialize_taxonomy_lookup(...)
+
+
+ def get_scientific_name_from_row(r):
+     """
+     r: a row (pd.Series) from one of our taxonomy tables
+     """
+
+     if 'canonicalName' in r and len(r['canonicalName']) > 0:
+         scientific_name = r['canonicalName']
+     else:
+         scientific_name = r['scientificName']
+     return scientific_name
+
+
+ def taxonomy_row_to_string(r):
+     """
+     r: a row (pd.Series) from one of our taxonomy tables
+     """
+
+     if 'vernacularName' in r:
+         common_string = ' (' + r['vernacularName'] + ')'
+     else:
+         common_string = ''
+     scientific_name = get_scientific_name_from_row(r)
+
+     return r['taxonRank'] + ' ' + scientific_name + common_string
+
+
+ def traverse_taxonomy(matching_rownums: Sequence[int],
+                       taxon_id_to_row: Mapping[int, int],
+                       taxon_id_to_vernacular: Mapping[int, Set[str]],
+                       taxonomy: pd.DataFrame,
+                       source_name: str,
+                       query: str) -> List[Dict[str, Any]]:
+     """
+     Given row indices (matching_rownums) into one of our taxonomy tables,
+     walks the taxonomy hierarchy from each row to put together a full taxonomy
+     tree, then prunes redundant trees (e.g. if we had separate hits for a
+     species and the genus that contains that species).
+
+     Returns a list of dicts:
+     [
+         {
+             'source': 'inat' or 'gbif',
+             'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names])]
+         },
+         ...
+     ]
+     """
+
+     # list of dicts: {'source': source_name, 'taxonomy': match_details}
+     matching_trees: List[Dict[str, Any]] = []
+
+     # i_match = 0
+     for i_match in matching_rownums:
+
+         # list of (taxon_id, taxonRank, scientific name, [vernacular names])
+         # corresponding to an exact match and its parents
+         match_details = []
+         current_row = taxonomy.iloc[i_match]
+
+         # Walk taxonomy hierarchy
+         while True:
+
+             taxon_id = current_row['taxonID']
+             # sort for determinism
+             vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
+             match_details.append((taxon_id, current_row['taxonRank'],
+                                   get_scientific_name_from_row(current_row),
+                                   vernacular_names))
+
+             if np.isnan(current_row['parentNameUsageID']):
+                 break
+             parent_taxon_id = current_row['parentNameUsageID'].astype('int64')
+             if parent_taxon_id not in taxon_id_to_row:
+                 # This can happen because we remove questionable rows from the
+                 # GBIF taxonomy
+                 # print(f'Warning: no row exists for parent_taxon_id {parent_taxon_id}, ' + \
+                 #       f'child taxon_id: {taxon_id}, query: {query}')
+                 break
+             i_parent_row = taxon_id_to_row[parent_taxon_id]
+             current_row = taxonomy.iloc[i_parent_row]
+
+             # The GBIF taxonomy contains unranked entries
+             if current_row['taxonRank'] == 'unranked':
+                 break
+
+         # ...while there is taxonomy left to walk
+
+         matching_trees.append({'source': source_name,
+                                'taxonomy': match_details})
+
+     # ...for each match
+
+     # Remove redundant matches
+     b_valid_tree = [True] * len(matching_rownums)
+     # i_tree_a = 0; tree_a = matching_trees[i_tree_a]
+     for i_tree_a, tree_a in enumerate(matching_trees):
+
+         tree_a_primary_taxon_id = tree_a['taxonomy'][0][0]
+
+         # i_tree_b = 1; tree_b = matching_trees[i_tree_b]
+         for i_tree_b, tree_b in enumerate(matching_trees):
+
+             if i_tree_a == i_tree_b:
+                 continue
+
+             # If tree a's primary taxon ID is inside tree b, discard tree a
+             #
+             # taxonomy_level_b = tree_b['taxonomy'][0]
+             for taxonomy_level_b in tree_b['taxonomy']:
+                 if tree_a_primary_taxon_id == taxonomy_level_b[0]:
+                     b_valid_tree[i_tree_a] = False
+                     break
+
+             # ...for each level in taxonomy B
+
+         # ...for each tree (inner)
+
+     # ...for each tree (outer)
+
+     matching_trees = list(compress(matching_trees, b_valid_tree))
+     return matching_trees
+
+ # ...def traverse_taxonomy()
+
+
+ def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
+     """
+     Main entry point: get taxonomic matches from both taxonomies for [query],
+     which may be a scientific or common name.
+     """
+
+     query = query.strip().lower()
+     # print("Finding taxonomy information for: {0}".format(query))
+
+     inat_taxon_ids = set()
+     if query in inat_scientific_to_taxon_id:
+         inat_taxon_ids |= inat_scientific_to_taxon_id[query]
+     if query in inat_vernacular_to_taxon_id:
+         inat_taxon_ids |= inat_vernacular_to_taxon_id[query]
+
+     # In GBIF, some queries hit for both common and scientific names; make sure
+     # we end up with unique inputs
+     gbif_taxon_ids = set()
+     if query in gbif_scientific_to_taxon_id:
+         gbif_taxon_ids |= gbif_scientific_to_taxon_id[query]
+     if query in gbif_vernacular_to_taxon_id:
+         gbif_taxon_ids |= gbif_vernacular_to_taxon_id[query]
+
+     # If the query is not found in either taxonomy, return an empty list
+     if (len(inat_taxon_ids) == 0) and (len(gbif_taxon_ids) == 0):
+         return []
+
+     # Both GBIF and iNat have a 1-to-1 mapping between taxon_id and row number
+     inat_row_indices = [inat_taxon_id_to_row[i] for i in inat_taxon_ids]
+     gbif_row_indices = [gbif_taxon_id_to_row[i] for i in gbif_taxon_ids]
+
+     # Walk both taxonomies
+     inat_matching_trees = traverse_taxonomy(
+         inat_row_indices, inat_taxon_id_to_row, inat_taxon_id_to_vernacular,
+         inat_taxonomy, 'inat', query)
+     gbif_matching_trees = traverse_taxonomy(
+         gbif_row_indices, gbif_taxon_id_to_row, gbif_taxon_id_to_vernacular,
+         gbif_taxonomy, 'gbif', query)
+
+     return gbif_matching_trees + inat_matching_trees
+
+ # ...def get_taxonomic_info()
+
+
+ def print_taxonomy_matches(matches, verbose=False):
+     """
+     Console-friendly printing function to make nicely-indented trees
+     """
+
+     # m = matches[0]
+     for m in matches:
+
+         source = m['source']
+
+         # For example: [(9761484, 'species', 'anas platyrhynchos')]
+         for i_taxonomy_level in range(0, len(m['taxonomy'])):
+             taxonomy_level_info = m['taxonomy'][i_taxonomy_level]
+             taxonomy_level = taxonomy_level_info[1]
+             name = taxonomy_level_info[2]
+             common = taxonomy_level_info[3]
+
+             if i_taxonomy_level > 0:
+                 print('\t', end='')
+
+             print('{} {} ({})'.format(taxonomy_level, name, common), end='')
+
+             if i_taxonomy_level == 0:
+                 print(' ({})'.format(source))
+             else:
+                 print('')
+
+             if not verbose:
+                 break
+
+         # ...for each taxonomy level
+
+     # ...for each match
+
+ # ...def print_taxonomy_matches()
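
For orientation, the output shape implied by the format strings above, for a hypothetical single iNat match on 'mallard' (actual IDs and names depend on the downloaded taxonomy snapshot); with verbose=False, only the first (lowest) taxonomy level is printed:

    species anas platyrhynchos (['mallard']) (inat)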
+
+
+ #%% Taxonomy functions that make subjective judgements
+
+ import unicodedata
+ import re
+
+ def slugify(value: Any, allow_unicode: bool = False) -> str:
+     """
+     From:
+     https://github.com/django/django/blob/master/django/utils/text.py
+
+     Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
+     Remove characters that aren't alphanumerics, underscores, or hyphens.
+     Convert to lowercase. Also strip leading and trailing whitespace.
+     """
+
+     value = str(value)
+     value = unicodedata.normalize('NFKC', value)
+     if not allow_unicode:
+         value = value.encode('ascii', 'ignore').decode('ascii')
+     value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
+     return re.sub(r'[-\s]+', '-', value)
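
Quick illustrative examples of slugify(); the behavior follows directly from the regular expressions above:

    slugify('White-throated Dipper')   # 'white-throated-dipper'
    slugify("Grévy's zebra")           # 'grvys-zebra' (non-ASCII dropped when allow_unicode=False)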
+
+
+ class TaxonomicMatch:
+
+     def __init__(self, scientific_name, common_name, taxonomic_level, source,
+                  taxonomy_string, match):
+         self.scientific_name = scientific_name
+         self.common_name = common_name
+         self.taxonomic_level = taxonomic_level
+         self.source = source
+         self.taxonomy_string = taxonomy_string
+         self.match = match
+
+     def __repr__(self):
+         return ('TaxonomicMatch('
+                 f'scientific_name={self.scientific_name}, '
+                 f'common_name={self.common_name}, '
+                 f'taxonomic_level={self.taxonomic_level}, '
+                 f'source={self.source})')
+
+
+ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
+                     'ruffed', 'browed', 'eating', 'striped', 'shanked',
+                     'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
+                     'necked']
+
+ def pop_levels(m, n_levels=1):
+     """
+     Remove [n_levels] levels from the bottom of the TaxonomicMatch object m,
+     typically used to remove redundant subgenera.
+     """
+
+     v = eval(m.taxonomy_string)
+     assert v[0][1] == m.taxonomic_level
+     assert v[0][2] == m.scientific_name
+     popped_v = v[n_levels:]
+     taxonomic_level = popped_v[0][1]
+     scientific_name = popped_v[0][2]
+     common_name = popped_v[0][3]
+     if len(common_name) == 0:
+         common_name = ''
+     else:
+         common_name = common_name[0]
+     taxonomy_string = str(popped_v)
+     source = m.source
+     return TaxonomicMatch(scientific_name=scientific_name,
+                           common_name=common_name,
+                           taxonomic_level=taxonomic_level,
+                           source=source,
+                           taxonomy_string=taxonomy_string,
+                           match=None)
+
+ # ...def pop_levels(...)
713
+
714
+
715
+ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
716
+ """
717
+ Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
718
+ and preferences that are specific to our scenario.
719
+
720
+ Args:
721
+ query (str): The common or scientific name we want to look up
722
+ taxonomy_preference (str, optional): 'inat' or 'gbif'
723
+ retry (bool, optional): if the initial lookup fails, should we try heuristic
724
+ substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
725
+
726
+ Returns:
727
+ TaxonomicMatch: the best taxonomic match, or None
728
+ """
729
+
730
+ m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
731
+ if (len(m.scientific_name) > 0) or (not retry):
732
+ return m
733
+
734
+ for s in hyphenated_terms:
735
+ query = query.replace(' ' + s,'-' + s)
736
+ m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
737
+
738
+ if (len(m.scientific_name) > 0) or (not retry):
739
+ return m
740
+
741
+ query = query.replace(' species','')
742
+ query = query.replace(' order','')
743
+ query = query.replace(' genus','')
744
+ query = query.replace(' family','')
745
+ query = query.replace(' subfamily','')
746
+ m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
747
+
748
+ return m
749
+
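
A hedged usage sketch (the query is hypothetical, and field values depend on the taxonomy snapshot); a query like this can only succeed via the hyphenation retry if the taxonomy stores the hyphenated form:

    m = get_preferred_taxonomic_match('white throated dipper')
    if len(m.scientific_name) > 0:
        print(m.scientific_name, m.taxonomic_level, m.source)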
+
+ def validate_and_convert(data):
+     """
+     Recursively validates that all elements in the nested structure are only
+     tuples, lists, strs, ints, or np.int64, and converts np.int64 to int.
+
+     Args:
+         data: the nested structure to validate and convert
+
+     Returns:
+         The validated and converted structure
+
+     Raises:
+         TypeError: if an invalid type is encountered
+     """
+
+     if isinstance(data, np.int64):
+         return int(data)
+     elif isinstance(data, (int, str)):
+         return data
+     elif isinstance(data, (list, tuple)):
+         # Process lists and tuples recursively
+         container_type = type(data)
+         return container_type(validate_and_convert(item) for item in data)
+     else:
+         raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                         f"Only int, np.int64, str, list, and tuple are allowed.")
+
+ # ...def validate_and_convert(...)
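
For example (np.int64 values are converted, container types and strings are preserved, anything else raises):

    validate_and_convert((np.int64(42), ['lion', 7]))   # (42, ['lion', 7])
    validate_and_convert({'a': 1})                      # raises TypeError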
+
+
+ def _get_preferred_taxonomic_match(query: str, taxonomy_preference='inat'):
+     """
+     Returns a (TaxonomicMatch, preprocessed_query) tuple; the match's fields
+     are empty strings if the lookup failed.
+     """
+
+     query = query.lower().strip().replace('_', ' ')
+     query = query.replace('unidentified', '')
+     query = query.replace('unknown', '')
+
+     # Trim placeholder suffixes like "canis sp" (suffix only, so that names
+     # containing " sp" elsewhere are left alone)
+     if query.endswith(' sp'):
+         query = query[:-len(' sp')]
+     if query.endswith(' group'):
+         query = query[:-len(' group')]
+
+     query = query.strip()
+
+     # query = 'person'
+     matches = get_taxonomic_info(query)
+
+     # Do we have an iNat match?
+     inat_matches = [m for m in matches if m['source'] == 'inat']
+     gbif_matches = [m for m in matches if m['source'] == 'gbif']
+
+     # print_taxonomy_matches(inat_matches, verbose=True)
+     # print_taxonomy_matches(gbif_matches, verbose=True)
+
+     scientific_name = ''
+     common_name = ''
+     taxonomic_level = ''
+     match = ''
+     source = ''
+     taxonomy_string = ''
+
+     n_inat_matches = len(inat_matches)
+     n_gbif_matches = len(gbif_matches)
+
+     selected_matches = None
+
+     assert taxonomy_preference in ['gbif', 'inat'], \
+         'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
+
+     if n_inat_matches > 0 and taxonomy_preference == 'inat':
+         selected_matches = 'inat'
+     elif n_gbif_matches > 0:
+         selected_matches = 'gbif'
+
+     if selected_matches == 'inat':
+
+         i_match = 0
+
+         if len(inat_matches) > 1:
+
+             # print('Warning: multiple iNat matches for {}'.format(query))
+
+             # Prefer chordates... most of the names that aren't what we want
+             # are esoteric insects, like a moth called "cheetah"
+             #
+             # If we can't find a chordate, just take the first match.
+             #
+             # i_test_match = 0
+             for i_test_match, match in enumerate(inat_matches):
+                 found_vertebrate = False
+                 taxonomy = match['taxonomy']
+                 for taxonomy_level in taxonomy:
+                     taxon_rank = taxonomy_level[1]
+                     scientific_name = taxonomy_level[2]
+                     if taxon_rank == 'phylum' and scientific_name == 'chordata':
+                         i_match = i_test_match
+                         found_vertebrate = True
+                         break
+                 if found_vertebrate:
+                     break
+
+         match = inat_matches[i_match]['taxonomy']
+
+         # This is (taxonID, taxonLevel, scientific, [list of common])
+         lowest_level = match[0]
+         taxonomic_level = lowest_level[1]
+         scientific_name = lowest_level[2]
+         assert len(scientific_name) > 0
+         common_names = lowest_level[3]
+         if len(common_names) > 1:
+             # print(f'Warning: multiple iNat common names for {query}')
+             # Default to returning the query
+             if query in common_names:
+                 common_name = query
+             else:
+                 common_name = common_names[0]
+         elif len(common_names) > 0:
+             common_name = common_names[0]
+
+         # print(f'Matched iNat {query} to {scientific_name},{common_name}')
+         source = 'inat'
+
+     # ...if we had iNat matches
+
+     # If we either prefer GBIF or didn't have iNat matches
+     #
+     # Code is deliberately redundant here; I'm expecting some subtleties in how
+     # we handle GBIF and iNat.
+     elif selected_matches == 'gbif':
+
+         i_match = 0
+
+         if len(gbif_matches) > 1:
+
+             # print('Warning: multiple GBIF matches for {}'.format(query))
+
+             # Prefer chordates... most of the names that aren't what we want
+             # are esoteric insects, like a moth called "cheetah"
+             #
+             # If we can't find a chordate, just take the first match.
+             #
+             # i_test_match = 0
+             for i_test_match, match in enumerate(gbif_matches):
+                 found_vertebrate = False
+                 taxonomy = match['taxonomy']
+                 for taxonomy_level in taxonomy:
+                     taxon_rank = taxonomy_level[1]
+                     scientific_name = taxonomy_level[2]
+                     if taxon_rank == 'phylum' and scientific_name == 'chordata':
+                         i_match = i_test_match
+                         found_vertebrate = True
+                         break
+                 if found_vertebrate:
+                     break
+
+         match = gbif_matches[i_match]['taxonomy']
+
+         # This is (taxonID, taxonLevel, scientific, [list of common])
+         lowest_level = match[0]
+         taxonomic_level = lowest_level[1]
+         scientific_name = lowest_level[2]
+         assert len(scientific_name) > 0
+
+         common_names = lowest_level[3]
+         if len(common_names) > 1:
+             # print(f'Warning: multiple GBIF common names for {query}')
+             # Default to returning the query
+             if query in common_names:
+                 common_name = query
+             else:
+                 common_name = common_names[0]
+         elif len(common_names) > 0:
+             common_name = common_names[0]
+
+         source = 'gbif'
+
+     # ...if we needed to look in the GBIF taxonomy
+
+     # Convert np.int64's to ints
+     if match is not None:
+         match = validate_and_convert(match)
+
+     taxonomy_string = str(match)
+
+     m = TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
+                        taxonomy_string, match)
+
+     if (m.taxonomic_level == 'subgenus' and
+         match[1][1] == 'genus' and
+         match[1][2] == m.scientific_name):
+         print('Removing redundant subgenus {}'.format(scientific_name))
+         m = pop_levels(m, 1)
+
+     return m, query
+
+ # ...def _get_preferred_taxonomic_match()
+
+
+ #%% Interactive drivers and debug
+
+ if False:
+
+     #%% Initialization
+
+     initialize_taxonomy_lookup()
+
+
+     #%% Taxonomic lookup
+
+     # query = 'lion'
+     query = 'xenoperdix'
+     matches = get_taxonomic_info(query)
+     # print(matches)
+
+     print_taxonomy_matches(matches, verbose=True)
+
+     print('\n\n')
+
+     # Print the taxonomy in the taxonomy spreadsheet format
+     assert matches[1]['source'] == 'inat'
+     t = str(matches[1]['taxonomy'])
+     print(t)
+     import clipboard; clipboard.copy(t)
+
+
+     #%% Directly access the taxonomy tables
+
+     taxon_ids = gbif_vernacular_to_taxon_id['lion']
+     for taxon_id in taxon_ids:
+         i_row = gbif_taxon_id_to_row[taxon_id]
+         print(taxonomy_row_to_string(gbif_taxonomy.iloc[i_row]))
+
+
+ #%% Command-line driver
+
+ def main(): # noqa
+
+     # Read command line inputs (absolute path)
+     parser = argparse.ArgumentParser()
+     parser.add_argument('input_file')
+
+     if len(sys.argv[1:]) == 0:
+         parser.print_help()
+         parser.exit()
+
+     args = parser.parse_args()
+     input_file = args.input_file
+
+     initialize_taxonomy_lookup()
+
+     # Read the tokens from the input text file
+     with open(input_file, 'r') as f:
+         tokens = f.readlines()
+
+     # Loop through each token and get its scientific name
+     for token in tokens:
+         token = token.strip().lower()
+         matches = get_taxonomic_info(token)
+         print_taxonomy_matches(matches)
+
+ if __name__ == '__main__':
+     main()
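
Typical invocation (the input filename is hypothetical; the file should contain one common or scientific name per line). If the package is installed, the module can also be run as python -m megadetector.taxonomy_mapping.species_lookup:

    python species_lookup.py species_names.txt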