megadetector 5.0.11-py3-none-any.whl → 5.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
megadetector/taxonomy_mapping/species_lookup.py (new file)
@@ -0,0 +1,834 @@
+"""
+
+species_lookup.py
+
+Look up species names (common or scientific) in the GBIF and iNaturalist
+taxonomies.
+
+Run initialize_taxonomy_lookup() before calling any other function.
+
+"""
+
+#%% Constants and imports
+
+import argparse
+import pickle
+import shutil
+import zipfile
+import sys
+import os
+
+from collections import defaultdict
+from itertools import compress
+from tqdm import tqdm
+from typing import Any, Dict, List, Mapping, Sequence, Set
+
+import pandas as pd
+import numpy as np
+
+from megadetector.utils import url_utils
+
+taxonomy_download_dir = os.path.expanduser('~/taxonomy')
+
+taxonomy_urls = {
+    'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
+    'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
+}
+
+files_to_unzip = {
+    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
+    # 12.2023, this is no longer the case.
+    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+    'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
+    'iNaturalist': ['taxa.csv']
+}
+
+# As of 2020.05.12:
+#
+# GBIF: ~777MB zipped, ~1.6GB taxonomy
+# iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
+
+# As of 2023.12.29:
+#
+# GBIF: ~948MB zipped, ~2.2GB taxonomy
+# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
+
+
+os.makedirs(taxonomy_download_dir, exist_ok=True)
+for taxonomy_name in taxonomy_urls:
+    taxonomy_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
+    os.makedirs(taxonomy_dir, exist_ok=True)
+
+serialized_structures_file = os.path.join(taxonomy_download_dir,
+                                          'serialized_taxonomies.p')
+
+# These are un-initialized globals that must be initialized by
+# the initialize_taxonomy_lookup() function below.
+inat_taxonomy = None # : pd.DataFrame
+gbif_taxonomy = None # : pd.DataFrame
+gbif_common_mapping = None # : pd.DataFrame
+inat_taxon_id_to_row = None # : Dict[np.int64, int]
+gbif_taxon_id_to_row = None # : Dict[np.int64, int]
+inat_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
+inat_vernacular_to_taxon_id = None # : Dict[str, np.int64]
+inat_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
+inat_scientific_to_taxon_id = None # : Dict[str, np.int64]
+gbif_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
+gbif_vernacular_to_taxon_id = None # : Dict[str, np.int64]
+gbif_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
+gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
+
+
+#%% Functions
+
+# Initialization function
+
+def initialize_taxonomy_lookup(force_init=False) -> None:
+    """
+    Initialize this module by doing the following:
+
+    * Downloads and unzips the current GBIF and iNat taxonomies if necessary
+      (only unzips what's necessary, but does not delete the original zipfiles)
+    * Builds a bunch of dictionaries and tables to facilitate lookup
+    * Serializes those tables via pickle
+    * Skips all of the above if the serialized pickle file already exists
+    """
+
+    global inat_taxonomy,\
+        gbif_taxonomy,\
+        gbif_common_mapping,\
+        inat_taxon_id_to_row,\
+        gbif_taxon_id_to_row,\
+        inat_taxon_id_to_vernacular,\
+        inat_vernacular_to_taxon_id,\
+        inat_taxon_id_to_scientific,\
+        inat_scientific_to_taxon_id,\
+        gbif_taxon_id_to_vernacular,\
+        gbif_vernacular_to_taxon_id,\
+        gbif_taxon_id_to_scientific,\
+        gbif_scientific_to_taxon_id
+
+
+    ## Load serialized taxonomy info if we've already saved it
+
+    if (not force_init) and (inat_taxonomy is not None):
+        print('Skipping taxonomy re-init')
+        return
+
+    if (not force_init) and (os.path.isfile(serialized_structures_file)):
+
+        print(f'De-serializing taxonomy data from {serialized_structures_file}')
+
+        with open(serialized_structures_file, 'rb') as f:
+            structures_to_serialize = pickle.load(f)
+
+        inat_taxonomy,\
+        gbif_taxonomy,\
+        gbif_common_mapping,\
+        inat_taxon_id_to_row,\
+        gbif_taxon_id_to_row,\
+        inat_taxon_id_to_vernacular,\
+        inat_vernacular_to_taxon_id,\
+        inat_taxon_id_to_scientific,\
+        inat_scientific_to_taxon_id,\
+        gbif_taxon_id_to_vernacular,\
+        gbif_vernacular_to_taxon_id,\
+        gbif_taxon_id_to_scientific,\
+        gbif_scientific_to_taxon_id = structures_to_serialize
+
+        return
+
+
+    ## If we don't have serialized taxonomy info, create it from scratch.
+
+    # Download and unzip taxonomy files
+    # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
+    for taxonomy_name, zip_url in taxonomy_urls.items():
+
+        need_to_download = False
+
+        if force_init:
+            need_to_download = True
+
+        # Don't download the zipfile if we've already unzipped what we need
+        for fn in files_to_unzip[taxonomy_name]:
+            target_file = os.path.join(
+                taxonomy_download_dir, taxonomy_name, fn)
+            if not os.path.isfile(target_file):
+                need_to_download = True
+                break
+        if not need_to_download:
+            print(f'Bypassing download of {taxonomy_name}, all files available')
+            continue
+
+        zipfile_path = os.path.join(
+            taxonomy_download_dir, zip_url.split('/')[-1])
+
+        # Bypasses download if the file exists already (unless force_init is set)
+        url_utils.download_url(
+            zip_url, os.path.join(zipfile_path),
+            progress_updater=url_utils.DownloadProgressBar(),
+            verbose=True,force_download=force_init)
+
+        # Unzip the files we need
+        files_we_need = files_to_unzip[taxonomy_name]
+
+        with zipfile.ZipFile(zipfile_path, 'r') as zipH:
+
+            for fn in files_we_need:
+                print('Unzipping {}'.format(fn))
+                target_file = os.path.join(
+                    taxonomy_download_dir, taxonomy_name, os.path.basename(fn))
+
+                if (not force_init) and (os.path.isfile(target_file)):
+                    print(f'Bypassing unzip of {target_file}, file exists')
+                else:
+                    os.makedirs(os.path.basename(target_file),exist_ok=True)
+                    with zipH.open(fn) as zf, open(target_file, 'wb') as f:
+                        shutil.copyfileobj(zf, f)
+
+            # ...for each file that we need from this zipfile
+
+        # Remove the zipfile
+        # os.remove(zipfile_path)
+
+    # ...for each taxonomy
+
+
+    # Create dataframes from each of the taxonomy files, and the GBIF common
+    # name file
+
+    # Load iNat taxonomy
+    inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
+    print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
+    inat_taxonomy = pd.read_csv(inat_taxonomy_file)
+    inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
+    inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
+
+    # Load GBIF taxonomy
+    gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
+    print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
+    gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
+    gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
+    gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+
+    # Remove questionable rows from the GBIF taxonomy
+    gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
+    gbif_taxonomy = gbif_taxonomy.reset_index()
+
+    # Load GBIF vernacular name mapping
+    gbif_common_mapping = pd.read_csv(os.path.join(
+        taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
+    gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
+
+    # Only keep English mappings
+    gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
+    gbif_common_mapping = gbif_common_mapping.reset_index()
+
+
+    # Convert everything to lowercase
+
+    def convert_df_to_lowercase(df):
+        df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
+        return df
+
+    inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
+    gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
+    gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+
+
+    # For each taxonomy table, create a mapping from taxon IDs to rows
+
+    inat_taxon_id_to_row = {}
+    gbif_taxon_id_to_row = {}
+
+    print('Building iNat taxonID --> row table')
+    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
+        inat_taxon_id_to_row[row['taxonID']] = i_row
+
+    print('Building GBIF taxonID --> row table')
+    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
+        gbif_taxon_id_to_row[row['taxonID']] = i_row
+
+
+    # Create name mapping dictionaries
+
+    inat_taxon_id_to_vernacular = defaultdict(set)
+    inat_vernacular_to_taxon_id = defaultdict(set)
+    inat_taxon_id_to_scientific = defaultdict(set)
+    inat_scientific_to_taxon_id = defaultdict(set)
+
+    gbif_taxon_id_to_vernacular = defaultdict(set)
+    gbif_vernacular_to_taxon_id = defaultdict(set)
+    gbif_taxon_id_to_scientific = defaultdict(set)
+    gbif_scientific_to_taxon_id = defaultdict(set)
+
+
+    # Build iNat dictionaries
+
+    print('Building lookup dictionaries for iNat taxonomy')
+
+    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
+
+        taxon_id = row['taxonID']
+        vernacular_name = row['vernacularName']
+        scientific_name = row['scientificName']
+
+        if len(vernacular_name) > 0:
+            inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+            inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+        assert len(scientific_name) > 0
+        inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
+        inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
+
+
+    # Build GBIF dictionaries
+
+    print('Building lookup dictionaries for GBIF taxonomy')
+
+    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
+
+        taxon_id = row['taxonID']
+
+        # The canonical name is the Latin name; the "scientific name"
+        # includes the taxonomy name.
+        #
+        # http://globalnames.org/docs/glossary/
+
+        scientific_name = row['canonicalName']
+
+        # This only seems to happen for really esoteric species that aren't
+        # likely to apply to our problems, but doing this for completeness.
+        if len(scientific_name) == 0:
+            scientific_name = row['scientificName']
+
+        assert len(scientific_name) > 0
+        gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
+        gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
+
+    for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
+
+        taxon_id = row['taxonID']
+
+        # Don't include taxon IDs that were removed from the master table
+        if taxon_id not in gbif_taxon_id_to_scientific:
+            continue
+
+        vernacular_name = row['vernacularName']
+
+        assert len(vernacular_name) > 0
+        gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+        gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+
+    # Save everything to file
+
+    structures_to_serialize = [
+        inat_taxonomy,
+        gbif_taxonomy,
+        gbif_common_mapping,
+        inat_taxon_id_to_row,
+        gbif_taxon_id_to_row,
+        inat_taxon_id_to_vernacular,
+        inat_vernacular_to_taxon_id,
+        inat_taxon_id_to_scientific,
+        inat_scientific_to_taxon_id,
+        gbif_taxon_id_to_vernacular,
+        gbif_vernacular_to_taxon_id,
+        gbif_taxon_id_to_scientific,
+        gbif_scientific_to_taxon_id
+    ]
+
+    print('Serializing to {}...'.format(serialized_structures_file), end='')
+    if not os.path.isfile(serialized_structures_file):
+        with open(serialized_structures_file, 'wb') as p:
+            pickle.dump(structures_to_serialize, p)
+    print(' done')
+
+# ...def initialize_taxonomy_lookup(...)
+
+
+def get_scientific_name_from_row(r):
+    """
+    r: a dataframe that's really a row in one of our taxonomy tables
+    """
+
+    if 'canonicalName' in r and len(r['canonicalName']) > 0:
+        scientific_name = r['canonicalName']
+    else:
+        scientific_name = r['scientificName']
+    return scientific_name
+
+
+def taxonomy_row_to_string(r):
+    """
+    r: a dataframe that's really a row in one of our taxonomy tables
+    """
+
+    if 'vernacularName' in r:
+        common_string = ' (' + r['vernacularName'] + ')'
+    else:
+        common_string = ''
+    scientific_name = get_scientific_name_from_row(r)
+
+    return r['taxonRank'] + ' ' + scientific_name + common_string
+
+
+def traverse_taxonomy(matching_rownums: Sequence[int],
+                      taxon_id_to_row: Mapping[str, int],
+                      taxon_id_to_vernacular: Mapping[str, Set[str]],
+                      taxonomy: pd.DataFrame,
+                      source_name: str,
+                      query: str) -> List[Dict[str, Any]]:
+    """
+    Given a data frame that's a set of rows from one of our taxonomy tables,
+    walks the taxonomy hierarchy from each row to put together a full taxonomy
+    tree, then prunes redundant trees (e.g. if we had separate hits for a
+    species and the genus that contains that species.)
+
+    Returns a list of dicts:
+    [
+        {
+            'source': 'inat' or 'gbif',
+            'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names])]
+        },
+        ...
+    ]
+    """
+
+    # list of dicts: {'source': source_name, 'taxonomy': match_details}
+    matching_trees: List[Dict[str, Any]] = []
+
+    # i_match = 0
+    for i_match in matching_rownums:
+
+        # list of (taxon_id, taxonRank, scientific name, [vernacular names])
+        # corresponding to an exact match and its parents
+        match_details = []
+        current_row = taxonomy.iloc[i_match]
+
+        # Walk taxonomy hierarchy
+        while True:
+
+            taxon_id = current_row['taxonID']
+            vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
+            match_details.append((taxon_id, current_row['taxonRank'],
+                                  get_scientific_name_from_row(current_row),
+                                  vernacular_names))
+
+            if np.isnan(current_row['parentNameUsageID']):
+                break
+            parent_taxon_id = current_row['parentNameUsageID'].astype('int64')
+            if parent_taxon_id not in taxon_id_to_row:
+                # This can happen because we remove questionable rows from the
+                # GBIF taxonomy
+                # print(f'Warning: no row exists for parent_taxon_id {parent_taxon_id},' + \
+                #       f'child taxon_id: {taxon_id}, query: {query}')
+                break
+            i_parent_row = taxon_id_to_row[parent_taxon_id]
+            current_row = taxonomy.iloc[i_parent_row]
+
+            # The GBIF taxonomy contains unranked entries
+            if current_row['taxonRank'] == 'unranked':
+                break
+
+        # ...while there is taxonomy left to walk
+
+        matching_trees.append({'source': source_name,
+                               'taxonomy': match_details})
+
+    # ...for each match
+
+    # Remove redundant matches
+    b_valid_tree = [True] * len(matching_rownums)
+    # i_tree_a = 0; tree_a = matching_trees[i_tree_a]
+    for i_tree_a, tree_a in enumerate(matching_trees):
+
+        tree_a_primary_taxon_id = tree_a['taxonomy'][0][0]
+
+        # i_tree_b = 1; tree_b = matching_trees[i_tree_b]
+        for i_tree_b, tree_b in enumerate(matching_trees):
+
+            if i_tree_a == i_tree_b:
+                continue
+
+            # If tree a's primary taxon ID is inside tree b, discard tree a
+            #
+            # taxonomy_level_b = tree_b['taxonomy'][0]
+            for taxonomy_level_b in tree_b['taxonomy']:
+                if tree_a_primary_taxon_id == taxonomy_level_b[0]:
+                    b_valid_tree[i_tree_a] = False
+                    break
+
+            # ...for each level in taxonomy B
+
+        # ...for each tree (inner)
+
+    # ...for each tree (outer)
+
+    matching_trees = list(compress(matching_trees, b_valid_tree))
+    return matching_trees
+
+# ...def traverse_taxonomy()
+
+
+def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
+    """
+    Main entry point: get taxonomic matches from both taxonomies for [query],
+    which may be a scientific or common name.
+    """
+    query = query.strip().lower()
+    # print("Finding taxonomy information for: {0}".format(query))
+
+    inat_taxon_ids = set()
+    if query in inat_scientific_to_taxon_id:
+        inat_taxon_ids |= inat_scientific_to_taxon_id[query]
+    if query in inat_vernacular_to_taxon_id:
+        inat_taxon_ids |= inat_vernacular_to_taxon_id[query]
+
+    # In GBIF, some queries hit for both common and scientific, make sure we end
+    # up with unique inputs
+    gbif_taxon_ids = set()
+    if query in gbif_scientific_to_taxon_id:
+        gbif_taxon_ids |= gbif_scientific_to_taxon_id[query]
+    if query in gbif_vernacular_to_taxon_id:
+        gbif_taxon_ids |= gbif_vernacular_to_taxon_id[query]
+
+    # If the species is not found in either taxonomy, return None
+    if (len(inat_taxon_ids) == 0) and (len(gbif_taxon_ids) == 0):
+        return []
+
+    # Both GBIF and iNat have a 1-to-1 mapping between taxon_id and row number
+    inat_row_indices = [inat_taxon_id_to_row[i] for i in inat_taxon_ids]
+    gbif_row_indices = [gbif_taxon_id_to_row[i] for i in gbif_taxon_ids]
+
+    # Walk both taxonomies
+    inat_matching_trees = traverse_taxonomy(
+        inat_row_indices, inat_taxon_id_to_row, inat_taxon_id_to_vernacular,
+        inat_taxonomy, 'inat', query)
+    gbif_matching_trees = traverse_taxonomy(
+        gbif_row_indices, gbif_taxon_id_to_row, gbif_taxon_id_to_vernacular,
+        gbif_taxonomy, 'gbif', query)
+
+    return gbif_matching_trees + inat_matching_trees
+
+# ...def get_taxonomic_info()
+
+
+def print_taxonomy_matches(matches, verbose=False):
+    """
+    Console-friendly printing function to make nicely-indented trees
+    """
+
+    # m = matches[0]
+    for m in matches:
+
+        source = m['source']
+
+        # For example: [(9761484, 'species', 'anas platyrhynchos')]
+        for i_taxonomy_level in range(0, len(m['taxonomy'])):
+            taxonomy_level_info = m['taxonomy'][i_taxonomy_level]
+            taxonomy_level = taxonomy_level_info[1]
+            name = taxonomy_level_info[2]
+            common = taxonomy_level_info[3]
+
+            if i_taxonomy_level > 0:
+                print('\t',end='')
+
+            print('{} {} ({})'.format(taxonomy_level, name, common), end='')
+
+            if i_taxonomy_level == 0:
+                print(' ({})'.format(source))
+            else:
+                print('')
+
+            if not verbose:
+                break
+
+        # ...for each taxonomy level
+
+    # ...for each match
+
+# ...def print_taxonomy_matches()
+
+
+#%% Taxonomy functions that make subjective judgements
+
+import unicodedata
+import re
+
+def slugify(value: Any, allow_unicode: bool = False) -> str:
+    """
+    From:
+    https://github.com/django/django/blob/master/django/utils/text.py
+
+    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
+    Remove characters that aren't alphanumerics, underscores, or hyphens.
+    Convert to lowercase. Also strip leading and trailing whitespace.
+    """
+
+    value = str(value)
+    value = unicodedata.normalize('NFKC', value)
+    if not allow_unicode:
+        value = value.encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
+    return re.sub(r'[-\s]+', '-', value)
+
+
+class TaxonomicMatch:
+
+    def __init__(self, scientific_name, common_name, taxonomic_level, source,
+                 taxonomy_string, match):
+        self.scientific_name = scientific_name
+        self.common_name = common_name
+        self.taxonomic_level = taxonomic_level
+        self.source = source
+        self.taxonomy_string = taxonomy_string
+        self.match = match
+
+    def __repr__(self):
+        return ('TaxonomicMatch('
+                f'scientific_name={self.scientific_name}, '
+                f'common_name={self.common_name}, '
+                f'taxonomic_level={self.taxonomic_level}, '
+                f'source={self.source}')
+
+
+hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
+                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
+                    'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
+                    'necked']
+
+def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
+    """
+    Wrapper for species_lookup.py, but expressing a variety of heuristics and
+    preferences that are specific to our scenario.
+    """
+
+    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
+    if (len(m.scientific_name) > 0) or (not retry):
+        return m
+
+    for s in hyphenated_terms:
+        query = query.replace(' ' + s,'-' + s)
+    m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
+    return m
+
+
+def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
+
+    query = query.lower().strip().replace('_', ' ')
+    query = query.replace('unidentified','')
+    query = query.replace('unknown','')
+    if query.endswith(' sp'):
+        query = query.replace(' sp','')
+    if query.endswith(' group'):
+        query = query.replace(' group','')
+
+    query = query.strip()
+
+    # query = 'person'
+    matches = get_taxonomic_info(query)
+
+    # Do we have an iNat match?
+    inat_matches = [m for m in matches if m['source'] == 'inat']
+    gbif_matches = [m for m in matches if m['source'] == 'gbif']
+
+    # print_taxonomy_matches(inat_matches, verbose=True)
+    # print_taxonomy_matches(gbif_matches, verbose=True)
+
+    scientific_name = ''
+    common_name = ''
+    taxonomic_level = ''
+    match = ''
+    source = ''
+    taxonomy_string = ''
+
+    n_inat_matches = len(inat_matches)
+    n_gbif_matches = len(gbif_matches)
+
+    selected_matches = None
+
+    assert taxonomy_preference in ['gbif','inat'],\
+        'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
+
+    if n_inat_matches > 0 and taxonomy_preference == 'inat':
+        selected_matches = 'inat'
+    elif n_gbif_matches > 0:
+        selected_matches = 'gbif'
+
+    if selected_matches == 'inat':
+
+        i_match = 0
+
+        if len(inat_matches) > 1:
+            # print('Warning: multiple iNat matches for {}'.format(query))
+
+            # Prefer chordates... most of the names that aren't what we want
+            # are esoteric insects, like a moth called "cheetah"
+            #
+            # If we can't find a chordate, just take the first match.
+            #
+            # i_test_match = 0
+            for i_test_match, match in enumerate(inat_matches):
+                found_vertebrate = False
+                taxonomy = match['taxonomy']
+                for taxonomy_level in taxonomy:
+                    taxon_rank = taxonomy_level[1]
+                    scientific_name = taxonomy_level[2]
+                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
+                        i_match = i_test_match
+                        found_vertebrate = True
+                        break
+                if found_vertebrate:
+                    break
+
+        match = inat_matches[i_match]['taxonomy']
+
+        # This is (taxonID, taxonLevel, scientific, [list of common])
+        lowest_level = match[0]
+        taxonomic_level = lowest_level[1]
+        scientific_name = lowest_level[2]
+        assert len(scientific_name) > 0
+        common_names = lowest_level[3]
+        if len(common_names) > 1:
+            # print(f'Warning: multiple iNat common names for {query}')
+            # Default to returning the query
+            if query in common_names:
+                common_name = query
+            else:
+                common_name = common_names[0]
+        elif len(common_names) > 0:
+            common_name = common_names[0]
+
+        # print(f'Matched iNat {query} to {scientific_name},{common_name}')
+        source = 'inat'
+
+
+    # ...if we had iNat matches
+
+    # If we either prefer GBIF or didn't have iNat matches
+    #
+    # Code is deliberately redundant here; I'm expecting some subtleties in how
+    # we handle GBIF and iNat.
+    elif selected_matches == 'gbif':
+
+        i_match = 0
+
+        if len(gbif_matches) > 1:
+            # print('Warning: multiple GBIF matches for {}'.format(query))
+
+            # Prefer chordates... most of the names that aren't what we want
+            # are esoteric insects, like a moth called "cheetah"
+            #
+            # If we can't find a chordate, just take the first match.
+            #
+            # i_test_match = 0
+            for i_test_match, match in enumerate(gbif_matches):
+                found_vertebrate = False
+                taxonomy = match['taxonomy']
+                for taxonomy_level in taxonomy:
+                    taxon_rank = taxonomy_level[1]
+                    scientific_name = taxonomy_level[2]
+                    if taxon_rank == 'phylum' and scientific_name == 'chordata':
+                        i_match = i_test_match
+                        found_vertebrate = True
+                        break
+                if found_vertebrate:
+                    break
+
+        match = gbif_matches[i_match]['taxonomy']
+
+        # This is (taxonID, taxonLevel, scientific, [list of common])
+        lowest_level = match[0]
+        taxonomic_level = lowest_level[1]
+        scientific_name = lowest_level[2]
+        assert len(scientific_name) > 0
+
+        common_names = lowest_level[3]
+        if len(common_names) > 1:
+            # print(f'Warning: multiple GBIF common names for {query}')
+            # Default to returning the query
+            if query in common_names:
+                common_name = query
+            else:
+                common_name = common_names[0]
+        elif len(common_names) > 0:
+            common_name = common_names[0]
+
+        source = 'gbif'
+
+    # ...if we needed to look in the GBIF taxonomy
+
+    taxonomy_string = str(match)
+
+    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
+                          taxonomy_string, match),query
+
+# ...def _get_preferred_taxonomic_match()
+
+
+#%% Interactive drivers and debug
+
+if False:
+
+    #%% Initialization
+
+    initialize_taxonomy_lookup()
+
+
+    #%% Taxonomic lookup
+
+    # query = 'lion'
+    query = 'xenoperdix'
+    matches = get_taxonomic_info(query)
+    # print(matches)
+
+    print_taxonomy_matches(matches,verbose=True)
+
+    print('\n\n')
+
+    # Print the taxonomy in the taxonomy spreadsheet format
+    assert matches[1]['source'] == 'inat'
+    t = str(matches[1]['taxonomy'])
+    print(t)
+    import clipboard; clipboard.copy(t)
+
+
+    #%% Directly access the taxonomy tables
+
+    taxon_ids = gbif_vernacular_to_taxon_id['lion']
+    for taxon_id in taxon_ids:
+        i_row = gbif_taxon_id_to_row[taxon_id]
+        print(taxonomy_row_to_string(gbif_taxonomy.iloc[i_row]))
+
+
+#%% Command-line driver
+
+def main():
+
+    # Read command line inputs (absolute path)
+    parser = argparse.ArgumentParser()
+    parser.add_argument('input_file')
+
+    if len(sys.argv[1:]) == 0:
+        parser.print_help()
+        parser.exit()
+
+    args = parser.parse_args()
+    input_file = args.input_file
+
+    initialize_taxonomy_lookup()
+
+    # Read the tokens from the input text file
+    with open(input_file, 'r') as f:
+        tokens = f.readlines()
+
+    # Loop through each token and get scientific name
+    for token in tokens:
+        token = token.strip().lower()
+        matches = get_taxonomic_info(token)
+        print_taxonomy_matches(matches)
+
+if __name__ == '__main__':
+    main()
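
For orientation, here is a minimal usage sketch of the lookup API that this file adds. It is not part of the diff above; the module path comes from the file list (megadetector/taxonomy_mapping/species_lookup.py), the query string and printed fields are illustrative, and the first call triggers the multi-GB GBIF/iNat downloads described in the module's comments.

    # Hypothetical interactive session (illustrative only)
    from megadetector.taxonomy_mapping import species_lookup

    # Download/unzip (or deserialize) the taxonomy tables; required before any lookup
    species_lookup.initialize_taxonomy_lookup()

    # Raw matches from both taxonomies, for a common or scientific name
    matches = species_lookup.get_taxonomic_info('mountain lion')
    species_lookup.print_taxonomy_matches(matches, verbose=True)

    # Single preferred match, using the module's iNat-first heuristics
    m = species_lookup.get_preferred_taxonomic_match('mountain lion')
    print(m.scientific_name, m.common_name, m.taxonomic_level, m.source)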