megadetector-5.0.10-py3-none-any.whl → megadetector-5.0.11-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of megadetector might be problematic.

Files changed (226)
  1. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.10.dist-info/RECORD +0 -224
  214. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
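
Note on the scale of the deletions: the 5.0.10 wheel shipped eight top-level packages (including api, classification, data_management, detection, md_utils, md_visualization, and taxonomy_mapping; see the -8 lines in megadetector-5.0.10.dist-info/top_level.txt), while the 5.0.11 wheel declares a single top-level package (+1 line in megadetector-5.0.11.dist-info/top_level.txt). The file removals are therefore consistent with a repackaging under one namespace rather than a removal of functionality. A minimal sketch of the resulting import change, assuming the modules were relocated under a single megadetector package; the exact subpackage names are an assumption and should be verified against megadetector-5.0.11.dist-info/RECORD:

# 5.0.10: modules lived in top-level packages
from md_utils import url_utils
from taxonomy_mapping import species_lookup

# 5.0.11: assumed equivalents under the single top-level package
# (hypothetical paths; verify against the 5.0.11 RECORD)
from megadetector.utils import url_utils
from megadetector.taxonomy_mapping import species_lookup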
--- a/taxonomy_mapping/species_lookup.py
+++ /dev/null
@@ -1,834 +0,0 @@
- """
-
- species_lookup.py
-
- Look up species names (common or scientific) in the GBIF and iNaturalist
- taxonomies.
-
- Run initialize_taxonomy_lookup() before calling any other function.
-
- """
-
- #%% Constants and imports
-
- import argparse
- import pickle
- import shutil
- import zipfile
- import sys
- import os
-
- from collections import defaultdict
- from itertools import compress
- from tqdm import tqdm
- from typing import Any, Dict, List, Mapping, Sequence, Set
-
- import pandas as pd
- import numpy as np
-
- from md_utils import url_utils
-
- taxonomy_download_dir = os.path.expanduser('~/taxonomy')
-
- taxonomy_urls = {
-     'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
-     'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
- }
-
- files_to_unzip = {
-     # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
-     # 12.2023, this is no longer the case.
-     # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
-     'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
-     'iNaturalist': ['taxa.csv']
- }
-
- # As of 2020.05.12:
- #
- # GBIF: ~777MB zipped, ~1.6GB taxonomy
- # iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
-
- # As of 2023.12.29:
- #
- # GBIF: ~948MB zipped, ~2.2GB taxonomy
- # iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
-
- os.makedirs(taxonomy_download_dir, exist_ok=True)
- for taxonomy_name in taxonomy_urls:
-     taxonomy_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
-     os.makedirs(taxonomy_dir, exist_ok=True)
-
- serialized_structures_file = os.path.join(taxonomy_download_dir,
-                                           'serialized_taxonomies.p')
-
- # These are un-initialized globals that must be initialized by
- # the initialize_taxonomy_lookup() function below.
- inat_taxonomy = None # : pd.DataFrame
- gbif_taxonomy = None # : pd.DataFrame
- gbif_common_mapping = None # : pd.DataFrame
- inat_taxon_id_to_row = None # : Dict[np.int64, int]
- gbif_taxon_id_to_row = None # : Dict[np.int64, int]
- inat_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
- inat_vernacular_to_taxon_id = None # : Dict[str, np.int64]
- inat_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
- inat_scientific_to_taxon_id = None # : Dict[str, np.int64]
- gbif_taxon_id_to_vernacular = None # : Dict[np.int64, Set[str]]
- gbif_vernacular_to_taxon_id = None # : Dict[str, np.int64]
- gbif_taxon_id_to_scientific = None # : Dict[np.int64, Set[str]]
- gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
-
-
- #%% Functions
-
- # Initialization function
-
- def initialize_taxonomy_lookup(force_init=False) -> None:
-     """
-     Initialize this module by doing the following:
-
-     * Downloads and unzips the current GBIF and iNat taxonomies if necessary
-       (only unzips what's necessary, but does not delete the original zipfiles)
-     * Builds a bunch of dictionaries and tables to facilitate lookup
-     * Serializes those tables via pickle
-     * Skips all of the above if the serialized pickle file already exists
-     """
-
-     global inat_taxonomy,\
-         gbif_taxonomy,\
-         gbif_common_mapping,\
-         inat_taxon_id_to_row,\
-         gbif_taxon_id_to_row,\
-         inat_taxon_id_to_vernacular,\
-         inat_vernacular_to_taxon_id,\
-         inat_taxon_id_to_scientific,\
-         inat_scientific_to_taxon_id,\
-         gbif_taxon_id_to_vernacular,\
-         gbif_vernacular_to_taxon_id,\
-         gbif_taxon_id_to_scientific,\
-         gbif_scientific_to_taxon_id
-
-
-     ## Load serialized taxonomy info if we've already saved it
-
-     if (not force_init) and (inat_taxonomy is not None):
-         print('Skipping taxonomy re-init')
-         return
-
-     if (not force_init) and (os.path.isfile(serialized_structures_file)):
-
-         print(f'De-serializing taxonomy data from {serialized_structures_file}')
-
-         with open(serialized_structures_file, 'rb') as f:
-             structures_to_serialize = pickle.load(f)
-
-         inat_taxonomy,\
-             gbif_taxonomy,\
-             gbif_common_mapping,\
-             inat_taxon_id_to_row,\
-             gbif_taxon_id_to_row,\
-             inat_taxon_id_to_vernacular,\
-             inat_vernacular_to_taxon_id,\
-             inat_taxon_id_to_scientific,\
-             inat_scientific_to_taxon_id,\
-             gbif_taxon_id_to_vernacular,\
-             gbif_vernacular_to_taxon_id,\
-             gbif_taxon_id_to_scientific,\
-             gbif_scientific_to_taxon_id = structures_to_serialize
-
-         return
-
-
-     ## If we don't have serialized taxonomy info, create it from scratch.
-
-     # Download and unzip taxonomy files
-     # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
-     for taxonomy_name, zip_url in taxonomy_urls.items():
-
-         need_to_download = False
-
-         if force_init:
-             need_to_download = True
-
-         # Don't download the zipfile if we've already unzipped what we need
-         for fn in files_to_unzip[taxonomy_name]:
-             target_file = os.path.join(
-                 taxonomy_download_dir, taxonomy_name, fn)
-             if not os.path.isfile(target_file):
-                 need_to_download = True
-                 break
-         if not need_to_download:
-             print(f'Bypassing download of {taxonomy_name}, all files available')
-             continue
-
-         zipfile_path = os.path.join(
-             taxonomy_download_dir, zip_url.split('/')[-1])
-
-         # Bypasses download if the file exists already (unless force_init is set)
-         url_utils.download_url(
-             zip_url, os.path.join(zipfile_path),
-             progress_updater=url_utils.DownloadProgressBar(),
-             verbose=True,force_download=force_init)
-
-         # Unzip the files we need
-         files_we_need = files_to_unzip[taxonomy_name]
-
-         with zipfile.ZipFile(zipfile_path, 'r') as zipH:
-
-             for fn in files_we_need:
-                 print('Unzipping {}'.format(fn))
-                 target_file = os.path.join(
-                     taxonomy_download_dir, taxonomy_name, os.path.basename(fn))
-
-                 if (not force_init) and (os.path.isfile(target_file)):
-                     print(f'Bypassing unzip of {target_file}, file exists')
-                 else:
-                     os.makedirs(os.path.basename(target_file),exist_ok=True)
-                     with zipH.open(fn) as zf, open(target_file, 'wb') as f:
-                         shutil.copyfileobj(zf, f)
-
-             # ...for each file that we need from this zipfile
-
-         # Remove the zipfile
-         # os.remove(zipfile_path)
-
-     # ...for each taxonomy
-
-
-     # Create dataframes from each of the taxonomy files, and the GBIF common
-     # name file
-
-     # Load iNat taxonomy
-     inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
-     print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
-     inat_taxonomy = pd.read_csv(inat_taxonomy_file)
-     inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
-     inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
-
-     # Load GBIF taxonomy
-     gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
-     print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
-     gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
-     gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
-     gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
-
-     # Remove questionable rows from the GBIF taxonomy
-     gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
-     gbif_taxonomy = gbif_taxonomy.reset_index()
-
-     # Load GBIF vernacular name mapping
-     gbif_common_mapping = pd.read_csv(os.path.join(
-         taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
-     gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
-
-     # Only keep English mappings
-     gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
-     gbif_common_mapping = gbif_common_mapping.reset_index()
-
-
-     # Convert everything to lowercase
-
-     def convert_df_to_lowercase(df):
-         df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
-         return df
-
-     inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
-     gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
-     gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
-
-
-     # For each taxonomy table, create a mapping from taxon IDs to rows
-
-     inat_taxon_id_to_row = {}
-     gbif_taxon_id_to_row = {}
-
-     print('Building iNat taxonID --> row table')
-     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-         inat_taxon_id_to_row[row['taxonID']] = i_row
-
-     print('Building GBIF taxonID --> row table')
-     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-         gbif_taxon_id_to_row[row['taxonID']] = i_row
-
-
-     # Create name mapping dictionaries
-
-     inat_taxon_id_to_vernacular = defaultdict(set)
-     inat_vernacular_to_taxon_id = defaultdict(set)
-     inat_taxon_id_to_scientific = defaultdict(set)
-     inat_scientific_to_taxon_id = defaultdict(set)
-
-     gbif_taxon_id_to_vernacular = defaultdict(set)
-     gbif_vernacular_to_taxon_id = defaultdict(set)
-     gbif_taxon_id_to_scientific = defaultdict(set)
-     gbif_scientific_to_taxon_id = defaultdict(set)
-
-
-     # Build iNat dictionaries
-
-     print('Building lookup dictionaries for iNat taxonomy')
-
-     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-
-         taxon_id = row['taxonID']
-         vernacular_name = row['vernacularName']
-         scientific_name = row['scientificName']
-
-         if len(vernacular_name) > 0:
-             inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-             inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
-
-         assert len(scientific_name) > 0
-         inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
-         inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
-
-
-     # Build GBIF dictionaries
-
-     print('Building lookup dictionaries for GBIF taxonomy')
-
-     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-
-         taxon_id = row['taxonID']
-
-         # The canonical name is the Latin name; the "scientific name"
-         # include the taxonomy name.
-         #
-         # http://globalnames.org/docs/glossary/
-
-         scientific_name = row['canonicalName']
-
-         # This only seems to happen for really esoteric species that aren't
-         # likely to apply to our problems, but doing this for completeness.
-         if len(scientific_name) == 0:
-             scientific_name = row['scientificName']
-
-         assert len(scientific_name) > 0
-         gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
-         gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
-
-     for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
-
-         taxon_id = row['taxonID']
-
-         # Don't include taxon IDs that were removed from the master table
-         if taxon_id not in gbif_taxon_id_to_scientific:
-             continue
-
-         vernacular_name = row['vernacularName']
-
-         assert len(vernacular_name) > 0
-         gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-         gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
-
-
-     # Save everything to file
-
-     structures_to_serialize = [
-         inat_taxonomy,
-         gbif_taxonomy,
-         gbif_common_mapping,
-         inat_taxon_id_to_row,
-         gbif_taxon_id_to_row,
-         inat_taxon_id_to_vernacular,
-         inat_vernacular_to_taxon_id,
-         inat_taxon_id_to_scientific,
-         inat_scientific_to_taxon_id,
-         gbif_taxon_id_to_vernacular,
-         gbif_vernacular_to_taxon_id,
-         gbif_taxon_id_to_scientific,
-         gbif_scientific_to_taxon_id
-     ]
-
-     print('Serializing to {}...'.format(serialized_structures_file), end='')
-     if not os.path.isfile(serialized_structures_file):
-         with open(serialized_structures_file, 'wb') as p:
-             pickle.dump(structures_to_serialize, p)
-     print(' done')
-
- # ...def initialize_taxonomy_lookup(...)
-
-
- def get_scientific_name_from_row(r):
-     """
-     r: a dataframe that's really a row in one of our taxonomy tables
-     """
-
-     if 'canonicalName' in r and len(r['canonicalName']) > 0:
-         scientific_name = r['canonicalName']
-     else:
-         scientific_name = r['scientificName']
-     return scientific_name
-
-
- def taxonomy_row_to_string(r):
-     """
-     r: a dataframe that's really a row in one of our taxonomy tables
-     """
-
-     if 'vernacularName' in r:
-         common_string = ' (' + r['vernacularName'] + ')'
-     else:
-         common_string = ''
-     scientific_name = get_scientific_name_from_row(r)
-
-     return r['taxonRank'] + ' ' + scientific_name + common_string
-
-
- def traverse_taxonomy(matching_rownums: Sequence[int],
-                       taxon_id_to_row: Mapping[str, int],
-                       taxon_id_to_vernacular: Mapping[str, Set[str]],
-                       taxonomy: pd.DataFrame,
-                       source_name: str,
-                       query: str) -> List[Dict[str, Any]]:
-     """
-     Given a data frame that's a set of rows from one of our taxonomy tables,
-     walks the taxonomy hierarchy from each row to put together a full taxonomy
-     tree, then prunes redundant trees (e.g. if we had separate hits for a
-     species and the genus that contains that species.)
-
-     Returns a list of dicts:
-     [
-         {
-             'source': 'inat' or 'gbif',
-             'taxonomy': [(taxon_id, taxon_rank, scientific_name, [common names])]
-         },
-         ...
-     ]
-     """
-
-     # list of dicts: {'source': source_name, 'taxonomy': match_details}
-     matching_trees: List[Dict[str, Any]] = []
-
-     # i_match = 0
-     for i_match in matching_rownums:
-
-         # list of (taxon_id, taxonRank, scientific name, [vernacular names])
-         # corresponding to an exact match and its parents
-         match_details = []
-         current_row = taxonomy.iloc[i_match]
-
-         # Walk taxonomy hierarchy
-         while True:
-
-             taxon_id = current_row['taxonID']
-             vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
-             match_details.append((taxon_id, current_row['taxonRank'],
-                                   get_scientific_name_from_row(current_row),
-                                   vernacular_names))
-
-             if np.isnan(current_row['parentNameUsageID']):
-                 break
-             parent_taxon_id = current_row['parentNameUsageID'].astype('int64')
-             if parent_taxon_id not in taxon_id_to_row:
-                 # This can happen because we remove questionable rows from the
-                 # GBIF taxonomy
-                 # print(f'Warning: no row exists for parent_taxon_id {parent_taxon_id},' + \
-                 #     f'child taxon_id: {taxon_id}, query: {query}')
-                 break
-             i_parent_row = taxon_id_to_row[parent_taxon_id]
-             current_row = taxonomy.iloc[i_parent_row]
-
-             # The GBIF taxonomy contains unranked entries
-             if current_row['taxonRank'] == 'unranked':
-                 break
-
-         # ...while there is taxonomy left to walk
-
-         matching_trees.append({'source': source_name,
-                                'taxonomy': match_details})
-
-     # ...for each match
-
-     # Remove redundant matches
-     b_valid_tree = [True] * len(matching_rownums)
-     # i_tree_a = 0; tree_a = matching_trees[i_tree_a]
-     for i_tree_a, tree_a in enumerate(matching_trees):
-
-         tree_a_primary_taxon_id = tree_a['taxonomy'][0][0]
-
-         # i_tree_b = 1; tree_b = matching_trees[i_tree_b]
-         for i_tree_b, tree_b in enumerate(matching_trees):
-
-             if i_tree_a == i_tree_b:
-                 continue
-
-             # If tree a's primary taxon ID is inside tree b, discard tree a
-             #
-             # taxonomy_level_b = tree_b['taxonomy'][0]
-             for taxonomy_level_b in tree_b['taxonomy']:
-                 if tree_a_primary_taxon_id == taxonomy_level_b[0]:
-                     b_valid_tree[i_tree_a] = False
-                     break
-
-             # ...for each level in taxonomy B
-
-         # ...for each tree (inner)
-
-     # ...for each tree (outer)
-
-     matching_trees = list(compress(matching_trees, b_valid_tree))
-     return matching_trees
-
- # ...def traverse_taxonomy()
-
-
- def get_taxonomic_info(query: str) -> List[Dict[str, Any]]:
-     """
-     Main entry point: get taxonomic matches from both taxonomies for [query],
-     which may be a scientific or common name.
-     """
-     query = query.strip().lower()
-     # print("Finding taxonomy information for: {0}".format(query))
-
-     inat_taxon_ids = set()
-     if query in inat_scientific_to_taxon_id:
-         inat_taxon_ids |= inat_scientific_to_taxon_id[query]
-     if query in inat_vernacular_to_taxon_id:
-         inat_taxon_ids |= inat_vernacular_to_taxon_id[query]
-
-     # In GBIF, some queries hit for both common and scientific, make sure we end
-     # up with unique inputs
-     gbif_taxon_ids = set()
-     if query in gbif_scientific_to_taxon_id:
-         gbif_taxon_ids |= gbif_scientific_to_taxon_id[query]
-     if query in gbif_vernacular_to_taxon_id:
-         gbif_taxon_ids |= gbif_vernacular_to_taxon_id[query]
-
-     # If the species is not found in either taxonomy, return None
-     if (len(inat_taxon_ids) == 0) and (len(gbif_taxon_ids) == 0):
-         return []
-
-     # Both GBIF and iNat have a 1-to-1 mapping between taxon_id and row number
-     inat_row_indices = [inat_taxon_id_to_row[i] for i in inat_taxon_ids]
-     gbif_row_indices = [gbif_taxon_id_to_row[i] for i in gbif_taxon_ids]
-
-     # Walk both taxonomies
-     inat_matching_trees = traverse_taxonomy(
-         inat_row_indices, inat_taxon_id_to_row, inat_taxon_id_to_vernacular,
-         inat_taxonomy, 'inat', query)
-     gbif_matching_trees = traverse_taxonomy(
-         gbif_row_indices, gbif_taxon_id_to_row, gbif_taxon_id_to_vernacular,
-         gbif_taxonomy, 'gbif', query)
-
-     return gbif_matching_trees + inat_matching_trees
-
- # ...def get_taxonomic_info()
-
-
- def print_taxonomy_matches(matches, verbose=False):
-     """
-     Console-friendly printing function to make nicely-indentend trees
-     """
-
-     # m = matches[0]
-     for m in matches:
-
-         source = m['source']
-
-         # For example: [(9761484, 'species', 'anas platyrhynchos')]
-         for i_taxonomy_level in range(0, len(m['taxonomy'])):
-             taxonomy_level_info = m['taxonomy'][i_taxonomy_level]
-             taxonomy_level = taxonomy_level_info[1]
-             name = taxonomy_level_info[2]
-             common = taxonomy_level_info[3]
-
-             if i_taxonomy_level > 0:
-                 print('\t',end='')
-
-             print('{} {} ({})'.format(taxonomy_level, name, common), end='')
-
-             if i_taxonomy_level == 0:
-                 print(' ({})'.format(source))
-             else:
-                 print('')
-
-             if not verbose:
-                 break
-
-         # ...for each taxonomy level
-
-     # ...for each match
-
- # ...def print_taxonomy_matches()
-
-
- #%% Taxonomy functions that make subjective judgements
-
- import unicodedata
- import re
-
- def slugify(value: Any, allow_unicode: bool = False) -> str:
-     """
-     From:
-     https://github.com/django/django/blob/master/django/utils/text.py
-
-     Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
-     Remove characters that aren't alphanumerics, underscores, or hyphens.
-     Convert to lowercase. Also strip leading and trailing whitespace.
-     """
-
-     value = str(value)
-     value = unicodedata.normalize('NFKC', value)
-     if not allow_unicode:
-         value = value.encode('ascii', 'ignore').decode('ascii')
-     value = re.sub(r'[^\w\s-]', '', value.lower()).strip()
-     return re.sub(r'[-\s]+', '-', value)
-
-
- class TaxonomicMatch:
-
-     def __init__(self, scientific_name, common_name, taxonomic_level, source,
-                  taxonomy_string, match):
-         self.scientific_name = scientific_name
-         self.common_name = common_name
-         self.taxonomic_level = taxonomic_level
-         self.source = source
-         self.taxonomy_string = taxonomy_string
-         self.match = match
-
-     def __repr__(self):
-         return ('TaxonomicMatch('
-                 f'scientific_name={self.scientific_name}, '
-                 f'common_name={self.common_name}, '
-                 f'taxonomic_level={self.taxonomic_level}, '
-                 f'source={self.source}')
-
-
- hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
-                     'ruffed', 'browed', 'eating', 'striped', 'shanked',
-                     'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
-                     'necked']
-
- def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
-     """
-     Wrapper for species_lookup.py, but expressing a variety of heuristics and
-     preferences that are specific to our scenario.
-     """
-
-     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
-     if (len(m.scientific_name) > 0) or (not retry):
-         return m
-
-     for s in hyphenated_terms:
-         query = query.replace(' ' + s,'-' + s)
-     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
-     return m
-
-
- def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
-     query = query.lower().strip().replace('_', ' ')
-     query = query.replace('unidentified','')
-     query = query.replace('unknown','')
-     if query.endswith(' sp'):
-         query = query.replace(' sp','')
-     if query.endswith(' group'):
-         query = query.replace(' group','')
-
-     query = query.strip()
-
-     # query = 'person'
-     matches = get_taxonomic_info(query)
-
-     # Do we have an iNat match?
-     inat_matches = [m for m in matches if m['source'] == 'inat']
-     gbif_matches = [m for m in matches if m['source'] == 'gbif']
-
-     # print_taxonomy_matches(inat_matches, verbose=True)
-     # print_taxonomy_matches(gbif_matches, verbose=True)
-
-     scientific_name = ''
-     common_name = ''
-     taxonomic_level = ''
-     match = ''
-     source = ''
-     taxonomy_string = ''
-
-     n_inat_matches = len(inat_matches)
-     n_gbif_matches = len(gbif_matches)
-
-     selected_matches = None
-
-     assert taxonomy_preference in ['gbif','inat'],\
-         'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
-     if n_inat_matches > 0 and taxonomy_preference == 'inat':
-         selected_matches = 'inat'
-     elif n_gbif_matches > 0:
-         selected_matches = 'gbif'
-
-     if selected_matches == 'inat':
-
-         i_match = 0
-
-         if len(inat_matches) > 1:
-             # print('Warning: multiple iNat matches for {}'.format(query))
-
-             # Prefer chordates... most of the names that aren't what we want
-             # are esoteric insects, like a moth called "cheetah"
-             #
-             # If we can't find a chordate, just take the first match.
-             #
-             # i_test_match = 0
-             for i_test_match, match in enumerate(inat_matches):
-                 found_vertebrate = False
-                 taxonomy = match['taxonomy']
-                 for taxonomy_level in taxonomy:
-                     taxon_rank = taxonomy_level[1]
-                     scientific_name = taxonomy_level[2]
-                     if taxon_rank == 'phylum' and scientific_name == 'chordata':
-                         i_match = i_test_match
-                         found_vertebrate = True
-                         break
-                 if found_vertebrate:
-                     break
-
-         match = inat_matches[i_match]['taxonomy']
-
-         # This is (taxonID, taxonLevel, scientific, [list of common])
-         lowest_level = match[0]
-         taxonomic_level = lowest_level[1]
-         scientific_name = lowest_level[2]
-         assert len(scientific_name) > 0
-         common_names = lowest_level[3]
-         if len(common_names) > 1:
-             # print(f'Warning: multiple iNat common names for {query}')
-             # Default to returning the query
-             if query in common_names:
-                 common_name = query
-             else:
-                 common_name = common_names[0]
-         elif len(common_names) > 0:
-             common_name = common_names[0]
-
-         # print(f'Matched iNat {query} to {scientific_name},{common_name}')
-         source = 'inat'
-
-     # ...if we had iNat matches
-
-     # If we either prefer GBIF or didn't have iNat matches
-     #
-     # Code is deliberately redundant here; I'm expecting some subtleties in how
-     # handle GBIF and iNat.
-     elif selected_matches == 'gbif':
-
-         i_match = 0
-
-         if len(gbif_matches) > 1:
-             # print('Warning: multiple GBIF matches for {}'.format(query))
-
-             # Prefer chordates... most of the names that aren't what we want
-             # are esoteric insects, like a moth called "cheetah"
-             #
-             # If we can't find a chordate, just take the first match.
-             #
-             # i_test_match = 0
-             for i_test_match, match in enumerate(gbif_matches):
-                 found_vertebrate = False
-                 taxonomy = match['taxonomy']
-                 for taxonomy_level in taxonomy:
-                     taxon_rank = taxonomy_level[1]
-                     scientific_name = taxonomy_level[2]
-                     if taxon_rank == 'phylum' and scientific_name == 'chordata':
-                         i_match = i_test_match
-                         found_vertebrate = True
-                         break
-                 if found_vertebrate:
-                     break
-
-         match = gbif_matches[i_match]['taxonomy']
-
-         # This is (taxonID, taxonLevel, scientific, [list of common])
-         lowest_level = match[0]
-         taxonomic_level = lowest_level[1]
-         scientific_name = lowest_level[2]
-         assert len(scientific_name) > 0
-
-         common_names = lowest_level[3]
-         if len(common_names) > 1:
-             # print(f'Warning: multiple GBIF common names for {query}')
-             # Default to returning the query
-             if query in common_names:
-                 common_name = query
-             else:
-                 common_name = common_names[0]
-         elif len(common_names) > 0:
-             common_name = common_names[0]
-
-         source = 'gbif'
-
-     # ...if we needed to look in the GBIF taxonomy
-
-     taxonomy_string = str(match)
-
-     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
-                           taxonomy_string, match),query
-
- # ...def _get_preferred_taxonomic_match()
-
-
- #%% Interactive drivers and debug
-
- if False:
-
-     #%% Initialization
-
-     initialize_taxonomy_lookup()
-
-
-     #%% Taxonomic lookup
-
-     # query = 'lion'
-     query = 'xenoperdix'
-     matches = get_taxonomic_info(query)
-     # print(matches)
-
-     print_taxonomy_matches(matches,verbose=True)
-
-     print('\n\n')
-
-     # Print the taxonomy in the taxonomy spreadsheet format
-     assert matches[1]['source'] == 'inat'
-     t = str(matches[1]['taxonomy'])
-     print(t)
-     import clipboard; clipboard.copy(t)
-
-
-     #%% Directly access the taxonomy tables
-
-     taxon_ids = gbif_vernacular_to_taxon_id['lion']
-     for taxon_id in taxon_ids:
-         i_row = gbif_taxon_id_to_row[taxon_id]
-         print(taxonomy_row_to_string(gbif_taxonomy.iloc[i_row]))
-
-
- #%% Command-line driver
-
- def main():
-
-     # Read command line inputs (absolute path)
-     parser = argparse.ArgumentParser()
-     parser.add_argument('input_file')
-
-     if len(sys.argv[1:]) == 0:
-         parser.print_help()
-         parser.exit()
-
-     args = parser.parse_args()
-     input_file = args.input_file
-
-     initialize_taxonomy_lookup()
-
-     # Read the tokens from the input text file
-     with open(input_file, 'r') as f:
-         tokens = f.readlines()
-
-     # Loop through each token and get scientific name
-     for token in tokens:
-         token = token.strip().lower()
-         matches = get_taxonomic_info(token)
-         print_taxonomy_matches(matches)
-
- if __name__ == '__main__':
-     main()
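
For reference, the removed species_lookup module's documented call sequence ("Run initialize_taxonomy_lookup() before calling any other function") looks like the sketch below; the import path is an assumption based on the single top-level package in 5.0.11 and is not confirmed by this diff:

# Hypothetical 5.0.11 import path for the module deleted above
from megadetector.taxonomy_mapping import species_lookup

# The first call downloads and caches the GBIF and iNat taxonomy files (several GB)
species_lookup.initialize_taxonomy_lookup()

# Query by common or scientific name; returns a TaxonomicMatch
m = species_lookup.get_preferred_taxonomic_match('mountain lion')
print(m.taxonomic_level, m.scientific_name, m.source)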