megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (197)
  1. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  2. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  3. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  4. megadetector/classification/aggregate_classifier_probs.py +3 -3
  5. megadetector/classification/analyze_failed_images.py +5 -5
  6. megadetector/classification/cache_batchapi_outputs.py +5 -5
  7. megadetector/classification/create_classification_dataset.py +11 -12
  8. megadetector/classification/crop_detections.py +10 -10
  9. megadetector/classification/csv_to_json.py +8 -8
  10. megadetector/classification/detect_and_crop.py +13 -15
  11. megadetector/classification/efficientnet/model.py +8 -8
  12. megadetector/classification/efficientnet/utils.py +6 -5
  13. megadetector/classification/evaluate_model.py +7 -7
  14. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  15. megadetector/classification/json_to_azcopy_list.py +1 -1
  16. megadetector/classification/json_validator.py +29 -32
  17. megadetector/classification/map_classification_categories.py +9 -9
  18. megadetector/classification/merge_classification_detection_output.py +12 -9
  19. megadetector/classification/prepare_classification_script.py +19 -19
  20. megadetector/classification/prepare_classification_script_mc.py +26 -26
  21. megadetector/classification/run_classifier.py +4 -4
  22. megadetector/classification/save_mislabeled.py +6 -6
  23. megadetector/classification/train_classifier.py +1 -1
  24. megadetector/classification/train_classifier_tf.py +9 -9
  25. megadetector/classification/train_utils.py +10 -10
  26. megadetector/data_management/annotations/annotation_constants.py +1 -2
  27. megadetector/data_management/camtrap_dp_to_coco.py +79 -46
  28. megadetector/data_management/cct_json_utils.py +103 -103
  29. megadetector/data_management/cct_to_md.py +49 -49
  30. megadetector/data_management/cct_to_wi.py +33 -33
  31. megadetector/data_management/coco_to_labelme.py +75 -75
  32. megadetector/data_management/coco_to_yolo.py +210 -193
  33. megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
  34. megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
  35. megadetector/data_management/databases/integrity_check_json_db.py +228 -200
  36. megadetector/data_management/databases/subset_json_db.py +33 -33
  37. megadetector/data_management/generate_crops_from_cct.py +88 -39
  38. megadetector/data_management/get_image_sizes.py +54 -49
  39. megadetector/data_management/labelme_to_coco.py +133 -125
  40. megadetector/data_management/labelme_to_yolo.py +159 -73
  41. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  42. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  43. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  44. megadetector/data_management/lila/download_lila_subset.py +21 -24
  45. megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
  46. megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
  47. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  48. megadetector/data_management/lila/lila_common.py +73 -70
  49. megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
  50. megadetector/data_management/mewc_to_md.py +344 -340
  51. megadetector/data_management/ocr_tools.py +262 -255
  52. megadetector/data_management/read_exif.py +249 -227
  53. megadetector/data_management/remap_coco_categories.py +90 -28
  54. megadetector/data_management/remove_exif.py +81 -21
  55. megadetector/data_management/rename_images.py +187 -187
  56. megadetector/data_management/resize_coco_dataset.py +588 -120
  57. megadetector/data_management/speciesnet_to_md.py +41 -41
  58. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  59. megadetector/data_management/yolo_output_to_md_output.py +248 -122
  60. megadetector/data_management/yolo_to_coco.py +333 -191
  61. megadetector/detection/change_detection.py +832 -0
  62. megadetector/detection/process_video.py +340 -337
  63. megadetector/detection/pytorch_detector.py +358 -278
  64. megadetector/detection/run_detector.py +399 -186
  65. megadetector/detection/run_detector_batch.py +404 -377
  66. megadetector/detection/run_inference_with_yolov5_val.py +340 -327
  67. megadetector/detection/run_tiled_inference.py +257 -249
  68. megadetector/detection/tf_detector.py +24 -24
  69. megadetector/detection/video_utils.py +332 -295
  70. megadetector/postprocessing/add_max_conf.py +19 -11
  71. megadetector/postprocessing/categorize_detections_by_size.py +45 -45
  72. megadetector/postprocessing/classification_postprocessing.py +468 -433
  73. megadetector/postprocessing/combine_batch_outputs.py +23 -23
  74. megadetector/postprocessing/compare_batch_results.py +590 -525
  75. megadetector/postprocessing/convert_output_format.py +106 -102
  76. megadetector/postprocessing/create_crop_folder.py +347 -147
  77. megadetector/postprocessing/detector_calibration.py +173 -168
  78. megadetector/postprocessing/generate_csv_report.py +508 -499
  79. megadetector/postprocessing/load_api_results.py +48 -27
  80. megadetector/postprocessing/md_to_coco.py +133 -102
  81. megadetector/postprocessing/md_to_labelme.py +107 -90
  82. megadetector/postprocessing/md_to_wi.py +40 -40
  83. megadetector/postprocessing/merge_detections.py +92 -114
  84. megadetector/postprocessing/postprocess_batch_results.py +319 -301
  85. megadetector/postprocessing/remap_detection_categories.py +91 -38
  86. megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
  87. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  88. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  89. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
  90. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  91. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  92. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  93. megadetector/postprocessing/validate_batch_results.py +70 -70
  94. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  95. megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
  96. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
  97. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
  98. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  99. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  100. megadetector/taxonomy_mapping/species_lookup.py +156 -74
  101. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  102. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  103. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  104. megadetector/utils/ct_utils.py +1049 -211
  105. megadetector/utils/directory_listing.py +21 -77
  106. megadetector/utils/gpu_test.py +22 -22
  107. megadetector/utils/md_tests.py +632 -529
  108. megadetector/utils/path_utils.py +1520 -431
  109. megadetector/utils/process_utils.py +41 -41
  110. megadetector/utils/split_locations_into_train_val.py +62 -62
  111. megadetector/utils/string_utils.py +148 -27
  112. megadetector/utils/url_utils.py +489 -176
  113. megadetector/utils/wi_utils.py +2658 -2526
  114. megadetector/utils/write_html_image_list.py +137 -137
  115. megadetector/visualization/plot_utils.py +34 -30
  116. megadetector/visualization/render_images_with_thumbnails.py +39 -74
  117. megadetector/visualization/visualization_utils.py +487 -435
  118. megadetector/visualization/visualize_db.py +232 -198
  119. megadetector/visualization/visualize_detector_output.py +82 -76
  120. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
  121. megadetector-10.0.0.dist-info/RECORD +139 -0
  122. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
  123. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  124. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  125. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
  126. megadetector/api/batch_processing/api_core/server.py +0 -294
  127. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  128. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  129. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  130. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  131. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  132. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  133. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  134. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  135. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  136. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  137. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  138. megadetector/api/synchronous/__init__.py +0 -0
  139. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  140. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  141. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  142. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  143. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  144. megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
  145. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  146. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  147. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  148. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  149. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  150. megadetector/data_management/importers/awc_to_json.py +0 -191
  151. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  152. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  153. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  154. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  155. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  156. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  157. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  158. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  159. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  160. megadetector/data_management/importers/ena24_to_json.py +0 -276
  161. megadetector/data_management/importers/filenames_to_json.py +0 -386
  162. megadetector/data_management/importers/helena_to_cct.py +0 -283
  163. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  164. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  165. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  166. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  167. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  168. megadetector/data_management/importers/missouri_to_json.py +0 -490
  169. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  170. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  171. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  172. megadetector/data_management/importers/pc_to_json.py +0 -365
  173. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  174. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  175. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  176. megadetector/data_management/importers/rspb_to_json.py +0 -356
  177. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  178. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  179. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  180. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  181. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  182. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  183. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  184. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  185. megadetector/data_management/importers/ubc_to_json.py +0 -399
  186. megadetector/data_management/importers/umn_to_json.py +0 -507
  187. megadetector/data_management/importers/wellington_to_json.py +0 -263
  188. megadetector/data_management/importers/wi_to_json.py +0 -442
  189. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  190. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  191. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  192. megadetector/utils/azure_utils.py +0 -178
  193. megadetector/utils/sas_blob_utils.py +0 -509
  194. megadetector-5.0.28.dist-info/RECORD +0 -209
  195. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  196. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  197. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -32,27 +32,18 @@ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
 
  taxonomy_urls = {
  'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
- 'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
+ 'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
  }
 
  files_to_unzip = {
- # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
- # 12.2023, this is no longer the case.
- # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
  'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
- 'iNaturalist': ['taxa.csv']
+ 'iNaturalist': ['taxa.csv','VernacularNames-english.csv']
  }
 
- # As of 2020.05.12:
+ # As of 2025.06.24:
  #
- # GBIF: ~777MB zipped, ~1.6GB taxonomy
- # iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
-
- # As of 2023.12.29:
- #
- # GBIF: ~948MB zipped, ~2.2GB taxonomy
- # iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
+ # GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+ # iNat: 71MB zipped, 415MB of relevant content unzipped
 
  os.makedirs(taxonomy_download_dir, exist_ok=True)
  for taxonomy_name in taxonomy_urls:
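
For orientation, a minimal sketch of the download-and-extract step these two dictionaries drive (the cache folder and file names follow the constants above; the use of urllib.request and zipfile here is an illustration, not necessarily how the module itself fetches the archives):

    import os
    import urllib.request
    import zipfile

    taxonomy_download_dir = os.path.expanduser('~/taxonomy')

    taxonomy_urls = {
        'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
        'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
    }

    files_to_unzip = {
        'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
        'iNaturalist': ['taxa.csv', 'VernacularNames-english.csv']
    }

    for taxonomy_name, zip_url in taxonomy_urls.items():
        # Cache each archive in its own subfolder, e.g. ~/taxonomy/GBIF/backbone.zip
        target_dir = os.path.join(taxonomy_download_dir, taxonomy_name)
        os.makedirs(target_dir, exist_ok=True)
        zipfile_path = os.path.join(target_dir, zip_url.split('/')[-1])
        if not os.path.isfile(zipfile_path):
            urllib.request.urlretrieve(zip_url, zipfile_path)
        # Extract only the files the lookup code actually reads
        with zipfile.ZipFile(zipfile_path) as z:
            for fn in files_to_unzip[taxonomy_name]:
                z.extract(fn, path=target_dir)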
@@ -83,7 +74,7 @@ gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
 
  # Initialization function
 
- def initialize_taxonomy_lookup(force_init=False) -> None:
+ def initialize_taxonomy_lookup(force_init=False):
  """
  Initialize this module by doing the following:
 
@@ -92,8 +83,14 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  * Builds a bunch of dictionaries and tables to facilitate lookup
  * Serializes those tables via pickle
  * Skips all of the above if the serialized pickle file already exists
+
+ Args:
+ force_init (bool, optional): force re-download and parsing of the source .zip files,
+ even if the cached .p file already exists
  """
 
+ #%%
+
  global inat_taxonomy,\
  gbif_taxonomy,\
  gbif_common_mapping,\
@@ -109,12 +106,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_scientific_to_taxon_id
 
 
- ## Load serialized taxonomy info if we've already saved it
+ #%% Load serialized taxonomy info if we've already saved it
 
  if (not force_init) and (inat_taxonomy is not None):
  print('Skipping taxonomy re-init')
  return
-
+
  if (not force_init) and (os.path.isfile(serialized_structures_file)):
 
  print(f'De-serializing taxonomy data from {serialized_structures_file}')
@@ -135,18 +132,17 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_vernacular_to_taxon_id,\
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id = structures_to_serialize
-
+
  return
 
 
- ## If we don't have serialized taxonomy info, create it from scratch.
+ #%% Download and unzip taxonomy files
 
- # Download and unzip taxonomy files
  # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
  for taxonomy_name, zip_url in taxonomy_urls.items():
 
  need_to_download = False
-
+
  if force_init:
  need_to_download = True
 
@@ -189,21 +185,44 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
  # ...for each file that we need from this zipfile
 
- # Remove the zipfile
- # os.remove(zipfile_path)
-
  # ...for each taxonomy
 
 
- # Create dataframes from each of the taxonomy files, and the GBIF common
- # name file
+ #%% Create dataframes from each of the taxonomy/vernacular files
 
  # Load iNat taxonomy
  inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
  print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
  inat_taxonomy = pd.read_csv(inat_taxonomy_file)
  inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
- inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
+
+ # Delete columns we won't use. The "taxonID" column is a non-int version of "ID"
+ inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+ # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+ #
+ # https://www.inaturalist.org/taxa/71262
+ #
+ # Convert this column to be integer-valued, using only the last token of the URL
+ inat_taxonomy['parentNameUsageID'] = \
+ inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+ # Rename the "id" column to "taxonID"
+ inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+ assert 'id' not in inat_taxonomy.columns
+ assert 'taxonID' in inat_taxonomy.columns
+
+ # Load iNat common name mapping
+ inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+ inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+ inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+ inat_common_mapping = inat_common_mapping.drop(['language','locality','countryCode',
+ 'source','lexicon','contributor','created'], axis=1)
+ assert 'id' in inat_common_mapping.columns
+ assert 'taxonID' not in inat_common_mapping.columns
+ assert 'vernacularName' in inat_common_mapping.columns
 
  # Load GBIF taxonomy
  gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
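
The parentNameUsageID handling added above is easy to sanity-check in isolation; a small sketch with a made-up two-row frame (the sample URL is the one quoted in the comment, and the None row stands in for a root taxon with no parent):

    import pandas as pd

    # Hypothetical two-row excerpt of the iNat taxa.csv parentNameUsageID column
    df = pd.DataFrame({'parentNameUsageID':
                       ['https://www.inaturalist.org/taxa/71262', None]})

    # Keep only the last token of the URL, treat missing parents as 0,
    # and make the whole column integer-valued
    df['parentNameUsageID'] = \
        df['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)

    print(df['parentNameUsageID'].tolist())  # [71262, 0]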
@@ -211,12 +230,20 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8',on_bad_lines='warn')
  gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
  gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+ gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
 
  # Remove questionable rows from the GBIF taxonomy
  gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
  gbif_taxonomy = gbif_taxonomy.reset_index()
 
- # Load GBIF vernacular name mapping
+ gbif_taxonomy = gbif_taxonomy.drop(['datasetID','acceptedNameUsageID','originalNameUsageID',
+ 'scientificNameAuthorship','nameAccordingTo','namePublishedIn',
+ 'taxonomicStatus','nomenclaturalStatus','taxonRemarks'], axis=1)
+
+ assert 'taxonID' in gbif_taxonomy.columns
+ assert 'scientificName' in gbif_taxonomy.columns
+
+ # Load GBIF common name mapping
  gbif_common_mapping = pd.read_csv(os.path.join(
  taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
  gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
@@ -225,6 +252,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
  gbif_common_mapping = gbif_common_mapping.reset_index()
 
+ gbif_common_mapping = gbif_common_mapping.drop(['language','country','countryCode','sex',
+ 'lifeStage','source'],axis=1)
+
+ assert 'taxonID' in gbif_common_mapping.columns
+ assert 'vernacularName' in gbif_common_mapping.columns
+
 
  # Convert everything to lowercase
 
@@ -235,23 +268,28 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
  gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
  gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+ inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
 
 
- # For each taxonomy table, create a mapping from taxon IDs to rows
+ ##%% For each taxonomy table, create a mapping from taxon IDs to rows
 
  inat_taxon_id_to_row = {}
  gbif_taxon_id_to_row = {}
 
  print('Building iNat taxonID --> row table')
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
- inat_taxon_id_to_row[row['taxonID']] = i_row
+ taxon_id = row['taxonID']
+ assert isinstance(taxon_id, int)
+ inat_taxon_id_to_row[taxon_id] = i_row
 
  print('Building GBIF taxonID --> row table')
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
- gbif_taxon_id_to_row[row['taxonID']] = i_row
+ taxon_id = row['taxonID']
+ assert isinstance(taxon_id, int)
+ gbif_taxon_id_to_row[taxon_id] = i_row
 
 
- # Create name mapping dictionaries
+ ##%% Create name mapping dictionaries
 
  inat_taxon_id_to_vernacular = defaultdict(set)
  inat_vernacular_to_taxon_id = defaultdict(set)
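
Each of these lookup tables follows the same defaultdict(set) pattern, which tolerates duplicate rows and names that legitimately map to more than one taxon ID; a toy illustration with made-up IDs and names:

    from collections import defaultdict

    # Hypothetical (taxonID, scientificName) pairs in the shape iterated below
    rows = [(1001, 'puma concolor'), (1001, 'puma concolor'), (2002, 'canis lupus')]

    taxon_id_to_scientific = defaultdict(set)
    scientific_to_taxon_id = defaultdict(set)

    for taxon_id, scientific_name in rows:
        # add() on a set makes repeated rows harmless, and a name that appears
        # under several taxon IDs simply accumulates all of them
        taxon_id_to_scientific[taxon_id].add(scientific_name)
        scientific_to_taxon_id[scientific_name].add(taxon_id)

    print(scientific_to_taxon_id['puma concolor'])  # {1001}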
@@ -267,32 +305,61 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  # Build iNat dictionaries
 
  print('Building lookup dictionaries for iNat taxonomy')
-
+
+ # iNat Scientific name mapping
+
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
  taxon_id = row['taxonID']
- vernacular_name = row['vernacularName']
- scientific_name = row['scientificName']
-
- if len(vernacular_name) > 0:
- inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
- inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+ assert isinstance(taxon_id,int)
 
+ scientific_name = row['scientificName']
  assert len(scientific_name) > 0
+
  inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
  inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+ # iNat common name mapping
+
+ inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+ for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
+
+ taxon_id = row['id']
+ assert isinstance(taxon_id,int)
+
+ # This should never happen; we will assert() this at the end of the loop
+ if taxon_id not in inat_taxon_id_to_scientific:
+ inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+ continue
+
+ vernacular_name = row['vernacularName']
+
+ assert len(vernacular_name) > 0
+ inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+ inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+ assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
 
- # Build GBIF dictionaries
+
+ ##%% Build GBIF dictionaries
 
  print('Building lookup dictionaries for GBIF taxonomy')
-
+
+ # GBIF scientific name mapping
+
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
  taxon_id = row['taxonID']
+ assert isinstance(taxon_id,int)
 
- # The canonical name is the Latin name; the "scientific name"
- # include the taxonomy name.
+ # The "canonical name" is the Latin name; the "scientific name"
+ # column includes other information. For example:
+ #
+ # "scientificName": Schizophoria impressa (Hall, 1843)
+ # "canonicalName": Schizophoria impressa
+ #
+ # Also see:
  #
  # http://globalnames.org/docs/glossary/
 
@@ -307,12 +374,18 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
  gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+ # GBIF common name mapping
+
+ gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
  for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
 
  taxon_id = row['taxonID']
+ assert isinstance(taxon_id,int)
 
  # Don't include taxon IDs that were removed from the master table
  if taxon_id not in gbif_taxon_id_to_scientific:
+ gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
  continue
 
  vernacular_name = row['vernacularName']
@@ -321,8 +394,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
  gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
 
+ print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+ len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+ len(gbif_common_mapping)
+ ))
 
- # Save everything to file
+
+ ##%% Save everything to file
 
  structures_to_serialize = [
  inat_taxonomy,
@@ -344,7 +422,10 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  if not os.path.isfile(serialized_structures_file):
  with open(serialized_structures_file, 'wb') as p:
  pickle.dump(structures_to_serialize, p)
- print(' done')
+ print('...done')
+
+
+ #%%
 
  # ...def initialize_taxonomy_lookup(...)
 
@@ -412,7 +493,8 @@ def traverse_taxonomy(matching_rownums: Sequence[int],
  while True:
 
  taxon_id = current_row['taxonID']
- vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
+ # sort for determinism
+ vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
  match_details.append((taxon_id, current_row['taxonRank'],
  get_scientific_name_from_row(current_row),
  vernacular_names))
@@ -596,21 +678,21 @@ class TaxonomicMatch:
 
 
  hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
- 'ruffed', 'browed', 'eating', 'striped', 'shanked',
+ 'ruffed', 'browed', 'eating', 'striped', 'shanked',
  'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
  'necked']
 
  def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
  """
- Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+ Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
  and preferences that are specific to our scenario.
-
+
  Args:
  query (str): The common or scientific name we want to look up
  taxonomy_preference (str, optional): 'inat' or 'gbif'
- retry (bool, optional): if the initial lookup fails, should we try heuristic
+ retry (bool, optional): if the initial lookup fails, should we try heuristic
  substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
  Returns:
  TaxonomicMatch: the best taxonomic match, or None
  """
@@ -618,31 +700,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
  m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
  if (len(m.scientific_name) > 0) or (not retry):
  return m
-
+
  for s in hyphenated_terms:
  query = query.replace(' ' + s,'-' + s)
  m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
  return m
-
-
+
+
  def validate_and_convert(data):
  """
  Recursively validates that all elements in the nested structure are only
  tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
  Args:
  data: The nested structure to validate and convert
-
+
  Returns:
  The validated and converted structure
-
+
  Raises:
  TypeError: If an invalid type is encountered
  """
-
- if isinstance(data, np.int64):
+
+ if isinstance(data, np.int64):
  return int(data)
- elif isinstance(data, int) or isinstance(data, str):
+ elif isinstance(data, int) or isinstance(data, str):
  return data
  elif isinstance(data, (list, tuple)):
  # Process lists and tuples recursively
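
A quick illustration of the behavior the validate_and_convert docstring describes; the nested input is made up, and the import path simply mirrors where the function lives in this package:

    import numpy as np

    from megadetector.taxonomy_mapping.species_lookup import validate_and_convert

    # Nested structure mixing lists, tuples, strings, ints, and np.int64
    data = [(np.int64(71262), 'genus'), [np.int64(1), 2, 'species']]

    # Every np.int64 comes back as a plain Python int, so the result can be
    # serialized without numpy-specific types; an unsupported type (e.g. a dict)
    # would raise TypeError instead
    converted = validate_and_convert(data)
    print(converted)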
@@ -654,17 +736,17 @@ def validate_and_convert(data):
 
  # ...def validate_and_convert(...)
 
-
+
  def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
  query = query.lower().strip().replace('_', ' ')
  query = query.replace('unidentified','')
  query = query.replace('unknown','')
  if query.endswith(' sp'):
  query = query.replace(' sp','')
  if query.endswith(' group'):
- query = query.replace(' group','')
-
+ query = query.replace(' group','')
+
  query = query.strip()
 
  # query = 'person'
@@ -686,17 +768,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
  n_inat_matches = len(inat_matches)
  n_gbif_matches = len(gbif_matches)
-
+
  selected_matches = None
-
+
  assert taxonomy_preference in ['gbif','inat'],\
  'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
  if n_inat_matches > 0 and taxonomy_preference == 'inat':
  selected_matches = 'inat'
  elif n_gbif_matches > 0:
  selected_matches = 'gbif'
-
+
  if selected_matches == 'inat':
 
  i_match = 0
@@ -802,7 +884,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
  # Convert np.int64's to ints
  if match is not None:
  match = validate_and_convert(match)
-
+
  taxonomy_string = str(match)
 
  return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
@@ -828,15 +910,15 @@ if False:
  # print(matches)
 
  print_taxonomy_matches(matches,verbose=True)
-
+
  print('\n\n')
-
+
  # Print the taxonomy in the taxonomy spreadsheet format
  assert matches[1]['source'] == 'inat'
  t = str(matches[1]['taxonomy'])
  print(t)
  import clipboard; clipboard.copy(t)
-
+
 
  #%% Directly access the taxonomy tables
 
@@ -848,12 +930,12 @@ if False:
 
  #%% Command-line driver
 
- def main():
+ def main(): # noqa
 
  # Read command line inputs (absolute path)
  parser = argparse.ArgumentParser()
  parser.add_argument('input_file')
-
+
  if len(sys.argv[1:]) == 0:
  parser.print_help()
  parser.exit()
@@ -36,7 +36,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  """
  See module docstring.
  """
-
+
  taxonomy_df = pd.read_csv(csv_path)
 
  graph = nx.DiGraph()
@@ -46,12 +46,12 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_scientific_name_errors = 0
 
  for i_row, row in taxonomy_df.iterrows():
-
+
  ds = row['dataset_name']
  ds_label = row['query']
  scientific_name = row['scientific_name']
  level = row['taxonomy_level']
-
+
  # This used to represent the source of the mapping: iNat, gbif, or manual. We've
  # stopped tracking this, so this is now vestigial.
  id_source = 0 # row['source']
@@ -95,8 +95,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_scientific_name_errors += 1
 
  taxon_child = node
-
- # ...for each row in the taxonomy file
+
+ # ...for each row in the taxonomy file
 
  assert nx.is_directed_acyclic_graph(graph)
 
 
@@ -124,36 +124,36 @@ def check_taxonomy_csv(csv_path: str) -> None:
  print(f'At least one node has unresolved ambiguous parents: {e}')
 
  print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
-
+
  print('num taxon level errors:', num_taxon_level_errors)
  print('num scientific name errors:', num_scientific_name_errors)
 
 
  #%% Command-line driver
-
+
  if __name__ == '__main__':
-
+
  parser = argparse.ArgumentParser()
  parser.add_argument(
  'taxonomy_csv_path',
  help='path to taxonomy CSV file')
-
+
  if len(sys.argv[1:]) == 0:
  parser.print_help()
  parser.exit()
-
+
  args = parser.parse_args()
 
  check_taxonomy_csv(args.taxonomy_csv_path)
 
 
  #%% Interactive driver
-
+
  if False:
-
+
  #%%
-
+
  import os
  csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
  check_taxonomy_csv(csv_path)
-
+
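
For context on the nx.is_directed_acyclic_graph() assertion that check_taxonomy_csv relies on, a minimal sketch of the same check on a toy parent-to-child graph (the (level, name) node naming here is illustrative, not necessarily the checker's actual node type):

    import networkx as nx

    # Toy taxonomy edges, parent -> child
    graph = nx.DiGraph()
    graph.add_edge(('family', 'felidae'), ('genus', 'puma'))
    graph.add_edge(('genus', 'puma'), ('species', 'puma concolor'))

    # The checker asserts this after ingesting every CSV row; a cycle would mean
    # some taxon ends up as its own ancestor
    assert nx.is_directed_acyclic_graph(graph)

    # Introducing a cycle makes the same check fail
    graph.add_edge(('species', 'puma concolor'), ('family', 'felidae'))
    assert not nx.is_directed_acyclic_graph(graph)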
@@ -1,4 +1,4 @@
- """
+ r"""
 
  taxonomy_graph.py
 
@@ -69,7 +69,7 @@ class TaxonNode:
  By default, we support multiple parents for each TaxonNode. See discussion
  in module docstring above.
  """
-
+
  # class variables
  single_parent_only: ClassVar[bool] = False
 
@@ -82,7 +82,7 @@
 
  def __init__(self, level: str, name: str,
  graph: Optional[nx.DiGraph] = None):
-
+
  self.level = level
  self.name = name
  self.graph = graph
@@ -131,7 +131,7 @@ class TaxonNode:
  Args:
  parent: TaxonNode, must be higher in the taxonomical hierarchy
  """
-
+
  assert self.graph is not None
  parents = self.parents
  if TaxonNode.single_parent_only and len(parents) > 0:
@@ -150,7 +150,7 @@
  Args:
  child: TaxonNode, must be lower in the taxonomical hierarchy
  """
-
+
  assert self.graph is not None
  self.graph.add_edge(self, child)
 
@@ -160,7 +160,7 @@
  ds: str, name of dataset
  ds_label: str, name of label used by that dataset
  """
-
+
  self.dataset_labels.add((ds, ds_label))
 
  def get_dataset_labels(self,
@@ -176,7 +176,7 @@
 
  Returns: set of (ds, ds_label) tuples
  """
-
+
  result = self.dataset_labels
  if include_datasets is not None:
  result = set(tup for tup in result if tup[0] in include_datasets)
@@ -199,7 +199,7 @@
 
  Returns: TaxonNode, the LCA if it exists, or None if no LCA exists
  """
-
+
  paths = []
  for node in nodes:
  # get path to root
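
The docstring and the paths list above suggest the general approach: collect each node's path to the root, then take the deepest node shared by all of those paths. A standalone sketch of that idea using a plain child-to-parent dict (an illustration only, not the TaxonNode implementation):

    def path_to_root(parent, node):
        # [node, parent, grandparent, ..., root] using a child -> parent dict
        path = [node]
        while node in parent:
            node = parent[node]
            path.append(node)
        return path

    def lowest_common_ancestor(parent, nodes):
        # Deepest node shared by every node's path to the root, or None
        paths = [path_to_root(parent, n) for n in nodes]
        common = set(paths[0]).intersection(*(set(p) for p in paths[1:]))
        for n in paths[0]:
            if n in common:
                return n
        return None

    parent = {'puma concolor': 'puma', 'puma yagouaroundi': 'puma', 'puma': 'felidae'}
    print(lowest_common_ancestor(parent, ['puma concolor', 'puma yagouaroundi']))  # puma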
@@ -242,7 +242,7 @@ def build_taxonomy_graph(taxonomy_df: pd.DataFrame
  TaxonNode node in the tree that contains the label,
  keys are all lowercase
  """
-
+
  graph = nx.DiGraph()
  taxon_to_node = {} # maps (taxon_level, taxon_name) to a TaxonNode
  label_to_node = {} # maps (dataset_name, dataset_label) to a TaxonNode
@@ -308,7 +308,7 @@ def dag_to_tree(graph: nx.DiGraph,
 
  Returns: nx.DiGraph, a tree-structured graph
  """
-
+
  tree = nx.DiGraph()
  for node in graph.nodes:
  tree.add_node(node)