megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
megadetector/taxonomy_mapping/species_lookup.py

@@ -32,27 +32,18 @@ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
 
 taxonomy_urls = {
     'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
-    'iNaturalist': 'https://www.inaturalist.org/
+    'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
 }
 
 files_to_unzip = {
-    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
-    # 12.2023, this is no longer the case.
-    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
     'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
-    'iNaturalist': ['taxa.csv']
+    'iNaturalist': ['taxa.csv','VernacularNames-english.csv']
 }
 
-# As of
+# As of 2025.06.24:
 #
-# GBIF:
-# iNat:
-
-# As of 2023.12.29:
-#
-# GBIF: ~948MB zipped, ~2.2GB taxonomy
-# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
+# GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+# iNat: 71MB zipped, 415MB of relevant content unzipped
 
 os.makedirs(taxonomy_download_dir, exist_ok=True)
 for taxonomy_name in taxonomy_urls:

@@ -83,7 +74,7 @@ gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
 
 # Initialization function
 
-def initialize_taxonomy_lookup(force_init=False) -> None:
+def initialize_taxonomy_lookup(force_init=False):
     """
     Initialize this module by doing the following:
 

@@ -92,8 +83,14 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     * Builds a bunch of dictionaries and tables to facilitate lookup
     * Serializes those tables via pickle
     * Skips all of the above if the serialized pickle file already exists
+
+    Args:
+        force_init (bool, optional): force re-download and parsing of the source .zip files,
+            even if the cached .p file already exists
     """
 
+    #%%
+
     global inat_taxonomy,\
         gbif_taxonomy,\
         gbif_common_mapping,\
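The docstring change above documents the caching behavior of initialize_taxonomy_lookup (download, parse, pickle, skip if a cached file exists). A minimal usage sketch follows; it is not part of the diff, the import path is an assumption based on the file listing above (megadetector/taxonomy_mapping/species_lookup.py), and only a function whose signature appears in this diff is called.

# Minimal sketch (assumed import path; not part of the diff).
# initialize_taxonomy_lookup() downloads and parses the GBIF/iNat source
# files on first use and caches the resulting lookup tables via pickle;
# force_init=True re-downloads and re-parses even if the cached .p file exists.
from megadetector.taxonomy_mapping import species_lookup

species_lookup.initialize_taxonomy_lookup(force_init=False)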
@@ -109,12 +106,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_scientific_to_taxon_id
 
 
-
+    #%% Load serialized taxonomy info if we've already saved it
 
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
         return
-
+
     if (not force_init) and (os.path.isfile(serialized_structures_file)):
 
         print(f'De-serializing taxonomy data from {serialized_structures_file}')

@@ -135,18 +132,17 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_vernacular_to_taxon_id,\
         gbif_taxon_id_to_scientific,\
         gbif_scientific_to_taxon_id = structures_to_serialize
-
+
         return
 
 
-
+    #%% Download and unzip taxonomy files
 
-    # Download and unzip taxonomy files
     # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
     for taxonomy_name, zip_url in taxonomy_urls.items():
 
         need_to_download = False
-
+
         if force_init:
             need_to_download = True
 

@@ -189,21 +185,44 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
         # ...for each file that we need from this zipfile
 
-        # Remove the zipfile
-        # os.remove(zipfile_path)
-
     # ...for each taxonomy
 
 
-
-    # name file
+    #%% Create dataframes from each of the taxonomy/vernacular files
 
     # Load iNat taxonomy
     inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
     print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
     inat_taxonomy = pd.read_csv(inat_taxonomy_file)
     inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
-
+
+    # Delete columns we won't use. The "taxonID" column is a non-int version of "ID"
+    inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+    # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+    #
+    # https://www.inaturalist.org/taxa/71262
+    #
+    # Convert this column to be integer-valued, using only the last token of the URL
+    inat_taxonomy['parentNameUsageID'] = \
+        inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+    # Rename the "id" column to "taxonID"
+    inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+    assert 'id' not in inat_taxonomy.columns
+    assert 'taxonID' in inat_taxonomy.columns
+
+    # Load iNat common name mapping
+    inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+    inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+    inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+    inat_common_mapping = inat_common_mapping.drop(['language','locality','countryCode',
+                                                    'source','lexicon','contributor','created'], axis=1)
+    assert 'id' in inat_common_mapping.columns
+    assert 'taxonID' not in inat_common_mapping.columns
+    assert 'vernacularName' in inat_common_mapping.columns
 
     # Load GBIF taxonomy
     gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')

@@ -211,12 +230,20 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8',on_bad_lines='warn')
     gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
     gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+    gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
 
     # Remove questionable rows from the GBIF taxonomy
     gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
     gbif_taxonomy = gbif_taxonomy.reset_index()
 
-
+    gbif_taxonomy = gbif_taxonomy.drop(['datasetID','acceptedNameUsageID','originalNameUsageID',
+                                        'scientificNameAuthorship','nameAccordingTo','namePublishedIn',
+                                        'taxonomicStatus','nomenclaturalStatus','taxonRemarks'], axis=1)
+
+    assert 'taxonID' in gbif_taxonomy.columns
+    assert 'scientificName' in gbif_taxonomy.columns
+
+    # Load GBIF common name mapping
     gbif_common_mapping = pd.read_csv(os.path.join(
         taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
     gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()

@@ -225,6 +252,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
     gbif_common_mapping = gbif_common_mapping.reset_index()
 
+    gbif_common_mapping = gbif_common_mapping.drop(['language','country','countryCode','sex',
+                                                    'lifeStage','source'],axis=1)
+
+    assert 'taxonID' in gbif_common_mapping.columns
+    assert 'vernacularName' in gbif_common_mapping.columns
+
 
     # Convert everything to lowercase
 

@@ -235,23 +268,28 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
     gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
     gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+    inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
 
 
-
+    ##%% For each taxonomy table, create a mapping from taxon IDs to rows
 
     inat_taxon_id_to_row = {}
     gbif_taxon_id_to_row = {}
 
     print('Building iNat taxonID --> row table')
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        inat_taxon_id_to_row[taxon_id] = i_row
 
     print('Building GBIF taxonID --> row table')
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        gbif_taxon_id_to_row[taxon_id] = i_row
 
 
-
+    ##%% Create name mapping dictionaries
 
     inat_taxon_id_to_vernacular = defaultdict(set)
     inat_vernacular_to_taxon_id = defaultdict(set)

@@ -267,32 +305,61 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     # Build iNat dictionaries
 
     print('Building lookup dictionaries for iNat taxonomy')
-
+
+    # iNat Scientific name mapping
+
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
         taxon_id = row['taxonID']
-
-        scientific_name = row['scientificName']
-
-        if len(vernacular_name) > 0:
-            inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-            inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+        assert isinstance(taxon_id,int)
 
+        scientific_name = row['scientificName']
         assert len(scientific_name) > 0
+
         inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
         inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # iNat common name mapping
+
+    inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+    for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
+
+        taxon_id = row['id']
+        assert isinstance(taxon_id,int)
+
+        # This should never happen; we will assert() this at the end of the loop
+        if taxon_id not in inat_taxon_id_to_scientific:
+            inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+            continue
+
+        vernacular_name = row['vernacularName']
+
+        assert len(vernacular_name) > 0
+        inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+        inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+    assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
 
-
+
+    ##%% Build GBIF dictionaries
 
     print('Building lookup dictionaries for GBIF taxonomy')
-
+
+    # GBIF scientific name mapping
+
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
-        # The canonical name is the Latin name; the "scientific name"
-        #
+        # The "canonical name" is the Latin name; the "scientific name"
+        # column includes other information. For example:
+        #
+        # "scientificName": Schizophoria impressa (Hall, 1843)
+        # "canonicalName": Schizophoria impressa
+        #
+        # Also see:
         #
         # http://globalnames.org/docs/glossary/
 

@@ -307,12 +374,18 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
         gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # GBIF common name mapping
+
+    gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
     for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
         # Don't include taxon IDs that were removed from the master table
         if taxon_id not in gbif_taxon_id_to_scientific:
+            gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
             continue
 
         vernacular_name = row['vernacularName']

@@ -321,8 +394,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
         gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
 
+    print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+        len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+        len(gbif_common_mapping)
+        ))
 
-
+
+    ##%% Save everything to file
 
     structures_to_serialize = [
         inat_taxonomy,

@@ -344,7 +422,10 @@
     if not os.path.isfile(serialized_structures_file):
         with open(serialized_structures_file, 'wb') as p:
             pickle.dump(structures_to_serialize, p)
-            print('
+            print('...done')
+
+
+    #%%
 
     # ...def initialize_taxonomy_lookup(...)
 

@@ -412,7 +493,8 @@ def traverse_taxonomy(matching_rownums: Sequence[int],
     while True:
 
         taxon_id = current_row['taxonID']
-
+        # sort for determinism
+        vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
         match_details.append((taxon_id, current_row['taxonRank'],
                               get_scientific_name_from_row(current_row),
                               vernacular_names))

@@ -596,21 +678,21 @@ class TaxonomicMatch:
 
 
 hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
-                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
+                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
                     'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
                     'necked']
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
     and preferences that are specific to our scenario.
-
+
    Args:
        query (str): The common or scientific name we want to look up
        taxonomy_preference (str, optional): 'inat' or 'gbif'
-       retry (bool, optional): if the initial lookup fails, should we try heuristic
+       retry (bool, optional): if the initial lookup fails, should we try heuristic
           substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
    Returns:
        TaxonomicMatch: the best taxonomic match, or None
    """
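Given the Args/Returns documented above, a short usage sketch may help; it is not part of the diff. The query string is an arbitrary example (an assumption, not taken from the package), while the function name, its parameters, and the scientific_name field of TaxonomicMatch all appear elsewhere in this diff.

# Minimal sketch: look up a common name after the module has been initialized.
# retry=True enables the heuristic substitutions described above, e.g.
# re-hyphenating terms like "black tailed" -> "black-tailed".
match = get_preferred_taxonomic_match('black-tailed deer',
                                      taxonomy_preference='inat',
                                      retry=True)
if len(match.scientific_name) > 0:
    print(match.scientific_name)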
@@ -618,31 +700,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     if (len(m.scientific_name) > 0) or (not retry):
         return m
-
+
     for s in hyphenated_terms:
         query = query.replace(' ' + s,'-' + s)
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     return m
-
-
+
+
 def validate_and_convert(data):
     """
     Recursively validates that all elements in the nested structure are only
     tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
     Args:
         data: The nested structure to validate and convert
-
+
     Returns:
         The validated and converted structure
-
+
     Raises:
         TypeError: If an invalid type is encountered
     """
-
-    if isinstance(data, np.int64):
+
+    if isinstance(data, np.int64):
         return int(data)
-    elif isinstance(data, int) or isinstance(data, str):
+    elif isinstance(data, int) or isinstance(data, str):
         return data
     elif isinstance(data, (list, tuple)):
         # Process lists and tuples recursively
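validate_and_convert, shown above, normalizes np.int64 values before the match details are stringified. A tiny illustrative call (with made-up data, not from the package) might look like the following; the hunk does not show how containers are rebuilt, so treat the exact shape of the return value as an assumption.

import numpy as np

# np.int64 leaves become plain ints; ints and strings pass through;
# lists/tuples are processed recursively; other types raise TypeError.
data = [np.int64(42), ('species', np.int64(7))]
converted = validate_and_convert(data)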
@@ -654,17 +736,17 @@ def validate_and_convert(data):
 
 # ...def validate_and_convert(...)
 
-
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
     query = query.lower().strip().replace('_', ' ')
     query = query.replace('unidentified','')
     query = query.replace('unknown','')
     if query.endswith(' sp'):
         query = query.replace(' sp','')
     if query.endswith(' group'):
-        query = query.replace(' group','')
-
+        query = query.replace(' group','')
+
     query = query.strip()
 
     # query = 'person'

@@ -686,17 +768,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
     n_inat_matches = len(inat_matches)
     n_gbif_matches = len(gbif_matches)
-
+
     selected_matches = None
-
+
     assert taxonomy_preference in ['gbif','inat'],\
         'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
     if n_inat_matches > 0 and taxonomy_preference == 'inat':
         selected_matches = 'inat'
     elif n_gbif_matches > 0:
         selected_matches = 'gbif'
-
+
     if selected_matches == 'inat':
 
         i_match = 0

@@ -802,7 +884,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
     # Convert np.int64's to ints
     if match is not None:
         match = validate_and_convert(match)
-
+
     taxonomy_string = str(match)
 
     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,

@@ -828,15 +910,15 @@ if False:
     # print(matches)
 
     print_taxonomy_matches(matches,verbose=True)
-
+
     print('\n\n')
-
+
     # Print the taxonomy in the taxonomy spreadsheet format
     assert matches[1]['source'] == 'inat'
     t = str(matches[1]['taxonomy'])
     print(t)
     import clipboard; clipboard.copy(t)
-
+
 
     #%% Directly access the taxonomy tables
 

@@ -848,12 +930,12 @@ if False:
 
 #%% Command-line driver
 
-def main():
+def main(): # noqa
 
     # Read command line inputs (absolute path)
     parser = argparse.ArgumentParser()
     parser.add_argument('input_file')
-
+
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
megadetector/taxonomy_mapping/taxonomy_csv_checker.py

@@ -36,7 +36,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
     """
     See module docstring.
     """
-
+
     taxonomy_df = pd.read_csv(csv_path)
 
     graph = nx.DiGraph()

@@ -46,12 +46,12 @@ def check_taxonomy_csv(csv_path: str) -> None:
     num_scientific_name_errors = 0
 
     for i_row, row in taxonomy_df.iterrows():
-
+
         ds = row['dataset_name']
         ds_label = row['query']
         scientific_name = row['scientific_name']
         level = row['taxonomy_level']
-
+
         # This used to represent the source of the mapping: iNat, gbif, or manual. We've
         # stopped tracking this, so this is now vestigial.
         id_source = 0 # row['source']

@@ -95,8 +95,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
             num_scientific_name_errors += 1
 
         taxon_child = node
-
-    # ...for each row in the taxonomy file
+
+    # ...for each row in the taxonomy file
 
     assert nx.is_directed_acyclic_graph(graph)
 

@@ -124,36 +124,36 @@ def check_taxonomy_csv(csv_path: str) -> None:
         print(f'At least one node has unresolved ambiguous parents: {e}')
 
     print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
-
+
     print('num taxon level errors:', num_taxon_level_errors)
     print('num scientific name errors:', num_scientific_name_errors)
 
 
 #%% Command-line driver
-
+
 if __name__ == '__main__':
-
+
     parser = argparse.ArgumentParser()
     parser.add_argument(
         'taxonomy_csv_path',
         help='path to taxonomy CSV file')
-
+
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
-
+
     args = parser.parse_args()
 
     check_taxonomy_csv(args.taxonomy_csv_path)
 
 
 #%% Interactive driver
-
+
 if False:
-
+
     #%%
-
+
     import os
     csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
     check_taxonomy_csv(csv_path)
-
+
megadetector/taxonomy_mapping/taxonomy_graph.py

@@ -1,4 +1,4 @@
-"""
+r"""
 
 taxonomy_graph.py
 

@@ -69,7 +69,7 @@ class TaxonNode:
     By default, we support multiple parents for each TaxonNode. See discussion
     in module docstring above.
     """
-
+
     # class variables
     single_parent_only: ClassVar[bool] = False
 

@@ -82,7 +82,7 @@ class TaxonNode:
 
     def __init__(self, level: str, name: str,
                  graph: Optional[nx.DiGraph] = None):
-
+
         self.level = level
         self.name = name
         self.graph = graph

@@ -131,7 +131,7 @@ class TaxonNode:
         Args:
             parent: TaxonNode, must be higher in the taxonomical hierarchy
         """
-
+
         assert self.graph is not None
         parents = self.parents
         if TaxonNode.single_parent_only and len(parents) > 0:

@@ -150,7 +150,7 @@ class TaxonNode:
         Args:
             child: TaxonNode, must be lower in the taxonomical hierarchy
         """
-
+
         assert self.graph is not None
         self.graph.add_edge(self, child)
 

@@ -160,7 +160,7 @@ class TaxonNode:
             ds: str, name of dataset
             ds_label: str, name of label used by that dataset
         """
-
+
         self.dataset_labels.add((ds, ds_label))
 
     def get_dataset_labels(self,

@@ -176,7 +176,7 @@ class TaxonNode:
 
         Returns: set of (ds, ds_label) tuples
         """
-
+
         result = self.dataset_labels
         if include_datasets is not None:
             result = set(tup for tup in result if tup[0] in include_datasets)

@@ -199,7 +199,7 @@ class TaxonNode:
 
         Returns: TaxonNode, the LCA if it exists, or None if no LCA exists
         """
-
+
         paths = []
         for node in nodes:
             # get path to root

@@ -242,7 +242,7 @@ def build_taxonomy_graph(taxonomy_df: pd.DataFrame
            TaxonNode node in the tree that contains the label,
            keys are all lowercase
     """
-
+
     graph = nx.DiGraph()
     taxon_to_node = {}  # maps (taxon_level, taxon_name) to a TaxonNode
     label_to_node = {}  # maps (dataset_name, dataset_label) to a TaxonNode

@@ -308,7 +308,7 @@ def dag_to_tree(graph: nx.DiGraph,
 
     Returns: nx.DiGraph, a tree-structured graph
     """
-
+
     tree = nx.DiGraph()
     for node in graph.nodes:
         tree.add_node(node)