megadetector 5.0.8__py3-none-any.whl → 5.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +65 -65
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
- api/batch_processing/postprocessing/compare_batch_results.py +113 -43
- api/batch_processing/postprocessing/convert_output_format.py +41 -16
- api/batch_processing/postprocessing/load_api_results.py +16 -17
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +52 -22
- api/batch_processing/postprocessing/merge_detections.py +14 -14
- api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
- api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +102 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -263
- data_management/coco_to_yolo.py +79 -58
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +62 -24
- data_management/databases/subset_json_db.py +24 -15
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -162
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -158
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +7 -7
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +65 -24
- data_management/labelme_to_yolo.py +8 -8
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +13 -13
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +44 -110
- data_management/lila/generate_lila_per_image_labels.py +55 -42
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +96 -33
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +110 -97
- data_management/remap_coco_categories.py +83 -83
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +30 -23
- data_management/wi_download_csv_to_coco.py +246 -239
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +300 -60
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +179 -113
- detection/run_inference_with_yolov5_val.py +108 -48
- detection/run_tiled_inference.py +111 -40
- detection/tf_detector.py +51 -29
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +228 -68
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -871
- md_utils/path_utils.py +460 -134
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +176 -60
- md_utils/write_html_image_list.py +40 -33
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +597 -291
- md_visualization/visualize_db.py +76 -48
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
- megadetector-5.0.9.dist-info/RECORD +224 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- megadetector-5.0.8.dist-info/RECORD +0 -205
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
"""
|
|
2
|
+
|
|
3
|
+
map_lila_taxonomy_to_wi_taxonomy.py
|
|
4
|
+
|
|
5
|
+
Loads the LILA category mapping (in which taxonomy information comes from an
|
|
6
|
+
iNat taxonomy snapshot) and tries to map each class to the Wildlife Insights taxonomy.
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
9
|
|
|
10
10
|
#%% Constants and imports
|
|
11
11
|
|
|
@@ -18,467 +18,474 @@ from tqdm import tqdm
|
|
|
18
18
|
from data_management.lila.lila_common import read_lila_taxonomy_mapping, \
|
|
19
19
|
read_wildlife_insights_taxonomy_mapping
|
|
20
20
|
|
|
21
|
-
lila_local_base = os.path.expanduser('~/lila')
|
|
22
21
|
|
|
23
|
-
|
|
24
|
-
os.makedirs(metadata_dir, exist_ok=True)
|
|
22
|
+
#%% Prevent execution during infrastructural imports
|
|
25
23
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
24
|
+
if False:
|
|
25
|
+
|
|
26
|
+
#%%
|
|
27
|
+
|
|
28
|
+
lila_local_base = os.path.expanduser('~/lila')
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
'~/git/MegaDetector/taxonomy_mapping/lila_to_wi_supplementary_mapping_file.csv')
|
|
30
|
+
metadata_dir = os.path.join(lila_local_base, 'metadata')
|
|
31
|
+
os.makedirs(metadata_dir, exist_ok=True)
|
|
34
32
|
|
|
35
|
-
|
|
33
|
+
# Created by get_lila_category_list.py... contains counts for each category
|
|
34
|
+
category_list_dir = os.path.join(lila_local_base, 'lila_categories_list')
|
|
35
|
+
lila_dataset_to_categories_file = os.path.join(
|
|
36
|
+
category_list_dir, 'lila_dataset_to_categories.json')
|
|
36
37
|
|
|
37
|
-
# This is
|
|
38
|
-
|
|
38
|
+
# This is a manually-curated file used to store mappings that had to be made manually
|
|
39
|
+
lila_to_wi_supplementary_mapping_file = os.path.expanduser(
|
|
40
|
+
'~/git/MegaDetector/taxonomy_mapping/lila_to_wi_supplementary_mapping_file.csv')
|
|
39
41
|
|
|
40
|
-
|
|
42
|
+
assert os.path.isfile(lila_dataset_to_categories_file)
|
|
41
43
|
|
|
44
|
+
# This is the main output file from this whole process
|
|
45
|
+
wi_mapping_table_file = os.path.join(lila_local_base,'lila_wi_mapping_table.csv')
|
|
42
46
|
|
|
43
|
-
|
|
47
|
+
id_column = 'uniqueIdentifier' # 'id'
|
|
44
48
|
|
|
45
|
-
with open(lila_dataset_to_categories_file, 'r') as f:
|
|
46
|
-
lila_dataset_to_categories = json.load(f)
|
|
47
49
|
|
|
48
|
-
|
|
50
|
+
#%% Load category and taxonomy files
|
|
49
51
|
|
|
50
|
-
|
|
52
|
+
with open(lila_dataset_to_categories_file, 'r') as f:
|
|
53
|
+
lila_dataset_to_categories = json.load(f)
|
|
51
54
|
|
|
55
|
+
lila_taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
|
|
52
56
|
|
|
53
|
-
|
|
57
|
+
wi_taxonomy_df = read_wildlife_insights_taxonomy_mapping(metadata_dir)
|
|
54
58
|
|
|
55
|
-
lila_taxonomy = lila_taxonomy_df.to_dict('records')
|
|
56
|
-
wi_taxonomy = wi_taxonomy_df.to_dict('records')
|
|
57
59
|
|
|
60
|
+
#%% Pull everything out of pandas
|
|
58
61
|
|
|
59
|
-
|
|
62
|
+
lila_taxonomy = lila_taxonomy_df.to_dict('records')
|
|
63
|
+
wi_taxonomy = wi_taxonomy_df.to_dict('records')
|
|
60
64
|
|
|
61
|
-
def is_empty_wi_item(v):
|
|
62
|
-
if isinstance(v, str):
|
|
63
|
-
return len(v) == 0
|
|
64
|
-
elif v is None:
|
|
65
|
-
return True
|
|
66
|
-
else:
|
|
67
|
-
assert isinstance(v, float) and np.isnan(v), 'Invalid item: {}'.format(str(v))
|
|
68
|
-
return True
|
|
69
65
|
|
|
66
|
+
#%% Cache WI taxonomy lookups
|
|
70
67
|
|
|
71
|
-
def
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
return a == b
|
|
68
|
+
def is_empty_wi_item(v):
|
|
69
|
+
if isinstance(v, str):
|
|
70
|
+
return len(v) == 0
|
|
71
|
+
elif v is None:
|
|
72
|
+
return True
|
|
73
|
+
else:
|
|
74
|
+
assert isinstance(v, float) and np.isnan(v), 'Invalid item: {}'.format(str(v))
|
|
75
|
+
return True
|
|
80
76
|
|
|
81
77
|
|
|
82
|
-
|
|
83
|
-
|
|
78
|
+
def taxonomy_items_equal(a, b):
|
|
79
|
+
if isinstance(a, str) and (not isinstance(b, str)):
|
|
80
|
+
return False
|
|
81
|
+
if isinstance(b, str) and (not isinstance(a, str)):
|
|
82
|
+
return False
|
|
83
|
+
if (not isinstance(a, str)) or (not isinstance(b, str)):
|
|
84
|
+
assert (a is None and b is None) or (isinstance(a, float) and isinstance(b, float))
|
|
85
|
+
return True
|
|
86
|
+
return a == b
|
|
84
87
|
|
|
85
|
-
from collections import defaultdict
|
|
86
|
-
wi_taxon_name_to_taxa = defaultdict(list)
|
|
87
88
|
|
|
88
|
-
|
|
89
|
-
|
|
89
|
+
for taxon in wi_taxonomy:
|
|
90
|
+
taxon['taxon_name'] = None
|
|
90
91
|
|
|
91
|
-
|
|
92
|
-
|
|
92
|
+
from collections import defaultdict
|
|
93
|
+
wi_taxon_name_to_taxa = defaultdict(list)
|
|
93
94
|
|
|
94
|
-
|
|
95
|
-
|
|
95
|
+
# This is just a handy lookup table that we'll use to debug mismatches
|
|
96
|
+
wi_common_name_to_taxon = {}
|
|
96
97
|
|
|
97
|
-
|
|
98
|
-
|
|
98
|
+
blank_taxon_name = 'blank'
|
|
99
|
+
blank_taxon = None
|
|
99
100
|
|
|
100
|
-
|
|
101
|
+
animal_taxon_name = 'animal'
|
|
102
|
+
animal_taxon = None
|
|
101
103
|
|
|
102
|
-
|
|
104
|
+
unknown_taxon_name = 'unknown'
|
|
105
|
+
unknown_taxon = None
|
|
103
106
|
|
|
104
|
-
|
|
107
|
+
ignore_taxa = set(['No CV Result', 'CV Needed', 'CV Failed'])
|
|
105
108
|
|
|
106
|
-
|
|
107
|
-
for taxon in tqdm(wi_taxonomy):
|
|
109
|
+
known_problematic_taxon_ids = ['f94e6d97-59cf-4d38-a05a-a75efdd2863b']
|
|
108
110
|
|
|
109
|
-
|
|
111
|
+
human_taxa = []
|
|
110
112
|
|
|
111
|
-
|
|
113
|
+
# taxon = wi_taxonomy[21653]; print(taxon)
|
|
114
|
+
for taxon in tqdm(wi_taxonomy):
|
|
112
115
|
|
|
113
|
-
|
|
114
|
-
v = taxon[k]
|
|
115
|
-
if isinstance(v,str):
|
|
116
|
-
taxon[k] = v.strip()
|
|
117
|
-
|
|
118
|
-
if taxon['commonNameEnglish'] in ignore_taxa:
|
|
119
|
-
continue
|
|
116
|
+
taxon_name = None
|
|
120
117
|
|
|
121
|
-
|
|
118
|
+
assert taxon['taxonomyType'] == 'object' or taxon['taxonomyType'] == 'biological'
|
|
122
119
|
|
|
123
|
-
|
|
124
|
-
|
|
120
|
+
for k in taxon.keys():
|
|
121
|
+
v = taxon[k]
|
|
122
|
+
if isinstance(v,str):
|
|
123
|
+
taxon[k] = v.strip()
|
|
124
|
+
|
|
125
|
+
if taxon['commonNameEnglish'] in ignore_taxa:
|
|
126
|
+
continue
|
|
125
127
|
|
|
126
|
-
|
|
128
|
+
if isinstance(taxon['commonNameEnglish'], str):
|
|
127
129
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
blank_taxon = taxon
|
|
131
|
-
special_taxon = True
|
|
130
|
+
wi_common_name_to_taxon[taxon['commonNameEnglish'].strip(
|
|
131
|
+
).lower()] = taxon
|
|
132
132
|
|
|
133
|
-
|
|
134
|
-
animal_taxon = taxon
|
|
135
|
-
special_taxon = True
|
|
133
|
+
special_taxon = False
|
|
136
134
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
135
|
+
# Look for keywords that don't refer to specific taxa: blank/animal/unknown
|
|
136
|
+
if taxon['commonNameEnglish'].strip().lower() == blank_taxon_name:
|
|
137
|
+
blank_taxon = taxon
|
|
138
|
+
special_taxon = True
|
|
140
139
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
wi_taxon_name_to_taxa[taxon_name].append(taxon)
|
|
145
|
-
continue
|
|
140
|
+
elif taxon['commonNameEnglish'].strip().lower() == animal_taxon_name:
|
|
141
|
+
animal_taxon = taxon
|
|
142
|
+
special_taxon = True
|
|
146
143
|
|
|
147
|
-
|
|
148
|
-
|
|
144
|
+
elif taxon['commonNameEnglish'].strip().lower() == unknown_taxon_name:
|
|
145
|
+
unknown_taxon = taxon
|
|
146
|
+
special_taxon = True
|
|
149
147
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
not is_empty_wi_item(taxon['family'])
|
|
148
|
+
if special_taxon:
|
|
149
|
+
taxon_name = taxon['commonNameEnglish'].strip().lower()
|
|
150
|
+
taxon['taxon_name'] = taxon_name
|
|
151
|
+
wi_taxon_name_to_taxa[taxon_name].append(taxon)
|
|
152
|
+
continue
|
|
153
|
+
|
|
154
|
+
# Do we have a species name?
|
|
155
|
+
if not is_empty_wi_item(taxon['species']):
|
|
159
156
|
|
|
160
|
-
|
|
157
|
+
# If 'species' is populated, 'genus' should always be populated; one item currently breaks
|
|
158
|
+
# this rule.
|
|
159
|
+
assert not is_empty_wi_item(taxon['genus'])
|
|
160
|
+
|
|
161
|
+
taxon_name = (taxon['genus'].strip() + ' ' +
|
|
162
|
+
taxon['species'].strip()).strip().lower()
|
|
163
|
+
assert not is_empty_wi_item(taxon['class']) and \
|
|
164
|
+
not is_empty_wi_item(taxon['order']) and \
|
|
165
|
+
not is_empty_wi_item(taxon['family'])
|
|
161
166
|
|
|
162
|
-
|
|
163
|
-
not is_empty_wi_item(taxon['order']) and \
|
|
164
|
-
not is_empty_wi_item(taxon['family'])
|
|
165
|
-
taxon_name = taxon['genus'].strip().lower()
|
|
167
|
+
elif not is_empty_wi_item(taxon['genus']):
|
|
166
168
|
|
|
167
|
-
|
|
169
|
+
assert not is_empty_wi_item(taxon['class']) and \
|
|
170
|
+
not is_empty_wi_item(taxon['order']) and \
|
|
171
|
+
not is_empty_wi_item(taxon['family'])
|
|
172
|
+
taxon_name = taxon['genus'].strip().lower()
|
|
168
173
|
|
|
169
|
-
|
|
170
|
-
not is_empty_wi_item(taxon['order'])
|
|
171
|
-
taxon_name = taxon['family'].strip().lower()
|
|
174
|
+
elif not is_empty_wi_item(taxon['family']):
|
|
172
175
|
|
|
173
|
-
|
|
176
|
+
assert not is_empty_wi_item(taxon['class']) and \
|
|
177
|
+
not is_empty_wi_item(taxon['order'])
|
|
178
|
+
taxon_name = taxon['family'].strip().lower()
|
|
174
179
|
|
|
175
|
-
|
|
176
|
-
taxon_name = taxon['order'].strip().lower()
|
|
180
|
+
elif not is_empty_wi_item(taxon['order']):
|
|
177
181
|
|
|
178
|
-
|
|
182
|
+
assert not is_empty_wi_item(taxon['class'])
|
|
183
|
+
taxon_name = taxon['order'].strip().lower()
|
|
179
184
|
|
|
180
|
-
|
|
185
|
+
elif not is_empty_wi_item(taxon['class']):
|
|
181
186
|
|
|
182
|
-
|
|
183
|
-
assert taxon['taxonomyType'] == 'biological'
|
|
184
|
-
else:
|
|
185
|
-
assert taxon['taxonomyType'] == 'object'
|
|
186
|
-
taxon_name = taxon['commonNameEnglish'].strip().lower()
|
|
187
|
+
taxon_name = taxon['class'].strip().lower()
|
|
187
188
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
print('Skipping problematic taxon ID {}'.format(taxon[id_column]))
|
|
189
|
+
if taxon_name is not None:
|
|
190
|
+
assert taxon['taxonomyType'] == 'biological'
|
|
191
191
|
else:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
for level in ['class', 'order', 'family', 'genus', 'species']:
|
|
195
|
-
error_string = 'Error: taxon {} appeared previously in {} {} (as {}), now in {} {}'.format(
|
|
196
|
-
taxon_name,
|
|
197
|
-
level,previous_taxon[level],
|
|
198
|
-
previous_taxon['taxon_name'],
|
|
199
|
-
level,taxon[level])
|
|
200
|
-
assert taxonomy_items_equal(previous_taxon[level], taxon[level]), error_string
|
|
201
|
-
|
|
202
|
-
taxon['taxon_name'] = taxon_name
|
|
203
|
-
if taxon_name == 'homo sapiens':
|
|
204
|
-
human_taxa.append(taxon)
|
|
205
|
-
wi_taxon_name_to_taxa[taxon_name].append(taxon)
|
|
192
|
+
assert taxon['taxonomyType'] == 'object'
|
|
193
|
+
taxon_name = taxon['commonNameEnglish'].strip().lower()
|
|
206
194
|
|
|
207
|
-
|
|
195
|
+
if taxon_name in wi_taxon_name_to_taxa:
|
|
196
|
+
if taxon[id_column] in known_problematic_taxon_ids:
|
|
197
|
+
print('Skipping problematic taxon ID {}'.format(taxon[id_column]))
|
|
198
|
+
else:
|
|
199
|
+
previous_taxa = wi_taxon_name_to_taxa[taxon_name]
|
|
200
|
+
for previous_taxon in previous_taxa:
|
|
201
|
+
for level in ['class', 'order', 'family', 'genus', 'species']:
|
|
202
|
+
error_string = 'Error: taxon {} appeared previously in {} {} (as {}), now in {} {}'.format(
|
|
203
|
+
taxon_name,
|
|
204
|
+
level,previous_taxon[level],
|
|
205
|
+
previous_taxon['taxon_name'],
|
|
206
|
+
level,taxon[level])
|
|
207
|
+
assert taxonomy_items_equal(previous_taxon[level], taxon[level]), error_string
|
|
208
|
+
|
|
209
|
+
taxon['taxon_name'] = taxon_name
|
|
210
|
+
if taxon_name == 'homo sapiens':
|
|
211
|
+
human_taxa.append(taxon)
|
|
212
|
+
wi_taxon_name_to_taxa[taxon_name].append(taxon)
|
|
208
213
|
|
|
209
|
-
|
|
210
|
-
assert animal_taxon is not None
|
|
211
|
-
assert blank_taxon is not None
|
|
214
|
+
# ...for each taxon
|
|
212
215
|
|
|
216
|
+
assert unknown_taxon is not None
|
|
217
|
+
assert animal_taxon is not None
|
|
218
|
+
assert blank_taxon is not None
|
|
213
219
|
|
|
214
|
-
#%% Find redundant taxa
|
|
215
220
|
|
|
216
|
-
|
|
217
|
-
for wi_taxon_name in wi_taxon_name_to_taxa:
|
|
218
|
-
if len(wi_taxon_name_to_taxa[wi_taxon_name]) > 1:
|
|
219
|
-
taxon_names_with_multiple_entries.append(wi_taxon_name)
|
|
221
|
+
#%% Find redundant taxa
|
|
220
222
|
|
|
221
|
-
|
|
223
|
+
taxon_names_with_multiple_entries = []
|
|
224
|
+
for wi_taxon_name in wi_taxon_name_to_taxa:
|
|
225
|
+
if len(wi_taxon_name_to_taxa[wi_taxon_name]) > 1:
|
|
226
|
+
taxon_names_with_multiple_entries.append(wi_taxon_name)
|
|
222
227
|
|
|
223
|
-
|
|
224
|
-
print(s)
|
|
228
|
+
print('{} names have multiple entries\n:'.format(len(taxon_names_with_multiple_entries)))
|
|
225
229
|
|
|
226
|
-
|
|
227
|
-
|
|
230
|
+
for s in taxon_names_with_multiple_entries:
|
|
231
|
+
print(s)
|
|
228
232
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
s = taxon_names_with_multiple_entries[15]
|
|
232
|
-
taxa = wi_taxon_name_to_taxa[s]
|
|
233
|
-
for t in taxa:
|
|
234
|
-
for k in t.keys():
|
|
235
|
-
print('{}: {}'.format(k,t[k]))
|
|
236
|
-
print()
|
|
237
|
-
# print(t,end='\n\n')
|
|
233
|
+
if False:
|
|
234
|
+
pass
|
|
238
235
|
|
|
236
|
+
#%% Manual review of redundant taxa
|
|
237
|
+
|
|
238
|
+
s = taxon_names_with_multiple_entries[15]
|
|
239
|
+
taxa = wi_taxon_name_to_taxa[s]
|
|
240
|
+
for t in taxa:
|
|
241
|
+
for k in t.keys():
|
|
242
|
+
print('{}: {}'.format(k,t[k]))
|
|
243
|
+
print()
|
|
244
|
+
# print(t,end='\n\n')
|
|
239
245
|
|
|
240
|
-
#%% Clean up redundant taxa
|
|
241
246
|
|
|
242
|
-
|
|
247
|
+
#%% Clean up redundant taxa
|
|
243
248
|
|
|
244
|
-
|
|
245
|
-
taxon_name_to_preferred_taxon_id['numida meleagris'] = '83133617-8358-4910-82ee-4c23e40ba3dc' # 2005826
|
|
249
|
+
taxon_name_to_preferred_taxon_id = {}
|
|
246
250
|
|
|
247
|
-
# "
|
|
248
|
-
taxon_name_to_preferred_taxon_id['meleagris
|
|
251
|
+
# "helmeted guineafowl" vs "domestic guineafowl"
|
|
252
|
+
taxon_name_to_preferred_taxon_id['numida meleagris'] = '83133617-8358-4910-82ee-4c23e40ba3dc' # 2005826
|
|
249
253
|
|
|
250
|
-
#
|
|
251
|
-
taxon_name_to_preferred_taxon_id['
|
|
254
|
+
# "domestic turkey" vs. "wild turkey"
|
|
255
|
+
taxon_name_to_preferred_taxon_id['meleagris gallopavo'] = 'c10547c3-1748-48bf-a451-8066c820f22f' # 2021598
|
|
252
256
|
|
|
253
|
-
#
|
|
254
|
-
taxon_name_to_preferred_taxon_id['
|
|
257
|
+
# multiple sensible human entries
|
|
258
|
+
taxon_name_to_preferred_taxon_id['homo sapiens'] = '990ae9dd-7a59-4344-afcb-1b7b21368000' # 2002045
|
|
255
259
|
|
|
256
|
-
# "
|
|
257
|
-
taxon_name_to_preferred_taxon_id['
|
|
260
|
+
# "domestic dog" and "dog-on-leash"
|
|
261
|
+
taxon_name_to_preferred_taxon_id['canis familiaris'] = '3d80f1d6-b1df-4966-9ff4-94053c7a902a' # 2021548
|
|
258
262
|
|
|
259
|
-
# "
|
|
260
|
-
taxon_name_to_preferred_taxon_id['
|
|
263
|
+
# "small mammal" vs. "mammal"
|
|
264
|
+
taxon_name_to_preferred_taxon_id['mammalia'] = 'f2d233e3-80e3-433d-9687-e29ecc7a467a' # 2021108
|
|
261
265
|
|
|
262
|
-
# "
|
|
263
|
-
taxon_name_to_preferred_taxon_id['
|
|
266
|
+
# "Hispaniolan Mango" vs. NaN
|
|
267
|
+
taxon_name_to_preferred_taxon_id['anthracothorax dominicus'] = 'f94e6d97-59cf-4d38-a05a-a75efdd2863b'
|
|
264
268
|
|
|
265
|
-
#
|
|
266
|
-
taxon_name_to_preferred_taxon_id['
|
|
269
|
+
# "millipedes" vs. "Millipede"
|
|
270
|
+
taxon_name_to_preferred_taxon_id['diplopoda'] = '065884eb-4e64-4233-84dc-de25bd06ffd2' # 2021760
|
|
267
271
|
|
|
268
|
-
#
|
|
269
|
-
taxon_name_to_preferred_taxon_id['
|
|
272
|
+
# Different suborders: Squamata vs. Lacertilia
|
|
273
|
+
taxon_name_to_preferred_taxon_id['squamata'] = '710c4066-bd5d-4313-bcf4-0217c4c84da7' # 2021703
|
|
270
274
|
|
|
271
|
-
#
|
|
272
|
-
taxon_name_to_preferred_taxon_id['
|
|
275
|
+
# Redundancy (both "beautiful firetail")
|
|
276
|
+
taxon_name_to_preferred_taxon_id['stagonopleura bella'] = '7fec8e7e-fd3b-4d7f-99fd-3ade6f3bbaa5' # 2021939
|
|
273
277
|
|
|
274
|
-
# "
|
|
275
|
-
taxon_name_to_preferred_taxon_id['
|
|
278
|
+
# "yellow wagtail" vs. "yellow crowned-wagtail"
|
|
279
|
+
taxon_name_to_preferred_taxon_id['motacilla flava'] = 'ac6669bc-9f9e-4473-b609-b9082f9bf50c' # 2016194
|
|
276
280
|
|
|
277
|
-
# "
|
|
278
|
-
taxon_name_to_preferred_taxon_id['
|
|
281
|
+
# "dremomys species" vs. "dremomys genus"
|
|
282
|
+
taxon_name_to_preferred_taxon_id['dremomys'] = '1507d153-af11-46f1-bfb8-77918d035ab3' # 2019370
|
|
279
283
|
|
|
280
|
-
# "
|
|
281
|
-
taxon_name_to_preferred_taxon_id['
|
|
284
|
+
# "elk" vs. "domestic elk"
|
|
285
|
+
taxon_name_to_preferred_taxon_id['cervus canadensis'] = 'c5ce946f-8f0d-4379-992b-cc0982381f5e'
|
|
282
286
|
|
|
283
|
-
# "
|
|
284
|
-
taxon_name_to_preferred_taxon_id['
|
|
287
|
+
# "American bison" vs. "domestic bison"
|
|
288
|
+
taxon_name_to_preferred_taxon_id['bison bison'] = '539ebd55-081b-429a-9ae6-5a6a0f6999d4' # 2021593
|
|
285
289
|
|
|
286
|
-
#
|
|
287
|
-
taxon_name_to_preferred_taxon_id['
|
|
290
|
+
# "woodrat or rat or mouse species" vs. "mouse species"
|
|
291
|
+
taxon_name_to_preferred_taxon_id['muridae'] = 'e7503287-468c-45af-a1bd-a17821bb62f2' # 2021642
|
|
288
292
|
|
|
289
|
-
#
|
|
290
|
-
taxon_name_to_preferred_taxon_id['
|
|
293
|
+
# both "southern sand frog"
|
|
294
|
+
taxon_name_to_preferred_taxon_id['tomopterna adiastola'] = 'a5dc63cb-41be-4090-84a7-b944b16dcee4' # 2021834
|
|
291
295
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
296
|
+
# sericornis species vs. scrubwren species
|
|
297
|
+
taxon_name_to_preferred_taxon_id['sericornis'] = 'ad82c0ac-df48-4028-bf71-d2b2f4bc4129' # 2021776
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# taxon_name = list(taxon_name_to_preferred_taxon_id.keys())[0]
|
|
301
|
+
for taxon_name in taxon_name_to_preferred_taxon_id.keys():
|
|
302
|
+
|
|
303
|
+
candidate_taxa = wi_taxon_name_to_taxa[taxon_name]
|
|
304
|
+
|
|
305
|
+
# If we've gotten this far, we should be choosing from multiple taxa.
|
|
306
|
+
#
|
|
307
|
+
# This will become untrue if any of these are resolved later, at which point we should
|
|
308
|
+
# remove them from taxon_name_to_preferred_id
|
|
309
|
+
assert len(candidate_taxa) > 1, 'Only one taxon available for {}'.format(taxon_name)
|
|
310
|
+
|
|
311
|
+
# Choose the preferred taxa
|
|
312
|
+
selected_taxa = [t for t in candidate_taxa if t[id_column] == \
|
|
313
|
+
taxon_name_to_preferred_taxon_id[taxon_name]]
|
|
314
|
+
assert len(selected_taxa) == 1
|
|
315
|
+
wi_taxon_name_to_taxa[taxon_name] = selected_taxa
|
|
309
316
|
|
|
310
|
-
wi_taxon_name_to_taxon = {}
|
|
317
|
+
wi_taxon_name_to_taxon = {}
|
|
311
318
|
|
|
312
|
-
for taxon_name in wi_taxon_name_to_taxa.keys():
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
319
|
+
for taxon_name in wi_taxon_name_to_taxa.keys():
|
|
320
|
+
taxa = wi_taxon_name_to_taxa[taxon_name]
|
|
321
|
+
assert len(taxa) == 1
|
|
322
|
+
wi_taxon_name_to_taxon[taxon_name] = taxa[0]
|
|
316
323
|
|
|
317
324
|
|
|
318
|
-
#%% Read supplementary mappings
|
|
325
|
+
#%% Read supplementary mappings
|
|
319
326
|
|
|
320
|
-
with open(lila_to_wi_supplementary_mapping_file, 'r') as f:
|
|
321
|
-
|
|
327
|
+
with open(lila_to_wi_supplementary_mapping_file, 'r') as f:
|
|
328
|
+
lines = f.readlines()
|
|
322
329
|
|
|
323
|
-
supplementary_lila_query_to_wi_query = {}
|
|
330
|
+
supplementary_lila_query_to_wi_query = {}
|
|
324
331
|
|
|
325
|
-
for line in lines:
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
332
|
+
for line in lines:
|
|
333
|
+
# Each line is [lila query],[WI taxon name],[notes]
|
|
334
|
+
tokens = line.strip().split(',')
|
|
335
|
+
assert len(tokens) == 3
|
|
336
|
+
lila_query = tokens[0].strip().lower()
|
|
337
|
+
wi_taxon_name = tokens[1].strip().lower()
|
|
338
|
+
assert wi_taxon_name in wi_taxon_name_to_taxa
|
|
339
|
+
supplementary_lila_query_to_wi_query[lila_query] = wi_taxon_name
|
|
333
340
|
|
|
334
341
|
|
|
335
|
-
#%% Map LILA categories to WI categories
|
|
342
|
+
#%% Map LILA categories to WI categories
|
|
336
343
|
|
|
337
|
-
mismatches = set()
|
|
338
|
-
mismatches_with_common_mappings = set()
|
|
339
|
-
supplementary_mappings = set()
|
|
344
|
+
mismatches = set()
|
|
345
|
+
mismatches_with_common_mappings = set()
|
|
346
|
+
supplementary_mappings = set()
|
|
340
347
|
|
|
341
|
-
all_searches = set()
|
|
348
|
+
all_searches = set()
|
|
342
349
|
|
|
343
|
-
# Must be ordered from kingdom --> species
|
|
344
|
-
lila_taxonomy_levels = ['kingdom', 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
|
|
345
|
-
|
|
346
|
-
|
|
350
|
+
# Must be ordered from kingdom --> species
|
|
351
|
+
lila_taxonomy_levels = ['kingdom', 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
|
|
352
|
+
'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
|
|
353
|
+
'superfamily', 'family', 'subfamily', 'tribe', 'genus', 'species']
|
|
347
354
|
|
|
348
|
-
unknown_queries = set(
|
|
349
|
-
|
|
350
|
-
blank_queries = set(['empty'])
|
|
351
|
-
animal_queries = set(['animalia'])
|
|
355
|
+
unknown_queries = set(
|
|
356
|
+
['unidentifiable', 'other', 'unidentified', 'unknown', 'unclassifiable'])
|
|
357
|
+
blank_queries = set(['empty'])
|
|
358
|
+
animal_queries = set(['animalia'])
|
|
352
359
|
|
|
353
|
-
lila_dataset_category_to_wi_taxon = {}
|
|
360
|
+
lila_dataset_category_to_wi_taxon = {}
|
|
354
361
|
|
|
355
|
-
# i_taxon = 0; taxon = lila_taxonomy[i_taxon]; print(taxon)
|
|
356
|
-
for i_taxon, lila_taxon in enumerate(lila_taxonomy):
|
|
362
|
+
# i_taxon = 0; taxon = lila_taxonomy[i_taxon]; print(taxon)
|
|
363
|
+
for i_taxon, lila_taxon in enumerate(lila_taxonomy):
|
|
357
364
|
|
|
358
|
-
|
|
365
|
+
query = None
|
|
359
366
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
+
lila_dataset_category = lila_taxon['dataset_name'] + ':' + lila_taxon['query']
|
|
368
|
+
|
|
369
|
+
# Go from kingdom --> species, choosing the lowest-level description as the query
|
|
370
|
+
for level in lila_taxonomy_levels:
|
|
371
|
+
if isinstance(lila_taxon[level], str):
|
|
372
|
+
query = lila_taxon[level]
|
|
373
|
+
all_searches.add(query)
|
|
367
374
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
375
|
+
if query is None:
|
|
376
|
+
# E.g., 'car'
|
|
377
|
+
query = lila_taxon['query']
|
|
371
378
|
|
|
372
|
-
|
|
379
|
+
wi_taxon = None
|
|
373
380
|
|
|
374
|
-
|
|
381
|
+
if query in unknown_queries:
|
|
375
382
|
|
|
376
|
-
|
|
383
|
+
wi_taxon = unknown_taxon
|
|
377
384
|
|
|
378
|
-
|
|
385
|
+
elif query in blank_queries:
|
|
379
386
|
|
|
380
|
-
|
|
387
|
+
wi_taxon = blank_taxon
|
|
381
388
|
|
|
382
|
-
|
|
389
|
+
elif query in animal_queries:
|
|
383
390
|
|
|
384
|
-
|
|
391
|
+
wi_taxon = animal_taxon
|
|
385
392
|
|
|
386
|
-
|
|
393
|
+
elif query in wi_taxon_name_to_taxon:
|
|
387
394
|
|
|
388
|
-
|
|
395
|
+
wi_taxon = wi_taxon_name_to_taxon[query]
|
|
389
396
|
|
|
390
|
-
|
|
397
|
+
elif query in supplementary_lila_query_to_wi_query:
|
|
391
398
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
399
|
+
wi_taxon = wi_taxon_name_to_taxon[supplementary_lila_query_to_wi_query[query]]
|
|
400
|
+
supplementary_mappings.add(query)
|
|
401
|
+
# print('Made a supplementary mapping from {} to {}'.format(query,wi_taxon['taxon_name']))
|
|
395
402
|
|
|
396
|
-
|
|
403
|
+
else:
|
|
397
404
|
|
|
398
|
-
|
|
399
|
-
|
|
405
|
+
# print('No match for {}'.format(query))
|
|
406
|
+
lila_common_name = lila_taxon['common_name']
|
|
400
407
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
408
|
+
if lila_common_name in wi_common_name_to_taxon:
|
|
409
|
+
wi_taxon = wi_common_name_to_taxon[lila_common_name]
|
|
410
|
+
wi_common_name = wi_taxon['commonNameEnglish']
|
|
411
|
+
wi_taxon_name = wi_taxon['taxon_name']
|
|
412
|
+
if False:
|
|
413
|
+
print('LILA common name {} maps to WI taxon {} ({})'.format(lila_common_name,
|
|
414
|
+
wi_taxon_name,
|
|
415
|
+
wi_common_name))
|
|
416
|
+
mismatches_with_common_mappings.add(query)
|
|
410
417
|
|
|
411
|
-
|
|
418
|
+
else:
|
|
412
419
|
|
|
413
|
-
|
|
420
|
+
mismatches.add(query)
|
|
414
421
|
|
|
415
|
-
|
|
422
|
+
lila_dataset_category_to_wi_taxon[lila_dataset_category] = wi_taxon
|
|
416
423
|
|
|
417
|
-
# ...for each LILA taxon
|
|
424
|
+
# ...for each LILA taxon
|
|
418
425
|
|
|
419
|
-
print('Of {} entities, there are {} mismatches ({} mapped by common name) ({} mapped by supplementary mapping file)'.format(
|
|
420
|
-
|
|
426
|
+
print('Of {} entities, there are {} mismatches ({} mapped by common name) ({} mapped by supplementary mapping file)'.format(
|
|
427
|
+
len(all_searches), len(mismatches), len(mismatches_with_common_mappings), len(supplementary_mappings)))
|
|
421
428
|
|
|
422
|
-
assert len(mismatches) == 0
|
|
429
|
+
assert len(mismatches) == 0
|
|
423
430
|
|
|
424
431
|
|
|
425
|
-
#%% Manual mapping
|
|
432
|
+
#%% Manual mapping
|
|
426
433
|
|
|
427
|
-
if not os.path.isfile(lila_to_wi_supplementary_mapping_file):
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
else:
|
|
434
|
-
|
|
434
|
+
if not os.path.isfile(lila_to_wi_supplementary_mapping_file):
|
|
435
|
+
print('Creating mapping file {}'.format(
|
|
436
|
+
lila_to_wi_supplementary_mapping_file))
|
|
437
|
+
with open(lila_to_wi_supplementary_mapping_file, 'w') as f:
|
|
438
|
+
for query in mismatches:
|
|
439
|
+
f.write(query + ',' + '\n')
|
|
440
|
+
else:
|
|
441
|
+
print('{} exists, not re-writing'.format(lila_to_wi_supplementary_mapping_file))
|
|
435
442
|
|
|
436
443
|
|
|
437
|
-
#%% Build a dictionary from LILA dataset names and categories to LILA taxa
|
|
444
|
+
#%% Build a dictionary from LILA dataset names and categories to LILA taxa
|
|
438
445
|
|
|
439
|
-
lila_dataset_category_to_lila_taxon = {}
|
|
446
|
+
lila_dataset_category_to_lila_taxon = {}
|
|
440
447
|
|
|
441
|
-
# i_d = 0; d = lila_taxonomy[i_d]
|
|
442
|
-
for i_d,d in enumerate(lila_taxonomy):
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
448
|
+
# i_d = 0; d = lila_taxonomy[i_d]
|
|
449
|
+
for i_d,d in enumerate(lila_taxonomy):
|
|
450
|
+
lila_dataset_category = d['dataset_name'] + ':' + d['query']
|
|
451
|
+
assert lila_dataset_category not in lila_dataset_category_to_lila_taxon
|
|
452
|
+
lila_dataset_category_to_lila_taxon[lila_dataset_category] = d
|
|
446
453
|
|
|
447
454
|
|
|
448
|
-
#%% Map LILA datasets to WI taxa, and count the number of each taxon available in each dataset
|
|
455
|
+
#%% Map LILA datasets to WI taxa, and count the number of each taxon available in each dataset
|
|
449
456
|
|
|
450
|
-
with open(wi_mapping_table_file,'w') as f:
|
|
451
|
-
|
|
452
|
-
f.write('lila_dataset_name,lila_category_name,wi_guid,wi_taxon_name,wi_common,count\n')
|
|
453
|
-
|
|
454
|
-
# dataset_name = list(lila_dataset_to_categories.keys())[0]
|
|
455
|
-
for dataset_name in lila_dataset_to_categories.keys():
|
|
457
|
+
with open(wi_mapping_table_file,'w') as f:
|
|
456
458
|
|
|
457
|
-
|
|
458
|
-
continue
|
|
459
|
-
|
|
460
|
-
dataset_categories = lila_dataset_to_categories[dataset_name]
|
|
459
|
+
f.write('lila_dataset_name,lila_category_name,wi_guid,wi_taxon_name,wi_common,count\n')
|
|
461
460
|
|
|
462
|
-
#
|
|
463
|
-
for
|
|
461
|
+
# dataset_name = list(lila_dataset_to_categories.keys())[0]
|
|
462
|
+
for dataset_name in lila_dataset_to_categories.keys():
|
|
464
463
|
|
|
465
|
-
|
|
466
|
-
if '#' in lila_dataset_category:
|
|
464
|
+
if '_bbox' in dataset_name:
|
|
467
465
|
continue
|
|
468
|
-
assert lila_dataset_category in lila_dataset_category_to_lila_taxon
|
|
469
|
-
assert lila_dataset_category in lila_dataset_category_to_wi_taxon
|
|
470
|
-
assert 'count' in category
|
|
471
|
-
|
|
472
|
-
wi_taxon = lila_dataset_category_to_wi_taxon[lila_dataset_category]
|
|
473
466
|
|
|
474
|
-
|
|
475
|
-
# and count
|
|
476
|
-
s = f"{dataset_name},{category['name']},{wi_taxon['uniqueIdentifier']},"+\
|
|
477
|
-
f"{wi_taxon['taxon_name']},{wi_taxon['commonNameEnglish']},{category['count']}\n"
|
|
478
|
-
f.write(s)
|
|
467
|
+
dataset_categories = lila_dataset_to_categories[dataset_name]
|
|
479
468
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
469
|
+
# dataset_category = dataset_categories[0]
|
|
470
|
+
for category in dataset_categories:
|
|
471
|
+
|
|
472
|
+
lila_dataset_category = dataset_name + ':' + category['name'].strip().lower()
|
|
473
|
+
if '#' in lila_dataset_category:
|
|
474
|
+
continue
|
|
475
|
+
assert lila_dataset_category in lila_dataset_category_to_lila_taxon
|
|
476
|
+
assert lila_dataset_category in lila_dataset_category_to_wi_taxon
|
|
477
|
+
assert 'count' in category
|
|
478
|
+
|
|
479
|
+
wi_taxon = lila_dataset_category_to_wi_taxon[lila_dataset_category]
|
|
480
|
+
|
|
481
|
+
# Write out the dataset name, category name, WI GUID, WI scientific name, WI common name,
|
|
482
|
+
# and count
|
|
483
|
+
s = f"{dataset_name},{category['name']},{wi_taxon['uniqueIdentifier']},"+\
|
|
484
|
+
f"{wi_taxon['taxon_name']},{wi_taxon['commonNameEnglish']},{category['count']}\n"
|
|
485
|
+
f.write(s)
|
|
486
|
+
|
|
487
|
+
# ...for each category in this dataset
|
|
488
|
+
|
|
489
|
+
# ...for each dataset
|
|
483
490
|
|
|
484
|
-
# ...with open()
|
|
491
|
+
# ...with open()
|