megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.9-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +65 -65
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
- api/batch_processing/postprocessing/compare_batch_results.py +113 -43
- api/batch_processing/postprocessing/convert_output_format.py +41 -16
- api/batch_processing/postprocessing/load_api_results.py +16 -17
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +52 -22
- api/batch_processing/postprocessing/merge_detections.py +14 -14
- api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
- api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +102 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -263
- data_management/coco_to_yolo.py +79 -58
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +62 -24
- data_management/databases/subset_json_db.py +24 -15
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -162
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -158
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +7 -7
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +65 -24
- data_management/labelme_to_yolo.py +8 -8
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +13 -13
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +44 -110
- data_management/lila/generate_lila_per_image_labels.py +55 -42
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +96 -33
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +110 -97
- data_management/remap_coco_categories.py +83 -83
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +30 -23
- data_management/wi_download_csv_to_coco.py +246 -239
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +300 -60
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +179 -113
- detection/run_inference_with_yolov5_val.py +108 -48
- detection/run_tiled_inference.py +111 -40
- detection/tf_detector.py +51 -29
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +228 -68
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -871
- md_utils/path_utils.py +460 -134
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +176 -60
- md_utils/write_html_image_list.py +40 -33
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +597 -291
- md_visualization/visualize_db.py +76 -48
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
- megadetector-5.0.9.dist-info/RECORD +224 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- megadetector-5.0.8.dist-info/RECORD +0 -205
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
taxonomy_mapping/map_new_lila_datasets.py

@@ -1,154 +1,154 @@
-[9 lines removed: the previous file header; its text is not recoverable from this rendering]
+"""
+
+map_new_lila_datasets.py
+
+Given a subset of LILA datasets, find all the categories, and start the taxonomy
+mapping process.
+
+"""
+
 #%% Constants and imports

 import os
 import json

 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')

 output_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')

 datasets_to_map = [
     'Trail Camera Images of New Zealand Animals'
     ]


 #%% Initialize taxonomic lookup

 from taxonomy_mapping.species_lookup import (
     initialize_taxonomy_lookup,
     get_preferred_taxonomic_match)

 # from taxonomy_mapping.species_lookup import (
 #     get_taxonomic_info, print_taxonomy_matche)

 initialize_taxonomy_lookup(force_init=False)


 #%% Read the list of datasets

 with open(input_lila_category_list_file,'r') as f:
     input_lila_categories = json.load(f)

 lila_datasets = set()

 for dataset_name in input_lila_categories.keys():
     # The script that generates this dictionary creates a separate entry for bounding box
     # metadata files, but those don't represent new dataset names
     lila_datasets.add(dataset_name.replace('_bbox',''))

 for s in datasets_to_map:
     assert s in lila_datasets


 #%% Find all categories

 category_mappings = []

 # dataset_name = datasets_to_map[0]
 for dataset_name in datasets_to_map:

     ds_categories = input_lila_categories[dataset_name]
     for category in ds_categories:
         category_name = category['name']
         assert ':' not in category_name
         mapping_name = dataset_name + ':' + category_name
         category_mappings.append(mapping_name)

 print('Need to create {} mappings'.format(len(category_mappings)))


 #%% Match every query against our taxonomies

 output_rows = []

 taxonomy_preference = 'inat'

 allow_non_preferred_matches = True

 # mapping_string = category_mappings[1]; print(mapping_string)
 for mapping_string in category_mappings:

     tokens = mapping_string.split(':')
     assert len(tokens) == 2

     dataset_name = tokens[0]
     query = tokens[1]

     taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)

     if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:

         output_row = {
             'dataset_name': dataset_name,
             'query': query,
             'source': taxonomic_match.source,
             'taxonomy_level': taxonomic_match.taxonomic_level,
             'scientific_name': taxonomic_match.scientific_name,
             'common_name': taxonomic_match.common_name,
             'taxonomy_string': taxonomic_match.taxonomy_string
         }

     else:

         output_row = {
             'dataset_name': dataset_name,
             'query': query,
             'source': '',
             'taxonomy_level': '',
             'scientific_name': '',
             'common_name': '',
             'taxonomy_string': ''
         }

     output_rows.append(output_row)

 # ...for each mapping


 #%% Write output rows

 import os
 import pandas as pd

 assert not os.path.isfile(output_file), 'Delete the output file before re-generating'

 output_df = pd.DataFrame(data=output_rows, columns=[
     'dataset_name', 'query', 'source', 'taxonomy_level',
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)


 #%% Manual lookup

 if False:

     #%%

     # q = 'white-throated monkey'
     # q = 'cingulata'
     # q = 'notamacropus'
     q = 'porzana'
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)

     if m is None:
         print('No match')
     else:
         if m.source != taxonomy_preference:
             print('\n*** non-preferred match ***\n')
             # raise ValueError('')
         print(m.source)
         print(m.taxonomy_string)
         # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
         import clipboard; clipboard.copy(m.taxonomy_string)
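The core flow of this script: build dataset:category keys, resolve each category name against the preferred taxonomy, and emit one CSV row per category, with blank fields when there is no acceptable match. Below is a minimal, self-contained sketch of that flow; the lookup stub and its toy table are hypothetical stand-ins for taxonomy_mapping.species_lookup.get_preferred_taxonomic_match, which actually returns an object with source, taxonomic_level, scientific_name, common_name, and taxonomy_string attributes, as used above.

import pandas as pd

def lookup(query):
    # Hypothetical stand-in for get_preferred_taxonomic_match(); returns
    # (source, level, scientific_name, common_name), or None on no match.
    table = {'porzana': ('inat', 'genus', 'porzana', 'crakes')}
    return table.get(query.lower())

datasets_to_map = {'Trail Camera Images of New Zealand Animals': ['porzana', 'unknown']}

output_rows = []
for dataset_name, categories in datasets_to_map.items():
    for category_name in categories:
        assert ':' not in category_name  # ':' is reserved as the mapping-key separator
        m = lookup(category_name)
        source, level, scientific_name, common_name = m if m is not None else ('', '', '', '')
        output_rows.append({'dataset_name': dataset_name, 'query': category_name,
                            'source': source, 'taxonomy_level': level,
                            'scientific_name': scientific_name, 'common_name': common_name})

pd.DataFrame(output_rows).to_csv('lila_additions_sketch.csv', index=False)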
taxonomy_mapping/prepare_lila_taxonomy_release.py

@@ -1,134 +1,142 @@
-[134 lines removed: the previous version of this file; beyond the "#%% Imports and constants" cell and the os/json/pandas imports, its text is not recoverable from this rendering]
+"""
+
+prepare_lila_taxonomy_release.py
+
+Given the private intermediate taxonomy mapping (produced by map_new_lila_datasets.py),
+prepare the public (release) taxonomy mapping file.
+
+"""
+
+#%% Imports and constants
+
+import os
+import json
+import pandas as pd
+
+
+#%% Prevent execution during infrastructural imports
+
+if False:
+
+    #%% Filenames
+
+    lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
+    release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
+    # import clipboard; clipboard.copy(release_taxonomy_file)
+
+    # Created by get_lila_category_list.py... contains counts for each category
+    lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
+
+    assert os.path.isfile(lila_dataset_to_categories_file)
+    assert os.path.isfile(lila_taxonomy_file)
+
+
+    #%% Find out which categories are actually used
+
+    df = pd.read_csv(lila_taxonomy_file)
+
+    with open(lila_dataset_to_categories_file,'r') as f:
+        lila_dataset_to_categories = json.load(f)
+
+    used_category_mappings = []
+
+    # dataset_name = datasets_to_map[0]
+    for dataset_name in lila_dataset_to_categories.keys():
+
+        ds_categories = lila_dataset_to_categories[dataset_name]
+        for category in ds_categories:
+            category_name = category['name'].lower()
+            assert ':' not in category_name
+            mapping_name = dataset_name + ':' + category_name
+            used_category_mappings.append(mapping_name)
+
+    df['used'] = False
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+        ds_name = row['dataset_name']
+        query = row['query']
+        mapping_name = ds_name + ':' + query
+        if mapping_name in used_category_mappings:
+            df.loc[i_row,'used'] = True
+        else:
+            print('Dropping unused mapping {}'.format(mapping_name))
+
+    df = df[df.used]
+    df = df.drop('used',axis=1)
+
+
+    #%% Generate the final output file
+
+    assert not os.path.isfile(release_taxonomy_file)
+
+    known_levels = ['stateofmatter', #noqa
+                    'kingdom',
+                    'phylum','subphylum',
+                    'superclass','class','subclass','infraclass',
+                    'superorder','order','parvorder','suborder','infraorder',
+                    'zoosection',
+                    'superfamily','family','subfamily','tribe',
+                    'genus',
+                    'species','subspecies','variety']
+
+    levels_to_include = ['kingdom',
+                         'phylum','subphylum',
+                         'superclass','class','subclass','infraclass',
+                         'superorder','order','suborder','infraorder',
+                         'superfamily','family','subfamily','tribe',
+                         'genus',
+                         'species','subspecies','variety']
+
+    levels_to_exclude = ['stateofmatter','zoosection','parvorder']
+
+    for s in levels_to_exclude:
+        assert s not in levels_to_include
+
+    levels_used = set()
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+
+        if not isinstance(row['scientific_name'],str):
+            assert not isinstance(row['taxonomy_string'],str)
+            continue
+
+        taxonomic_match = eval(row['taxonomy_string'])
+
+        # match_at_level = taxonomic_match[0]
+        for match_at_level in taxonomic_match:
+            assert len(match_at_level) == 4
+            levels_used.add(match_at_level[1])
+
+    levels_used = [s for s in levels_used if isinstance(s,str)]
+
+    for s in levels_used:
+        assert s in levels_to_exclude or s in levels_to_include, 'Unrecognized level {}'.format(s)
+
+    for s in levels_to_include:
+        assert s in levels_used
+
+    for s in levels_to_include:
+        df[s] = ''
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+
+        if not isinstance(row['scientific_name'],str):
+            assert not isinstance(row['taxonomy_string'],str)
+            continue
+
+        # E.g.: (43117, 'genus', 'lepus', ['hares and jackrabbits']
+        taxonomic_match = eval(row['taxonomy_string'])
+
+        for match_at_level in taxonomic_match:
+            level = match_at_level[1]
+            if level in levels_to_include:
+                df.loc[i_row,level] = match_at_level[2]
+
+    df = df.drop('source',axis=1)
+    df.to_csv(release_taxonomy_file,header=True,index=False)
+
+    print('Wrote final output to {}'.format(release_taxonomy_file))
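Two patterns in this file are worth noting. The "if False:" guard under "Prevent execution during infrastructural imports" keeps this notebook-style script inert when the module is imported, while its #%% cells remain individually runnable in an IDE. Separately, each taxonomy_string cell holds the string form of a sequence of 4-tuples, decoded above with eval. Below is a sketch of the same decoding using ast.literal_eval instead; the sample tuple comes from the code comment above, and treating literal_eval as a drop-in assumes the strings contain only Python literals, which that tuple-of-literals format suggests.

import ast

# One row's taxonomy_string: a sequence of (id, level, name, common_names)
# tuples, per the len == 4 assertion above; sample values from the code comment.
taxonomy_string = "[(43117, 'genus', 'lepus', ['hares and jackrabbits'])]"

# literal_eval parses only Python literals, so malformed or hostile input
# raises an error instead of executing code the way bare eval() can.
taxonomic_match = ast.literal_eval(taxonomy_string)

level_to_name = {}
for taxon_id, level, name, common_names in taxonomic_match:
    level_to_name[level] = name

print(level_to_name)  # {'genus': 'lepus'}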