megadetector-10.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/__init__.py +0 -0
- megadetector/api/__init__.py +0 -0
- megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
- megadetector/classification/__init__.py +0 -0
- megadetector/classification/aggregate_classifier_probs.py +108 -0
- megadetector/classification/analyze_failed_images.py +227 -0
- megadetector/classification/cache_batchapi_outputs.py +198 -0
- megadetector/classification/create_classification_dataset.py +626 -0
- megadetector/classification/crop_detections.py +516 -0
- megadetector/classification/csv_to_json.py +226 -0
- megadetector/classification/detect_and_crop.py +853 -0
- megadetector/classification/efficientnet/__init__.py +9 -0
- megadetector/classification/efficientnet/model.py +415 -0
- megadetector/classification/efficientnet/utils.py +608 -0
- megadetector/classification/evaluate_model.py +520 -0
- megadetector/classification/identify_mislabeled_candidates.py +152 -0
- megadetector/classification/json_to_azcopy_list.py +63 -0
- megadetector/classification/json_validator.py +696 -0
- megadetector/classification/map_classification_categories.py +276 -0
- megadetector/classification/merge_classification_detection_output.py +509 -0
- megadetector/classification/prepare_classification_script.py +194 -0
- megadetector/classification/prepare_classification_script_mc.py +228 -0
- megadetector/classification/run_classifier.py +287 -0
- megadetector/classification/save_mislabeled.py +110 -0
- megadetector/classification/train_classifier.py +827 -0
- megadetector/classification/train_classifier_tf.py +725 -0
- megadetector/classification/train_utils.py +323 -0
- megadetector/data_management/__init__.py +0 -0
- megadetector/data_management/animl_to_md.py +161 -0
- megadetector/data_management/annotations/__init__.py +0 -0
- megadetector/data_management/annotations/annotation_constants.py +33 -0
- megadetector/data_management/camtrap_dp_to_coco.py +270 -0
- megadetector/data_management/cct_json_utils.py +566 -0
- megadetector/data_management/cct_to_md.py +184 -0
- megadetector/data_management/cct_to_wi.py +293 -0
- megadetector/data_management/coco_to_labelme.py +284 -0
- megadetector/data_management/coco_to_yolo.py +701 -0
- megadetector/data_management/databases/__init__.py +0 -0
- megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
- megadetector/data_management/databases/integrity_check_json_db.py +563 -0
- megadetector/data_management/databases/subset_json_db.py +195 -0
- megadetector/data_management/generate_crops_from_cct.py +200 -0
- megadetector/data_management/get_image_sizes.py +164 -0
- megadetector/data_management/labelme_to_coco.py +559 -0
- megadetector/data_management/labelme_to_yolo.py +349 -0
- megadetector/data_management/lila/__init__.py +0 -0
- megadetector/data_management/lila/create_lila_blank_set.py +556 -0
- megadetector/data_management/lila/create_lila_test_set.py +192 -0
- megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
- megadetector/data_management/lila/download_lila_subset.py +182 -0
- megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
- megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
- megadetector/data_management/lila/get_lila_image_counts.py +112 -0
- megadetector/data_management/lila/lila_common.py +319 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
- megadetector/data_management/mewc_to_md.py +344 -0
- megadetector/data_management/ocr_tools.py +873 -0
- megadetector/data_management/read_exif.py +964 -0
- megadetector/data_management/remap_coco_categories.py +195 -0
- megadetector/data_management/remove_exif.py +156 -0
- megadetector/data_management/rename_images.py +194 -0
- megadetector/data_management/resize_coco_dataset.py +665 -0
- megadetector/data_management/speciesnet_to_md.py +41 -0
- megadetector/data_management/wi_download_csv_to_coco.py +247 -0
- megadetector/data_management/yolo_output_to_md_output.py +594 -0
- megadetector/data_management/yolo_to_coco.py +984 -0
- megadetector/data_management/zamba_to_md.py +188 -0
- megadetector/detection/__init__.py +0 -0
- megadetector/detection/change_detection.py +840 -0
- megadetector/detection/process_video.py +479 -0
- megadetector/detection/pytorch_detector.py +1451 -0
- megadetector/detection/run_detector.py +1267 -0
- megadetector/detection/run_detector_batch.py +2172 -0
- megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
- megadetector/detection/run_md_and_speciesnet.py +1604 -0
- megadetector/detection/run_tiled_inference.py +1044 -0
- megadetector/detection/tf_detector.py +209 -0
- megadetector/detection/video_utils.py +1379 -0
- megadetector/postprocessing/__init__.py +0 -0
- megadetector/postprocessing/add_max_conf.py +72 -0
- megadetector/postprocessing/categorize_detections_by_size.py +166 -0
- megadetector/postprocessing/classification_postprocessing.py +1943 -0
- megadetector/postprocessing/combine_batch_outputs.py +249 -0
- megadetector/postprocessing/compare_batch_results.py +2110 -0
- megadetector/postprocessing/convert_output_format.py +403 -0
- megadetector/postprocessing/create_crop_folder.py +629 -0
- megadetector/postprocessing/detector_calibration.py +570 -0
- megadetector/postprocessing/generate_csv_report.py +522 -0
- megadetector/postprocessing/load_api_results.py +223 -0
- megadetector/postprocessing/md_to_coco.py +428 -0
- megadetector/postprocessing/md_to_labelme.py +351 -0
- megadetector/postprocessing/md_to_wi.py +41 -0
- megadetector/postprocessing/merge_detections.py +392 -0
- megadetector/postprocessing/postprocess_batch_results.py +2140 -0
- megadetector/postprocessing/remap_detection_categories.py +226 -0
- megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
- megadetector/postprocessing/separate_detections_into_folders.py +795 -0
- megadetector/postprocessing/subset_json_detector_output.py +964 -0
- megadetector/postprocessing/top_folders_to_bottom.py +238 -0
- megadetector/postprocessing/validate_batch_results.py +332 -0
- megadetector/taxonomy_mapping/__init__.py +0 -0
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
- megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
- megadetector/taxonomy_mapping/simple_image_download.py +231 -0
- megadetector/taxonomy_mapping/species_lookup.py +1008 -0
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
- megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
- megadetector/tests/__init__.py +0 -0
- megadetector/tests/test_nms_synthetic.py +335 -0
- megadetector/utils/__init__.py +0 -0
- megadetector/utils/ct_utils.py +1857 -0
- megadetector/utils/directory_listing.py +199 -0
- megadetector/utils/extract_frames_from_video.py +307 -0
- megadetector/utils/gpu_test.py +125 -0
- megadetector/utils/md_tests.py +2072 -0
- megadetector/utils/path_utils.py +2872 -0
- megadetector/utils/process_utils.py +172 -0
- megadetector/utils/split_locations_into_train_val.py +237 -0
- megadetector/utils/string_utils.py +234 -0
- megadetector/utils/url_utils.py +825 -0
- megadetector/utils/wi_platform_utils.py +968 -0
- megadetector/utils/wi_taxonomy_utils.py +1766 -0
- megadetector/utils/write_html_image_list.py +239 -0
- megadetector/visualization/__init__.py +0 -0
- megadetector/visualization/plot_utils.py +309 -0
- megadetector/visualization/render_images_with_thumbnails.py +243 -0
- megadetector/visualization/visualization_utils.py +1973 -0
- megadetector/visualization/visualize_db.py +630 -0
- megadetector/visualization/visualize_detector_output.py +498 -0
- megadetector/visualization/visualize_video_output.py +705 -0
- megadetector-10.0.15.dist-info/METADATA +115 -0
- megadetector-10.0.15.dist-info/RECORD +147 -0
- megadetector-10.0.15.dist-info/WHEEL +5 -0
- megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
- megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1943 @@

"""

classification_postprocessing.py

Functions for postprocessing species classification results, particularly:

* Smoothing results within an image (an image with 700 cows and one deer is really just 701
  cows)
* Smoothing results within a sequence (a sequence that looks like deer/deer/deer/elk/deer/deer
  is really just a deer)

"""

#%% Constants and imports

import os
import json
import copy
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

from megadetector.utils.ct_utils import is_list_sorted
from megadetector.utils.ct_utils import is_empty
from megadetector.utils.ct_utils import sort_dictionary_by_value
from megadetector.utils.ct_utils import sort_dictionary_by_key
from megadetector.utils.ct_utils import invert_dictionary
from megadetector.utils.ct_utils import write_json

from megadetector.utils.wi_taxonomy_utils import clean_taxonomy_string
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_index
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_string_to_index

from megadetector.utils.wi_taxonomy_utils import human_prediction_string
from megadetector.utils.wi_taxonomy_utils import animal_prediction_string
from megadetector.utils.wi_taxonomy_utils import is_taxonomic_prediction_string
from megadetector.utils.wi_taxonomy_utils import blank_prediction_string  # noqa


#%% Options classes

class ClassificationSmoothingOptions:
    """
    Options used to parameterize smooth_classification_results_image_level()
    and smooth_classification_results_sequence_level()
    """

    def __init__(self):

        #: How many detections do we need in a dominant category to overwrite
        #: non-dominant classifications? This is irrelevant if
        #: max_detections_nondominant_class <= 1.
        self.min_detections_to_overwrite_secondary = 4

        #: Even if we have a dominant class, if a non-dominant class has at least
        #: this many classifications in an image, leave them alone.
        #:
        #: If this is <= 1, we won't replace non-dominant, non-other classes
        #: with the dominant class, even if there are 900 cows and 1 deer.
        self.max_detections_nondominant_class = 1

        #: How many detections do we need in a dominant category to overwrite
        #: non-dominant classifications in the same family? If this is <= 0,
        #: we'll skip this step. This option doesn't mean anything if
        #: max_detections_nondominant_class_same_family <= 1.
        self.min_detections_to_overwrite_secondary_same_family = 2

        #: If we have this many classifications of a nondominant category,
        #: we won't do same-family overwrites. <= 1 means "even if there are
        #: a million deer, if there are two million moose, call all the deer
        #: moose". This option doesn't mean anything if
        #: min_detections_to_overwrite_secondary_same_family <= 0.
        self.max_detections_nondominant_class_same_family = -1

        #: If the dominant class has at least this many classifications, overwrite
        #: "other" classifications with the dominant class
        self.min_detections_to_overwrite_other = 2

        #: Names to treat as "other" categories; can't be None, but can be empty
        #:
        #: "Other" classifications will be changed to the dominant category, regardless
        #: of confidence, as long as there are at least min_detections_to_overwrite_other
        #: examples of the dominant class. For example, cow/other will remain unchanged,
        #: but cow/cow/other will become cow/cow/cow.
        self.other_category_names = ['other','unknown','no cv result','animal','blank','mammal']

        #: We're not even going to mess around with classifications below this threshold.
        #:
        #: We won't count them, we won't over-write them, they don't exist during the
        #: within-image smoothing step.
        self.classification_confidence_threshold = 0.5

        #: We're not even going to mess around with detections below this threshold.
        #:
        #: We won't count them, we won't over-write them, they don't exist during the
        #: within-image smoothing step.
        self.detection_confidence_threshold = 0.15

        #: If classification descriptions are present and appear to represent taxonomic
        #: information, should we propagate classifications when lower-level taxa are more
        #: common in an image? For example, if we see "carnivore/fox/fox/deer", should
        #: we make that "fox/fox/fox/deer"?
        self.propagate_classifications_through_taxonomy = True

        #: When propagating classifications down through taxonomy levels, we have to
        #: decide whether we prefer more frequent categories or more specific categories.
        #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
        #: balance levels against counts in this process.
        self.taxonomy_propagation_level_weight = 1.0

        #: When propagating classifications down through taxonomy levels, we have to
        #: decide whether we prefer more frequent categories or more specific categories.
        #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
        #: balance levels against counts in this process.
        #:
        #: With a very low default value, this just breaks ties.
        self.taxonomy_propagation_count_weight = 0.01

        #: Should we record information about the state of labels prior to smoothing?
        self.add_pre_smoothing_description = True

        #: When a dict (rather than a file) is passed to either smoothing function,
        #: if this is False, we'll make a copy of the input dict before modifying.
        self.modify_in_place = False

        #: Only include these categories in the smoothing process (None to use all categories)
        self.detection_category_names_to_smooth = ['animal']

        #: Debug options
        self.break_at_image = None

        ## Populated internally

        #: Only include these categories in the smoothing process (None to use all categories)
        self._detection_category_ids_to_smooth = None

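For orientation, here is a minimal usage sketch for these options. The file paths are hypothetical, and the function it calls, smooth_classification_results_image_level(), is defined later in this file:

```python
from megadetector.postprocessing.classification_postprocessing import (
    ClassificationSmoothingOptions, smooth_classification_results_image_level)

options = ClassificationSmoothingOptions()

# Require five detections in the dominant class before overwriting singletons
options.min_detections_to_overwrite_secondary = 5

# Hypothetical input/output paths
smoothed_results = smooth_classification_results_image_level(
    input_file='md-results-with-classifications.json',
    output_file='md-results-smoothed.json',
    options=options)
```
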
#%% Utility functions

def _results_for_sequence(images_this_sequence,filename_to_results):
    """
    Fetch MD results for every image in this sequence, based on the 'file_name' field
    """

    results_this_sequence = []
    for im in images_this_sequence:
        fn = im['file_name']
        results_this_image = filename_to_results[fn]
        assert isinstance(results_this_image,dict)
        results_this_sequence.append(results_this_image)

    return results_this_sequence


def _sort_images_by_time(images):
    """
    Returns a copy of [images], sorted by the 'datetime' field (ascending).
    """
    return sorted(images, key = lambda im: im['datetime'])


def _detection_is_relevant_for_smoothing(det,options):
    """
    Determine whether [det] has classifications that might be meaningful for smoothing.
    """

    if ('classifications' not in det) or \
       (det['conf'] < options.detection_confidence_threshold):
        return False

    # Ignore non-smoothed categories
    if (options._detection_category_ids_to_smooth is not None) and \
       (det['category'] not in options._detection_category_ids_to_smooth):
        return False

    return True

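For reference, a minimal sketch of the detection records these helpers operate on (standard MD output format; the IDs and coordinates below are made up). Each detection carries a detection 'category' ID, a 'conf' value, and an optional confidence-sorted 'classifications' list of [category_id, confidence] pairs:

```python
example_detection = {
    'category': '1',              # detection category ID, e.g. '1' == 'animal'
    'conf': 0.92,                 # detection confidence
    'bbox': [0.1, 0.2, 0.3, 0.4],
    # Confidence-sorted [classification_category_id, confidence] pairs
    'classifications': [['17', 0.85], ['22', 0.10]]
}
```
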
def count_detections_by_classification_category(detections,options=None):
    """
    Count the number of instances of each classification category in the detections list
    [detections] that have an above-threshold detection. Sort results in descending
    order by count. Returns a dict mapping category ID --> count. If no detections
    are above threshold, returns an empty dict.

    Only processes the top classification for each detection.

    Args:
        detections (list of dict): detections list
        options (ClassificationSmoothingOptions, optional): see ClassificationSmoothingOptions

    Returns:
        dict mapping above-threshold category IDs to counts
    """

    if detections is None or len(detections) == 0:
        return {}

    if options is None:
        options = ClassificationSmoothingOptions()

    category_to_count = defaultdict(int)

    for det in detections:

        if not _detection_is_relevant_for_smoothing(det,options):
            continue

        c = det['classifications'][0]
        if c[1] >= options.classification_confidence_threshold:
            category_to_count[c[0]] += 1

    category_to_count = {k: v for k, v in sorted(category_to_count.items(),
                                                 key=lambda item: item[1],
                                                 reverse=True)}

    return category_to_count

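A quick worked example of the counting behavior above, with made-up category IDs and the default thresholds (detections below 0.15 and classifications below 0.5 are ignored):

```python
detections = [
    {'category': '1', 'conf': 0.9, 'classifications': [['17', 0.95]]},  # counted
    {'category': '1', 'conf': 0.8, 'classifications': [['17', 0.70]]},  # counted
    {'category': '1', 'conf': 0.9, 'classifications': [['22', 0.40]]},  # below classification threshold
    {'category': '1', 'conf': 0.1, 'classifications': [['22', 0.99]]},  # below detection threshold
]
print(count_detections_by_classification_category(detections))
# {'17': 2}
```
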
def get_classification_description_string(category_to_count,classification_descriptions):
    """
    Return a string summarizing the image content according to [category_to_count].

    Args:
        category_to_count (dict): a dict mapping category IDs to counts
        classification_descriptions (dict): a dict mapping category IDs to description strings

    Returns:
        string: a description of this image's content, e.g. "rabbit (4), human (1)"
    """

    category_strings = []
    # category_id = next(iter(category_to_count))
    for category_id in category_to_count:
        category_description = classification_descriptions[category_id]
        tokens = category_description.split(';')
        assert len(tokens) == 7
        category_name = tokens[-1]
        if len(category_name) == 0:
            category_name = 'undefined category'
        count = category_to_count[category_id]
        category_string = '{} ({})'.format(category_name,count)
        category_strings.append(category_string)

    return ', '.join(category_strings)

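The seven-token description format assumed by the assert above matches SpeciesNet-style prediction strings (GUID;class;order;family;genus;species;common name). A small illustration with a made-up GUID placeholder:

```python
classification_descriptions = {
    '17': '0000-guid;mammalia;cetartiodactyla;cervidae;odocoileus;virginianus;white-tailed deer',
    '3': '0000-guid;mammalia;lagomorpha;leporidae;;;rabbit'
}
print(get_classification_description_string({'3': 4, '17': 1}, classification_descriptions))
# 'rabbit (4), white-tailed deer (1)'
```
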
def _print_counts_with_names(category_to_count,classification_descriptions):
    """
    Print a list of classification categories with counts, based on the
    category ID --> count dict [category_to_count]
    """

    for category_id in category_to_count:
        category_name = classification_descriptions[category_id]
        count = category_to_count[category_id]
        print('{}: {} ({})'.format(category_id,category_name,count))

def _prepare_results_for_smoothing(input_file,options):
    """
    Load results from [input_file] if necessary, prepare category descriptions
    for smoothing. Adds pre-smoothing descriptions to every image if the options
    say we're supposed to do that.

    May modify some fields in [options].
    """

    if isinstance(input_file,str):
        with open(input_file,'r') as f:
            print('Loading results from:\n{}'.format(input_file))
            d = json.load(f)
    else:
        assert isinstance(input_file,dict)
        if options.modify_in_place:
            d = input_file
        else:
            print('modify_in_place is False, copying the input before modifying')
            d = copy.deepcopy(input_file)


    ## Category processing

    category_name_to_id = {d['classification_categories'][k]:k for k in d['classification_categories']}
    other_category_ids = []
    for s in options.other_category_names:
        if s in category_name_to_id:
            other_category_ids.append(category_name_to_id[s])

    # Possibly update the list of category IDs we should smooth
    if options.detection_category_names_to_smooth is None:
        options._detection_category_ids_to_smooth = None
    else:
        detection_category_id_to_name = d['detection_categories']
        detection_category_name_to_id = invert_dictionary(detection_category_id_to_name)
        options._detection_category_ids_to_smooth = []
        for category_name in options.detection_category_names_to_smooth:
            options._detection_category_ids_to_smooth.append(detection_category_name_to_id[category_name])

    # Before we do anything else, get rid of everything but the top classification
    # for each detection, and remove the 'classifications' field from detections with
    # no classifications.
    for im in tqdm(d['images']):

        if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
            continue

        detections = im['detections']

        for det in detections:

            if 'classifications' not in det:
                continue
            if len(det['classifications']) == 0:
                del det['classifications']
                continue

            classification_confidence_values = [c[1] for c in det['classifications']]
            assert is_list_sorted(classification_confidence_values,reverse=True)
            det['classifications'] = [det['classifications'][0]]

        # ...for each detection in this image

    # ...for each image


    ## Clean up classification descriptions...

    # ...so we can test taxonomic relationships by substring testing.

    classification_descriptions_clean = None
    classification_descriptions = None

    if 'classification_category_descriptions' in d:
        classification_descriptions = d['classification_category_descriptions']
        classification_descriptions_single = {}
        # We use "|" to delimit multiple descriptions; just use the first
        # for smoothing. This isn't perfect or "correct", but it's reasonable.
        for k in classification_descriptions.keys():
            v = classification_descriptions[k]
            v = v.split('|')[0]
            classification_descriptions_single[k] = v
        classification_descriptions = classification_descriptions_single
        classification_descriptions_clean = {}
        # category_id = next(iter(classification_descriptions))
        for category_id in classification_descriptions:
            classification_descriptions_clean[category_id] = \
                clean_taxonomy_string(classification_descriptions[category_id]).strip(';').lower()


    ## Optionally add pre-smoothing descriptions to every image

    if options.add_pre_smoothing_description and (classification_descriptions is not None):

        for im in tqdm(d['images']):

            if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
                continue

            detections = im['detections']
            category_to_count = count_detections_by_classification_category(detections, options)

            im['pre_smoothing_description'] = \
                get_classification_description_string(category_to_count, classification_descriptions)


    return {
        'd':d,
        'other_category_ids':other_category_ids,
        'classification_descriptions_clean':classification_descriptions_clean,
        'classification_descriptions':classification_descriptions
    }

# ...def _prepare_results_for_smoothing(...)

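To illustrate the "clean" descriptions this function produces (a sketch; exact output depends on clean_taxonomy_string()): stripping the GUID and common name leaves a semicolon-delimited taxonomy string, so parent/child relationships can later be tested by substring containment.

```python
# Hypothetical raw description, as stored in 'classification_category_descriptions'
raw = '0000-guid;mammalia;carnivora;canidae;vulpes;vulpes;red fox'

# After cleaning (GUID/common name removed, trailing ';' stripped, lowercased),
# something like:
clean = 'mammalia;carnivora;canidae;vulpes;vulpes'

# A genus-level prediction is then a substring of its species-level children:
assert 'mammalia;carnivora;canidae;vulpes' in clean
```
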
def _smooth_classifications_for_list_of_detections(detections,
                                                   options,
                                                   other_category_ids,
                                                   classification_descriptions,
                                                   classification_descriptions_clean):
    """
    Smooth classifications for a list of detections, which may have come from a single
    image, or may represent an entire sequence.

    Returns None if no changes are made, else a dict.

    classification_descriptions_clean should be semicolon-delimited taxonomic strings
    from which common names and GUIDs have already been removed.

    Assumes there is only one classification per detection, i.e. that non-top classifications
    have already been removed.
    """

    ## Count the number of instances of each category in this image

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    # get_classification_description_string(category_to_count, classification_descriptions)

    if len(category_to_count) <= 1:
        return None

    keys = list(category_to_count.keys())

    # Handle a quirky special case: if the most common category is "other" and
    # it's "tied" with the second-most-common category, swap them
    if (len(keys) > 1) and \
       (keys[0] in other_category_ids) and \
       (keys[1] not in other_category_ids) and \
       (category_to_count[keys[0]] == category_to_count[keys[1]]):
        keys[1], keys[0] = keys[0], keys[1]

    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Debug tools

    verbose_debug_enabled = False

    if options.break_at_image is not None:
        for det in detections:
            if 'image_filename' in det and \
               det['image_filename'] == options.break_at_image:
                verbose_debug_enabled = True
                break

    if verbose_debug_enabled:
        _print_counts_with_names(category_to_count,classification_descriptions)
        # from IPython import embed; embed()


    ## Possibly change "other" classifications to the most common category

    # ...if the dominant category is not an "other" category.

    n_other_classifications_changed_this_image = 0

    # If we have at least *min_detections_to_overwrite_other* in a category that isn't
    # "other", change all "other" classifications to that category
    if (max_count >= options.min_detections_to_overwrite_other) and \
       (most_common_category not in other_category_ids):

        for det in detections:

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            if (c[1] >= options.classification_confidence_threshold) and \
               (c[0] in other_category_ids):

                if verbose_debug_enabled:
                    print('Replacing {} with {}'.format(
                        classification_descriptions[c[0]],
                        most_common_category))

                n_other_classifications_changed_this_image += 1
                c[0] = most_common_category

            # ...if there are classifications for this detection

        # ...for each detection

    # ...if we should overwrite all "other" classifications

    if verbose_debug_enabled:
        print('Made {} other changes'.format(n_other_classifications_changed_this_image))


    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly change some non-dominant classifications to the dominant category

    process_taxonomic_rules = \
        (classification_descriptions_clean is not None) and \
        (len(classification_descriptions_clean) > 0) and \
        (len(category_to_count) > 1)

    n_detections_flipped_this_image = 0

    # Don't do this if the most common category is an "other" category, or
    # if we don't have enough of the most common category
    if (most_common_category not in other_category_ids) and \
       (max_count >= options.min_detections_to_overwrite_secondary):

        # i_det = 0; det = detections[i_det]
        for i_det,det in enumerate(detections):

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            # Don't over-write the most common category with itself
            if c[0] == most_common_category:
                continue

            # Don't bother with below-threshold classifications
            if c[1] < options.classification_confidence_threshold:
                continue

            # If we're doing taxonomic processing, at this stage, don't turn children
            # into parents; we'll likely turn parents into children in the next stage.
            if process_taxonomic_rules:

                most_common_category_description = \
                    classification_descriptions_clean[most_common_category]

                category_id_this_classification = c[0]
                assert category_id_this_classification in category_to_count

                category_description_this_classification = \
                    classification_descriptions_clean[category_id_this_classification]

                # An empty description corresponds to the "animal" category. We don't handle
                # "animal" here as a parent category, that would be handled in the "other smoothing"
                # step above.
                if len(category_description_this_classification) == 0:
                    continue

                most_common_category_is_parent_of_this_category = \
                    most_common_category_description in category_description_this_classification

                if most_common_category_is_parent_of_this_category:
                    continue

            # If we have fewer of this category than the most common category,
            # but not *too* many, flip it to the most common category.
            if (max_count > category_to_count[c[0]]) and \
               (category_to_count[c[0]] <= options.max_detections_nondominant_class):

                c[0] = most_common_category
                n_detections_flipped_this_image += 1

        # ...for each detection

    # ...if the dominant category is legit

    if verbose_debug_enabled:
        print('Made {} non-dominant --> dominant changes'.format(
            n_detections_flipped_this_image))


    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly collapse higher-level taxonomic predictions down to lower levels

    n_taxonomic_changes_this_image = 0

    process_taxonomic_rules = \
        (classification_descriptions_clean is not None) and \
        (len(classification_descriptions_clean) > 0) and \
        (len(category_to_count) > 1)

    if process_taxonomic_rules and options.propagate_classifications_through_taxonomy:

        # det = detections[3]
        for det in detections:

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            # Don't bother with any classifications below the confidence threshold
            if c[1] < options.classification_confidence_threshold:
                continue

            category_id_this_classification = c[0]
            assert category_id_this_classification in category_to_count

            category_description_this_classification = \
                classification_descriptions_clean[category_id_this_classification]

            # An empty description corresponds to the "animal" category. We don't handle
            # "animal" here as a parent category, that would be handled in the "other smoothing"
            # step above.
            if len(category_description_this_classification) == 0:
                continue

            # We may have multiple child categories to choose from; this keeps track of
            # the "best" we've seen so far. "Best" is based on the level (species is better
            # than genus) and number.
            child_category_to_score = defaultdict(float)

            for category_id_of_candidate_child in category_to_count.keys():

                # A category is never its own child
                if category_id_of_candidate_child == category_id_this_classification:
                    continue

                # Is this candidate a child of the current classification?
                category_description_candidate_child = \
                    classification_descriptions_clean[category_id_of_candidate_child]

                # An empty description corresponds to "animal", which can never
                # be a child of another category.
                if len(category_description_candidate_child) == 0:
                    continue

                # This handles a case that doesn't come up with "pure" SpeciesNet results;
                # if two categories have different IDs but the same "clean" description, this
                # means they're different common names for the same species, which we use
                # for things like "white-tailed deer buck" and "white-tailed deer fawn".
                #
                # Currently we don't support smoothing those predictions, because it's not a
                # parent/child relationship.
                if category_description_candidate_child == \
                   category_description_this_classification:
                    continue

                # As long as we're using "clean" descriptions, parent/child taxonomic
                # relationships are defined by a substring relationship
                is_child = category_description_this_classification in \
                    category_description_candidate_child

                if not is_child:
                    continue

                # How many instances of this child category are there?
                child_category_count = category_to_count[category_id_of_candidate_child]

                # What taxonomy level is this child category defined at?
                child_category_level = taxonomy_level_index(
                    classification_descriptions[category_id_of_candidate_child])

                child_category_to_score[category_id_of_candidate_child] = \
                    child_category_level * options.taxonomy_propagation_level_weight + \
                    child_category_count * options.taxonomy_propagation_count_weight

            # ...for each category we are considering reducing this classification to

            # Did we find a category we want to change this classification to?
            if len(child_category_to_score) > 0:

                # Find the child category with the highest score
                child_category_to_score = sort_dictionary_by_value(
                    child_category_to_score,reverse=True)
                best_child_category = next(iter(child_category_to_score.keys()))

                if verbose_debug_enabled:
                    old_category_name = \
                        classification_descriptions_clean[c[0]]
                    new_category_name = \
                        classification_descriptions_clean[best_child_category]
                    print('Replacing {} with {}'.format(
                        old_category_name,new_category_name))

                c[0] = best_child_category
                n_taxonomic_changes_this_image += 1

        # ...for each detection

    # ...if we have taxonomic information available


    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly do within-family smoothing

    n_within_family_smoothing_changes = 0

    # min_detections_to_overwrite_secondary_same_family = -1
    # max_detections_nondominant_class_same_family = 1
    family_level = taxonomy_level_string_to_index('family')

    if process_taxonomic_rules:

        category_description_most_common_category = \
            classification_descriptions[most_common_category]
        most_common_category_taxonomic_level = \
            taxonomy_level_index(category_description_most_common_category)
        n_most_common_category = category_to_count[most_common_category]
        tokens = category_description_most_common_category.split(';')
        assert len(tokens) == 7
        most_common_category_family = tokens[3]
        most_common_category_genus = tokens[4]

        # Only consider remapping to genus or species level, and only when we have
        # a high enough count in the most common category
        if process_taxonomic_rules and \
           (options.min_detections_to_overwrite_secondary_same_family > 0) and \
           (most_common_category not in other_category_ids) and \
           (most_common_category_taxonomic_level > family_level) and \
           (n_most_common_category >= options.min_detections_to_overwrite_secondary_same_family):

            # det = detections[0]
            for det in detections:

                if not _detection_is_relevant_for_smoothing(det,options):
                    continue

                assert len(det['classifications']) == 1
                c = det['classifications'][0]

                # Don't over-write the most common category with itself
                if c[0] == most_common_category:
                    continue

                # Don't bother with below-threshold classifications
                if c[1] < options.classification_confidence_threshold:
                    continue

                n_candidate_flip_category = category_to_count[c[0]]

                # Do we have too many of the non-dominant category to do this kind of swap?
                if n_candidate_flip_category > \
                   options.max_detections_nondominant_class_same_family:
                    continue

                # Don't flip classes when it's a tie
                if n_candidate_flip_category == n_most_common_category:
                    continue

                category_description_candidate_flip = \
                    classification_descriptions[c[0]]
                tokens = category_description_candidate_flip.split(';')
                assert len(tokens) == 7
                candidate_flip_category_family = tokens[3]
                candidate_flip_category_genus = tokens[4]
                candidate_flip_category_taxonomic_level = \
                    taxonomy_level_index(category_description_candidate_flip)

                # Only proceed if we have valid family strings
                if (len(candidate_flip_category_family) == 0) or \
                   (len(most_common_category_family) == 0):
                    continue

                # Only proceed if the candidate and the most common category are in the same family
                if candidate_flip_category_family != most_common_category_family:
                    continue

                # Don't flip from a species to the genus level in the same genus
                if (candidate_flip_category_genus == most_common_category_genus) and \
                   (candidate_flip_category_taxonomic_level > \
                    most_common_category_taxonomic_level):
                    continue

                old_category_name = classification_descriptions_clean[c[0]]
                new_category_name = classification_descriptions_clean[most_common_category]

                c[0] = most_common_category
                n_within_family_smoothing_changes += 1

            # ...for each detection

    # ...if the dominant category is legit and we have taxonomic information available

    return {'n_other_classifications_changed_this_image':n_other_classifications_changed_this_image,
            'n_detections_flipped_this_image':n_detections_flipped_this_image,
            'n_taxonomic_changes_this_image':n_taxonomic_changes_this_image,
            'n_within_family_smoothing_changes':n_within_family_smoothing_changes}

# ...def _smooth_classifications_for_list_of_detections(...)

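A small numeric sketch of the child-scoring rule above, using the default weights (level weight 1.0, count weight 0.01) and placeholder level indices (the actual values come from taxonomy_level_index()): a more specific child wins regardless of count, and count essentially just breaks ties.

```python
level_weight = 1.0
count_weight = 0.01

# Hypothetical: a genus-level child seen 5 times vs. a species-level child seen once
genus_level, species_level = 5, 6   # placeholder indices (species > genus)
genus_score = genus_level * level_weight + 5 * count_weight      # 5.05
species_score = species_level * level_weight + 1 * count_weight  # 6.01

assert species_score > genus_score  # the more specific category wins
```
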
def _smooth_single_image(im,
                         options,
                         other_category_ids,
                         classification_descriptions,
                         classification_descriptions_clean):
    """
    Smooth classifications for a single image. Returns None if no changes are made,
    else a dict.

    classification_descriptions_clean should be semicolon-delimited taxonomic strings
    from which common names and GUIDs have already been removed.

    Assumes there is only one classification per detection, i.e. that non-top classifications
    have already been removed.
    """

    if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
        return

    detections = im['detections']

    # Simplify debugging
    for det in detections:
        det['image_filename'] = im['file']

    to_return = _smooth_classifications_for_list_of_detections(
        detections,
        options=options,
        other_category_ids=other_category_ids,
        classification_descriptions=classification_descriptions,
        classification_descriptions_clean=classification_descriptions_clean)

    # Clean out debug information
    for det in detections:
        del det['image_filename']

    return to_return

# ...def smooth_single_image

#%% Image-level smoothing

def smooth_classification_results_image_level(input_file,output_file=None,options=None):
    """
    Smooth classifications at the image level for all results in the MD-formatted results
    file [input_file], optionally writing a new set of results to [output_file].

    This function generally expresses the notion that an image with 700 cows and one deer
    is really just 701 cows.

    Only count detections with a classification confidence above
    [options.classification_confidence_threshold], which in practice means we're only
    looking at one category per detection.

    If an image has at least [options.min_detections_to_overwrite_secondary] such detections
    in the most common category, and no more than [options.max_detections_nondominant_class]
    in the second-most-common category, flip all detections to the most common
    category.

    Optionally treat some classes as particularly unreliable, typically used to overwrite an
    "other" class.

    This function also removes everything but the top classification for each detection.

    Args:
        input_file (str): MegaDetector-formatted classification results file to smooth. Can
            also be an already-loaded results dict.
        output_file (str, optional): .json file to write smoothed results
        options (ClassificationSmoothingOptions, optional): see
            ClassificationSmoothingOptions for details.

    Returns:
        dict: MegaDetector-results-formatted dict, identical to what's written to
        [output_file] if [output_file] is not None.
    """

    ## Input validation

    if options is None:
        options = ClassificationSmoothingOptions()

    r = _prepare_results_for_smoothing(input_file, options)
    d = r['d']
    other_category_ids = r['other_category_ids']
    classification_descriptions_clean = r['classification_descriptions_clean']
    classification_descriptions = r['classification_descriptions']


    ## Smoothing

    n_other_classifications_changed = 0
    n_other_images_changed = 0
    n_taxonomic_images_changed = 0

    n_detections_flipped = 0
    n_images_changed = 0
    n_taxonomic_classification_changes = 0

    # im = d['images'][0]
    for im in tqdm(d['images']):

        r = _smooth_single_image(im,
                                 options,
                                 other_category_ids,
                                 classification_descriptions=classification_descriptions,
                                 classification_descriptions_clean=classification_descriptions_clean)

        if r is None:
            continue

        n_detections_flipped_this_image = r['n_detections_flipped_this_image']
        n_other_classifications_changed_this_image = \
            r['n_other_classifications_changed_this_image']
        n_taxonomic_changes_this_image = r['n_taxonomic_changes_this_image']

        n_detections_flipped += n_detections_flipped_this_image
        n_other_classifications_changed += n_other_classifications_changed_this_image
        n_taxonomic_classification_changes += n_taxonomic_changes_this_image

        if n_detections_flipped_this_image > 0:
            n_images_changed += 1
        if n_other_classifications_changed_this_image > 0:
            n_other_images_changed += 1
        if n_taxonomic_changes_this_image > 0:
            n_taxonomic_images_changed += 1

    # ...for each image

    print('Classification smoothing: changed {} detections on {} images'.format(
        n_detections_flipped,n_images_changed))

    print('"Other" smoothing: changed {} detections on {} images'.format(
        n_other_classifications_changed,n_other_images_changed))

    print('Taxonomic smoothing: changed {} detections on {} images'.format(
        n_taxonomic_classification_changes,n_taxonomic_images_changed))


    ## Write output

    if output_file is not None:
        print('Writing results after image-level smoothing to:\n{}'.format(output_file))
        write_json(output_file,d)

    return d

# ...def smooth_classification_results_image_level(...)

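Since input_file can also be an already-loaded dict, a sketch of in-place use (avoids the default deep copy; the path below is hypothetical):

```python
with open('md-results-with-classifications.json','r') as f:  # hypothetical path
    results = json.load(f)

opts = ClassificationSmoothingOptions()
opts.modify_in_place = True

# 'results' itself is modified and returned
smooth_classification_results_image_level(results, output_file=None, options=opts)
```
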
#%% Sequence-level smoothing
|
|
939
|
+
|
|
940
|
+
def smooth_classification_results_sequence_level(input_file,
|
|
941
|
+
cct_sequence_information,
|
|
942
|
+
output_file=None,
|
|
943
|
+
options=None):
|
|
944
|
+
"""
|
|
945
|
+
Smooth classifications at the sequence level for all results in the MD-formatted results
|
|
946
|
+
file [md_results_file], optionally writing a new set of results to [output_file].
|
|
947
|
+
|
|
948
|
+
This function generally expresses the notion that a sequence that looks like
|
|
949
|
+
deer/deer/deer/elk/deer/deer/deer/deer is really just a deer.
|
|
950
|
+
|
|
951
|
+
Args:
|
|
952
|
+
input_file (str or dict): MegaDetector-formatted classification results file to smooth
|
|
953
|
+
(or already-loaded results). If you supply a dict, it's copied by default, but
|
|
954
|
+
in-place modification is supported via options.modify_in_place.
|
|
955
|
+
cct_sequence_information (str, dict, or list): COCO Camera Traps file containing sequence IDs for
|
|
956
|
+
each image (or an already-loaded CCT-formatted dict, or just the 'images' list from a CCT dict).
|
|
957
|
+
output_file (str, optional): .json file to write smoothed results
|
|
958
|
+
options (ClassificationSmoothingOptions, optional): see
|
|
959
|
+
ClassificationSmoothingOptions for details.
|
|
960
|
+
|
|
961
|
+
Returns:
|
|
962
|
+
dict: MegaDetector-results-formatted dict, identical to what's written to
|
|
963
|
+
[output_file] if [output_file] is not None.
|
|
964
|
+
"""
|
|
965
|
+
|
|
966
|
+
## Input validation
|
|
967
|
+
|
|
968
|
+
if options is None:
|
|
969
|
+
options = ClassificationSmoothingOptions()
|
|
970
|
+
|
|
971
|
+
r = _prepare_results_for_smoothing(input_file, options)
|
|
972
|
+
d = r['d']
|
|
973
|
+
other_category_ids = r['other_category_ids']
|
|
974
|
+
classification_descriptions_clean = r['classification_descriptions_clean']
|
|
975
|
+
classification_descriptions = r['classification_descriptions']
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
## Make a list of images appearing in each sequence
|
|
979
|
+
|
    if isinstance(cct_sequence_information,list):
        image_info = cct_sequence_information
    elif isinstance(cct_sequence_information,str):
        print('Loading sequence information from {}'.format(cct_sequence_information))
        with open(cct_sequence_information,'r') as f:
            cct_sequence_information = json.load(f)
        image_info = cct_sequence_information['images']
    else:
        assert isinstance(cct_sequence_information,dict)
        image_info = cct_sequence_information['images']

    sequence_to_image_filenames = defaultdict(list)

    # im = image_info[0]
    for im in tqdm(image_info):
        sequence_to_image_filenames[im['seq_id']].append(im['file_name'])
    del image_info

    image_fn_to_classification_results = {}
    for im in d['images']:
        fn = im['file']
        assert fn not in image_fn_to_classification_results
        image_fn_to_classification_results[fn] = im


    ## Smoothing

    n_other_classifications_changed = 0
    n_other_sequences_changed = 0
    n_taxonomic_sequences_changed = 0
    n_within_family_sequences_changed = 0

    n_detections_flipped = 0
    n_sequences_changed = 0
    n_taxonomic_classification_changes = 0
    n_within_family_changes = 0

    # sequence_id = list(sequence_to_image_filenames.keys())[0]
    for sequence_id in sequence_to_image_filenames.keys():

        image_filenames_this_sequence = sequence_to_image_filenames[sequence_id]

        # if 'file' in image_filenames_this_sequence:
        #     from IPython import embed; embed()

        detections_this_sequence = []
        for image_filename in image_filenames_this_sequence:
            if image_filename not in image_fn_to_classification_results:
                print('Warning: {} in sequence list but not in results'.format(
                    image_filename))
                continue
            im = image_fn_to_classification_results[image_filename]
            if 'detections' not in im or im['detections'] is None:
                continue
            detections_this_sequence.extend(im['detections'])

            # Temporarily add image filenames to every detection,
            # for debugging
            for det in im['detections']:
                det['image_filename'] = im['file']

        if len(detections_this_sequence) == 0:
            continue

        r = _smooth_classifications_for_list_of_detections(
            detections=detections_this_sequence,
            options=options,
            other_category_ids=other_category_ids,
            classification_descriptions=classification_descriptions,
            classification_descriptions_clean=classification_descriptions_clean)

        if r is None:
            continue

        n_detections_flipped_this_sequence = r['n_detections_flipped_this_image']
        n_other_classifications_changed_this_sequence = \
            r['n_other_classifications_changed_this_image']
        n_taxonomic_changes_this_sequence = r['n_taxonomic_changes_this_image']
        n_within_family_changes_this_sequence = r['n_within_family_smoothing_changes']

        n_detections_flipped += n_detections_flipped_this_sequence
        n_other_classifications_changed += n_other_classifications_changed_this_sequence
        n_taxonomic_classification_changes += n_taxonomic_changes_this_sequence
        n_within_family_changes += n_within_family_changes_this_sequence

        if n_detections_flipped_this_sequence > 0:
            n_sequences_changed += 1
        if n_other_classifications_changed_this_sequence > 0:
            n_other_sequences_changed += 1
        if n_taxonomic_changes_this_sequence > 0:
            n_taxonomic_sequences_changed += 1
        if n_within_family_changes_this_sequence > 0:
            n_within_family_sequences_changed += 1

    # ...for each sequence

    print('Classification smoothing: changed {} detections in {} sequences'.format(
        n_detections_flipped,n_sequences_changed))

    print('"Other" smoothing: changed {} detections in {} sequences'.format(
        n_other_classifications_changed,n_other_sequences_changed))

    print('Taxonomic smoothing: changed {} detections in {} sequences'.format(
        n_taxonomic_classification_changes,n_taxonomic_sequences_changed))

    print('Within-family smoothing: changed {} detections in {} sequences'.format(
        n_within_family_changes,n_within_family_sequences_changed))


    ## Clean up debug information

    for im in d['images']:
        if 'detections' not in im or im['detections'] is None:
            continue
        for det in im['detections']:
            if 'image_filename' in det:
                del det['image_filename']


    ## Write output

    if output_file is not None:
        print('Writing sequence-smoothed classification results to {}'.format(
            output_file))
        write_json(output_file,d)

    return d

# ...smooth_classification_results_sequence_level(...)

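# A minimal usage sketch for the sequence-level smoother. The exact signature is
# assumed here (only the tail of the function appears in this diff), and all
# filenames are hypothetical:
#
#     smoothed_results = smooth_classification_results_sequence_level(
#         input_file='classifier-results.json',
#         cct_sequence_information='cct-database-with-sequences.json',
#         output_file='classifier-results-seq-smoothed.json')
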
def remove_classifications_from_non_animal_detections(input_file,
                                                      output_file,
                                                      animal_category_names=None):
    """
    Remove classifications from non-animal detections in a MD .json file,
    optionally writing the results to a new .json file.

    Args:
        input_file (str): the MD-formatted .json file to process
        output_file (str, optional): the output file to write the modified results
        animal_category_names (list, optional): the detection category names that should
            be treated as animals (defaults to just 'animal')

    Returns:
        dict: the modified results
    """

    if animal_category_names is None:
        animal_category_names = ['animal']
    animal_category_names = set(animal_category_names)

    with open(input_file,'r') as f:
        d = json.load(f)

    category_id_to_name = d['detection_categories']

    n_classifications_removed = 0
    n_detections = 0

    # im = d['images'][0]
    for im in d['images']:

        if ('detections' not in im) or (im['detections'] is None):
            continue

        n_detections += len(im['detections'])

        for det in im['detections']:

            if 'classifications' not in det:
                continue
            category_id = det['category']
            category_name = category_id_to_name[category_id]
            if category_name not in animal_category_names:
                del det['classifications']
                n_classifications_removed += 1
                continue

        # ...for each detection

    # ...for each image

    print('Removed classifications from {} of {} detections'.format(
        n_classifications_removed,n_detections))

    if output_file is not None:
        write_json(output_file,d)

    return d

# ...def remove_classifications_from_non_animal_detections(...)

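# Usage sketch for this function (filenames are hypothetical):
#
#     results = remove_classifications_from_non_animal_detections(
#         input_file='md-results.json',
#         output_file='md-results-animal-classifications-only.json',
#         animal_category_names=['animal'])
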
def restrict_to_taxa_list(taxa_list,
                          speciesnet_taxonomy_file,
                          input_file,
                          output_file,
                          allow_walk_down=False,
                          add_pre_filtering_description=True,
                          add_post_filtering_description=True,
                          allow_redundant_latin_names=True,
                          protected_common_names=None,
                          use_original_common_names_if_available=True,
                          verbose=True,
                          classification_threshold=None,
                          combine_redundant_categories=True):
    """
    Given a prediction file in MD .json format (typically one produced without
    geofencing), restrict the classification categories to a custom taxa list.

    Args:
        taxa_list (str): .csv file with at least the columns "latin" and "common"
        speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for
            model release (with 7-token taxonomy entries)
        input_file (str): .json file to read, in MD format. This can be None, in which
            case this function just validates [taxa_list].
        output_file (str): .json file to write, in MD format
        allow_walk_down (bool, optional): should we walk down the taxonomy tree
            when making mappings if a parent has only a single allowable child?
            For example, if only a single felid species is allowed, should other
            felid predictions be mapped to that species, as opposed to being mapped
            to the family?
        add_pre_filtering_description (bool, optional): should we add a new metadata
            field that summarizes each image's classifications prior to taxonomic
            restriction?
        add_post_filtering_description (bool, optional): should we add a new metadata
            field that summarizes each image's classifications after taxonomic
            restriction?
        allow_redundant_latin_names (bool, optional): if False, we'll raise an Exception
            if the same latin name appears twice in the taxa list; if True, we'll
            just print a warning and ignore all entries other than the first for that
            latin name
        protected_common_names (list, optional): categories that should be left
            unmodified, even if they aren't used, or have the same taxonomic
            description as other categories
        use_original_common_names_if_available (bool, optional): if an "original_common"
            column is present in [taxa_list], use those common names instead of the ones
            in the taxonomy file
        verbose (bool, optional): enable additional debug output
        classification_threshold (float, optional): only relevant to the pre/post-filtering
            descriptions
        combine_redundant_categories (bool, optional): whether to combine categories with the
            same common name
    """
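
    # Illustrative [taxa_list] contents (hypothetical rows; only the "latin" and
    # "common" columns are required, and both are lowercased below):
    #
    #     latin,common
    #     canis lupus,gray wolf
    #     felidae,cat family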

    ##%% Read target taxa list

    taxa_list_df = pd.read_csv(taxa_list)

    required_columns = ('latin','common')
    for s in required_columns:
        assert s in taxa_list_df.columns, \
            'Required column {} missing from taxa list file {}'.format(
                s,taxa_list)

    # Convert the "latin" and "common" columns in taxa_list_df to lowercase
    taxa_list_df['latin'] = taxa_list_df['latin'].str.lower()
    taxa_list_df['common'] = taxa_list_df['common'].str.lower()

    # Remove rows from taxa_list_df where the "latin" column is NaN,
    # printing a warning for each such row (with a string representation of the whole row)
    for i_row,row in taxa_list_df.iterrows():
        if pd.isna(row['latin']):
            if verbose:
                print('Warning: skipping row with empty "latin" column in {}:\n{}\n'.format(
                    taxa_list,str(row.to_dict())))
            taxa_list_df.drop(index=i_row, inplace=True)

    # Convert all NaN values in the "common" column to empty strings
    taxa_list_df['common'] = taxa_list_df['common'].fillna('')

    # Create a dictionary mapping source latin names to target common names
    target_latin_to_common = {}

    for i_row,row in taxa_list_df.iterrows():

        latin = row['latin']
        common = row['common']

        if use_original_common_names_if_available and \
           ('original_common' in row) and \
           (not is_empty(row['original_common'])):
            common = row['original_common'].strip().lower()

        # Valid latin names have either one token (e.g. "canidae"),
        # two tokens (e.g. "bos taurus"), or three tokens (e.g. "canis lupus familiaris")
        assert len(latin.split(' ')) in (1,2,3), \
            'Illegal latin name {} in taxa list {}'.format(
                latin,taxa_list)

        if latin in target_latin_to_common:
            error_string = \
                'scientific name {} appears multiple times in the taxa list'.format(
                    latin)
            if allow_redundant_latin_names:
                if verbose:
                    print('Warning: {}'.format(error_string))
            else:
                raise ValueError(error_string)

        target_latin_to_common[latin] = common

    # ...for each row in the custom taxa list


    ##%% Read taxonomy file

    with open(speciesnet_taxonomy_file,'r') as f:
        speciesnet_taxonomy_list = f.readlines()
    speciesnet_taxonomy_list = [s.strip() for s in \
                                speciesnet_taxonomy_list if len(s.strip()) > 0]

    # Maps the latin name of every taxon to the corresponding full taxon string
    #
    # For species, the key is a binomial name
    speciesnet_latin_name_to_taxon_string = {}
    speciesnet_common_name_to_taxon_string = {}

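    # For reference, each taxonomy entry is a semicolon-delimited 7-token string:
    #
    #     GUID;class;order;family;genus;species;common name
    #
    # ...e.g. a hypothetical "fake_guid;mammalia;carnivora;canidae;canis;lupus;gray wolf".
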
    def _insert_taxonomy_string(s):

        tokens = s.split(';')
        assert len(tokens) == 7, 'Illegal taxonomy string {}'.format(s)

        guid = tokens[0] # noqa
        class_name = tokens[1]
        order = tokens[2]
        family = tokens[3]
        genus = tokens[4]
        species = tokens[5]
        common_name = tokens[6]

        if len(class_name) == 0:
            assert common_name in ('animal','vehicle','blank'), \
                'Illegal common name {}'.format(common_name)
            return

        if len(species) > 0:
            assert all([len(s) > 0 for s in [genus,family,order]]), \
                'Higher-level taxa missing for {}: {},{},{}'.format(s,genus,family,order)
            binomial_name = genus + ' ' + species
            if binomial_name not in speciesnet_latin_name_to_taxon_string:
                speciesnet_latin_name_to_taxon_string[binomial_name] = s
        elif len(genus) > 0:
            assert all([len(s) > 0 for s in [family,order]]), \
                'Higher-level taxa missing for {}: {},{}'.format(s,family,order)
            if genus not in speciesnet_latin_name_to_taxon_string:
                speciesnet_latin_name_to_taxon_string[genus] = s
        elif len(family) > 0:
            assert len(order) > 0, \
                'Higher-level taxa missing for {}: {}'.format(s,order)
            if family not in speciesnet_latin_name_to_taxon_string:
                speciesnet_latin_name_to_taxon_string[family] = s
        elif len(order) > 0:
            if order not in speciesnet_latin_name_to_taxon_string:
                speciesnet_latin_name_to_taxon_string[order] = s
        else:
            if class_name not in speciesnet_latin_name_to_taxon_string:
                speciesnet_latin_name_to_taxon_string[class_name] = s

        if len(common_name) > 0:
            if common_name not in speciesnet_common_name_to_taxon_string:
                speciesnet_common_name_to_taxon_string[common_name] = s

    for s in speciesnet_taxonomy_list:

        _insert_taxonomy_string(s)


    ##%% Make sure all parent taxa are represented in the taxonomy

    # In theory any taxon that appears as the parent of another taxon should
    # also be in the taxonomy, but this isn't always true, so we fix it here.
    new_taxon_string_to_missing_tokens = defaultdict(list)

    # While we're in this loop, also see whether we need to store any custom
    # common name mappings based on the taxa list.
    speciesnet_latin_name_to_output_common_name = {}

    # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
    for latin_name in speciesnet_latin_name_to_taxon_string.keys():

        if latin_name in target_latin_to_common:
            speciesnet_latin_name_to_output_common_name[latin_name] = \
                target_latin_to_common[latin_name]

        if 'no cv result' in latin_name:
            continue

        taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
        tokens = taxon_string.split(';')

        # Don't process GUID, species, or common name
        # i_token = 6
        for i_token in range(1,len(tokens)-2):

            test_token = tokens[i_token]
            if len(test_token) == 0:
                continue

            # Do we need to make up a taxon for this token?
            if test_token not in speciesnet_latin_name_to_taxon_string:

                new_tokens = [''] * 7
                new_tokens[0] = 'fake_guid'
                for i_copy_token in range(1,i_token+1):
                    new_tokens[i_copy_token] = tokens[i_copy_token]
                new_tokens[-1] = test_token + ' species'
                assert new_tokens[-2] == '', \
                    'Illegal taxonomy string {}'.format(taxon_string)
                new_taxon_string = ';'.join(new_tokens)
                # assert new_taxon_string not in new_taxon_strings
                new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)

        # ...for each token

    # ...for each taxon

    new_taxon_string_to_missing_tokens = \
        sort_dictionary_by_key(new_taxon_string_to_missing_tokens)

    if verbose:

        print(f'Found {len(new_taxon_string_to_missing_tokens)} taxa that need to be inserted to ' + \
              'make the taxonomy valid, showing only mammals and birds here:\n')

        for taxon_string in new_taxon_string_to_missing_tokens:
            if 'mammalia' not in taxon_string and 'aves' not in taxon_string:
                continue
            missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
            print('{} ({})'.format(taxon_string,missing_taxa))

    for new_taxon_string in new_taxon_string_to_missing_tokens:
        _insert_taxonomy_string(new_taxon_string)


    ##%% Make sure all taxa on the allow-list are in the taxonomy

    n_failed_mappings = 0

    for target_taxon_latin_name in target_latin_to_common.keys():
        if target_taxon_latin_name not in speciesnet_latin_name_to_taxon_string:
            common_name = target_latin_to_common[target_taxon_latin_name]
            s = '{} ({}) not in speciesnet taxonomy'.format(
                target_taxon_latin_name,common_name)
            if common_name in speciesnet_common_name_to_taxon_string:
                s += ' (common name maps to {})'.format(
                    speciesnet_common_name_to_taxon_string[common_name])
            print(s)
            n_failed_mappings += 1

    if n_failed_mappings > 0:
        raise ValueError('Cannot continue with taxonomic restriction')


    ##%% For the allow-list, map each parent taxon to a set of allowable child taxa

    # Maps parent names to all allowed child names, or None if this is the
    # lowest-level allowable taxon on this path
    allowed_parent_taxon_to_child_taxa = defaultdict(set)

    # latin_name = next(iter(target_latin_to_common.keys()))
    for latin_name in target_latin_to_common:

        taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
        tokens = taxon_string.split(';')
        assert len(tokens) == 7, \
            'Illegal taxonomy string {}'.format(taxon_string)

        # Remove the GUID and common name
        #
        # This is now always class/order/family/genus/species
        tokens = tokens[1:-1]

        child_taxon = None

        # If this is a species
        if len(tokens[-1]) > 0:
            binomial_name = tokens[-2] + ' ' + tokens[-1]
            assert binomial_name == latin_name, \
                'Binomial/latin mismatch: {} vs {}'.format(binomial_name,latin_name)
            # If this already exists, it should only allow "None"
            if binomial_name in allowed_parent_taxon_to_child_taxa:
                assert len(allowed_parent_taxon_to_child_taxa[binomial_name]) == 1, \
                    'Species-level entry {} has multiple children'.format(binomial_name)
                assert None in allowed_parent_taxon_to_child_taxa[binomial_name], \
                    'Species-level entry {} has non-None children'.format(binomial_name)
            allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
            child_taxon = binomial_name

        # The first level that can ever be a parent taxon is the genus level
        parent_token_index = len(tokens) - 2

        # Walk up from genus to class
        while parent_token_index >= 0:

            # "None" is our leaf node marker; we should never have ''
            if child_taxon is not None:
                assert len(child_taxon) > 0

            parent_taxon = tokens[parent_token_index]

            # Don't create entries for blank taxa
            if (len(parent_taxon) > 0):

                create_child = True

                # This is the lowest-level taxon in this entry
                if (child_taxon is None):

                    # ...but we don't want to remove existing children from any parents
                    if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
                       (len(allowed_parent_taxon_to_child_taxa[parent_taxon]) > 0):
                        if verbose:
                            existing_children_string = str(allowed_parent_taxon_to_child_taxa[parent_taxon])
                            print('Not creating empty child for parent {} (already has children {})'.format(
                                parent_taxon,existing_children_string))
                        create_child = False

                # If we're adding a new child entry, clear out any leaf node markers
                else:

                    if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
                       (None in allowed_parent_taxon_to_child_taxa[parent_taxon]):

                        assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
                            'Illegal parent/child configuration'

                        if verbose:
                            print('Un-marking parent {} as a leaf node because of child {}'.format(
                                parent_taxon,child_taxon))

                        allowed_parent_taxon_to_child_taxa[parent_taxon] = set()

                if create_child:
                    allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)

                # If we haven't hit a non-empty taxon yet, don't update "child_taxon"
                assert len(parent_taxon) > 0
                child_taxon = parent_taxon

            # ...if we have a non-empty taxon

            parent_token_index -= 1

        # ...for each taxonomic level

    # ...for each allowed latin name

    allowed_parent_taxon_to_child_taxa = \
        sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)

    for parent_taxon in allowed_parent_taxon_to_child_taxa:
        # "None" should only ever appear alone; this marks a leaf node with no children
        if None in allowed_parent_taxon_to_child_taxa[parent_taxon]:
            assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
                '"None" should only appear alone in a child taxon list'

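    # For a hypothetical allow-list containing only "canis lupus", the resulting
    # mapping would look like:
    #
    #     {'canidae': {'canis'}, 'canis': {'canis lupus'}, 'canis lupus': {None},
    #      'carnivora': {'canidae'}, 'mammalia': {'carnivora'}}
    #
    # ...where None marks the lowest-level allowable taxon on a path.
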
    ##%% If we were just validating the custom taxa file, we're done

    if input_file is None:
        print('Finished validating custom taxa list')
        return


    ##%% Map all predictions that exist in this dataset to the prediction we should generate

    with open(input_file,'r') as f:
        input_data = json.load(f)

    input_category_id_to_common_name = input_data['classification_categories'] # noqa
    input_category_id_to_taxonomy_string = \
        input_data['classification_category_descriptions']

    input_category_id_to_output_taxon_string = {}

    # input_category_id = next(iter(input_category_id_to_taxonomy_string.keys()))
    for input_category_id in input_category_id_to_taxonomy_string.keys():

        input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
        input_taxon_tokens = input_taxon_string.split(';')
        assert len(input_taxon_tokens) == 7, \
            'Illegal taxonomy string: {}'.format(input_taxon_string)

        # Don't mess with blank/no-cv-result/human (or "animal", which is really "unknown")
        if (not is_taxonomic_prediction_string(input_taxon_string)) or \
           (input_taxon_string == human_prediction_string):
            if verbose:
                print('Not messing with non-taxonomic category {}'.format(input_taxon_string))
            input_category_id_to_output_taxon_string[input_category_id] = \
                input_taxon_string
            continue

        # Don't mess with protected categories
        common_name = input_taxon_tokens[-1]

        if (protected_common_names is not None) and \
           (common_name in protected_common_names):
            if verbose:
                print('Not messing with protected category {}:\n{}'.format(
                    common_name,input_taxon_string))
            input_category_id_to_output_taxon_string[input_category_id] = \
                input_taxon_string
            continue

        # Remove the GUID and common name
        #
        # This is always class/order/family/genus/species
        input_taxon_tokens = input_taxon_tokens[1:-1]
        assert len(input_taxon_tokens) == 5

        # Start at the species level (the last element in input_taxon_tokens),
        # and see whether each taxon is allowed
        test_index = len(input_taxon_tokens) - 1
        target_taxon = None

        while (test_index >= 0) and (target_taxon is None):

            # Species are represented as binomial names, i.e. when test_index is 4,
            # test_taxon_name will have two tokens (e.g. "canis lupus"); otherwise
            # test_taxon_name will have one token (e.g. "canis", or "aves")
            if (test_index == (len(input_taxon_tokens) - 1)) and \
               (len(input_taxon_tokens[-1]) > 0):
                test_taxon_name = \
                    input_taxon_tokens[-2] + ' ' + input_taxon_tokens[-1]
            else:
                test_taxon_name = input_taxon_tokens[test_index]

            # If we haven't yet found the level at which this taxon is non-empty,
            # keep going up
            if len(test_taxon_name) == 0:
                test_index -= 1
                continue

            assert test_taxon_name in speciesnet_latin_name_to_taxon_string, \
                '{} not found in taxonomy table'.format(test_taxon_name)

            # Is this taxon allowed according to the custom species list?
            if test_taxon_name in allowed_parent_taxon_to_child_taxa:

                allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
                assert allowed_child_taxa is not None, \
                    'allowed_child_taxa should not be None: {}'.format(test_taxon_name)

                # If this is the lowest-level allowable token or there is not a
                # unique child, don't walk any further, even if walking down
                # is enabled.
                if None in allowed_child_taxa:
                    assert len(allowed_child_taxa) == 1, \
                        '"None" should not appear as a child taxon alongside other child taxa'

                if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
                    target_taxon = test_taxon_name
                elif not allow_walk_down:
                    target_taxon = test_taxon_name
                else:
                    # If there's a unique child, walk back *down* the allowable
                    # taxa until we run out of unique children
                    while ((next(iter(allowed_child_taxa)) is not None) and \
                           (len(allowed_child_taxa) == 1)):
                        candidate_taxon = next(iter(allowed_child_taxa))
                        assert candidate_taxon in allowed_parent_taxon_to_child_taxa, \
                            '{} should be a key in allowed_parent_taxon_to_child_taxa'.format(
                                candidate_taxon)
                        assert candidate_taxon in speciesnet_latin_name_to_taxon_string, \
                            '{} should be a key in speciesnet_latin_name_to_taxon_string'.format(
                                candidate_taxon)
                        allowed_child_taxa = \
                            allowed_parent_taxon_to_child_taxa[candidate_taxon]
                        target_taxon = candidate_taxon

            # ...if this is an allowed taxon

            test_index -= 1

        # ...for each token

        if target_taxon is None:
            output_taxon_string = animal_prediction_string
        else:
            output_taxon_string = speciesnet_latin_name_to_taxon_string[target_taxon]
        input_category_id_to_output_taxon_string[input_category_id] = output_taxon_string

    # ...for each category (mapping input category IDs to output taxon strings)


    ##%% Map input category IDs to output category IDs

    speciesnet_taxon_string_to_latin_name = \
        invert_dictionary(speciesnet_latin_name_to_taxon_string)

    input_category_id_to_output_category_id = {}
    output_taxon_string_to_category_id = {}
    output_category_id_to_common_name = {}

    for input_category_id in input_category_id_to_output_taxon_string:

        output_taxon_string = \
            input_category_id_to_output_taxon_string[input_category_id]

        output_common_name = output_taxon_string.split(';')[-1]

        # Possibly substitute a custom common name
        if output_taxon_string in speciesnet_taxon_string_to_latin_name:

            speciesnet_latin_name = speciesnet_taxon_string_to_latin_name[output_taxon_string]

            if speciesnet_latin_name in speciesnet_latin_name_to_output_common_name:
                custom_common_name = speciesnet_latin_name_to_output_common_name[speciesnet_latin_name]
                if custom_common_name != output_common_name:
                    if verbose:
                        print('Substituting common name {} for {}'.format(custom_common_name,output_common_name))
                    output_common_name = custom_common_name

        # Do we need to create a new output category?
        if output_taxon_string not in output_taxon_string_to_category_id:
            output_category_id = str(len(output_taxon_string_to_category_id))
            output_taxon_string_to_category_id[output_taxon_string] = \
                output_category_id
            output_category_id_to_common_name[output_category_id] = \
                output_common_name
        else:
            output_category_id = \
                output_taxon_string_to_category_id[output_taxon_string]

        input_category_id_to_output_category_id[input_category_id] = \
            output_category_id

        # Sometimes-useful debug printouts
        if False:
            original_common_name = \
                input_category_id_to_common_name[input_category_id]
            original_taxon_string = \
                input_category_id_to_taxonomy_string[input_category_id]
            print('Mapping {} ({}) to:\n{} ({})\n'.format(
                original_common_name,original_taxon_string,
                output_common_name,output_taxon_string))

    # ...for each category (mapping input category IDs to output category IDs)

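    # Reminder: in MD-formatted results, each entry in det['classifications'] is a
    # [category_id, confidence] pair, so remapping a classification below only
    # rewrites element 0.
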
    ##%% Remap all category labels

    assert len(set(output_taxon_string_to_category_id.keys())) == \
           len(set(output_taxon_string_to_category_id.values())), \
        'Category ID/value non-uniqueness error'

    output_category_id_to_taxon_string = \
        invert_dictionary(output_taxon_string_to_category_id)

    with open(input_file,'r') as f:
        output_data = json.load(f)

    classification_descriptions = None
    if 'classification_category_descriptions' in output_data:
        classification_descriptions = output_data['classification_category_descriptions']

    for im in tqdm(output_data['images']):

        if 'detections' not in im or im['detections'] is None:
            continue

        description_options = ClassificationSmoothingOptions()
        if classification_threshold is not None:
            description_options.classification_confidence_threshold = classification_threshold

        # Possibly prepare a pre-filtering description
        pre_filtering_description = None
        if classification_descriptions is not None and add_pre_filtering_description:
            category_to_count = \
                count_detections_by_classification_category(im['detections'],
                                                            options=description_options)
            pre_filtering_description = \
                get_classification_description_string(category_to_count,classification_descriptions)
            im['pre_filtering_description'] = pre_filtering_description

        for det in im['detections']:
            if 'classifications' in det:
                for classification in det['classifications']:
                    classification[0] = \
                        input_category_id_to_output_category_id[classification[0]]

        if classification_descriptions is not None and add_post_filtering_description:
            category_to_count = \
                count_detections_by_classification_category(im['detections'],
                                                            options=description_options)
            post_filtering_description = \
                get_classification_description_string(category_to_count,output_category_id_to_taxon_string)
            im['post_filtering_description'] = post_filtering_description

    # ...for each image

    output_data['classification_categories'] = output_category_id_to_common_name
    output_data['classification_category_descriptions'] = \
        output_category_id_to_taxon_string


    ##%% Write output

    write_json(output_file,output_data)

    if combine_redundant_categories:
        _ = combine_redundant_classification_categories(input_file=output_file,
                                                        output_file=output_file)

# ...def restrict_to_taxa_list(...)

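# Usage sketch for this function (all filenames are hypothetical):
#
#     restrict_to_taxa_list(taxa_list='target-taxa.csv',
#                           speciesnet_taxonomy_file='speciesnet-taxonomy.txt',
#                           input_file='speciesnet-results.json',
#                           output_file='speciesnet-results-restricted.json')
#
# Passing input_file=None just validates [taxa_list] against the taxonomy.
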
def combine_redundant_classification_categories(input_file,
                                                output_file=None,
                                                classification_threshold=0.5):
    """
    Combine classification categories that share the same name in a MD-formatted
    .json file, optionally writing the results to a new .json file.

    Args:
        input_file (str): .json file to read, in MD format
        output_file (str, optional): .json file to write, in MD format
        classification_threshold (float, optional): only used when sorting
            descriptions by count

    Returns:
        dict: remapped MD-formatted dict
    """

    ##%% Read input file and list categories

    assert os.path.isfile(input_file), \
        'Input file {} not found'.format(input_file)

    with open(input_file,'r') as f:
        d = json.load(f)

    input_category_name_to_ids = defaultdict(list)

    for category_id in d['classification_categories']:
        category_name = d['classification_categories'][category_id]
        input_category_name_to_ids[category_name].append(category_id)


    ##%% Return early if there are no redundant categories

    # What's the largest number of IDs associated with a single category name?
    max_count = 0
    for category_name in input_category_name_to_ids:
        c = len(input_category_name_to_ids[category_name])
        if c > max_count:
            max_count = c

    if max_count == 1:
        if output_file is not None:
            print('No redundant categories, writing data unmodified to {}'.format(
                output_file))
            write_json(output_file,d)
        return d


    ##%% Map input category IDs to output category IDs

    input_category_id_to_output_category_id = {}

    for i_category,category_name in enumerate(input_category_name_to_ids):
        output_category_id = str(i_category)
        for input_category_id in input_category_name_to_ids[category_name]:
            input_category_id_to_output_category_id[input_category_id] = \
                output_category_id

    n_input_categories = len(d['classification_categories'])
    n_output_categories = len(input_category_name_to_ids)
    assert n_output_categories < n_input_categories
    print('Removing {} redundant categories'.format(
        n_input_categories - n_output_categories))


    ##%% Create a new category dict

    output_category_name_to_id = {}

    for input_category_id in input_category_id_to_output_category_id:
        category_name = d['classification_categories'][input_category_id]
        output_category_id = input_category_id_to_output_category_id[input_category_id]
        if category_name in output_category_name_to_id:
            assert output_category_name_to_id[category_name] == output_category_id
        else:
            output_category_name_to_id[category_name] = output_category_id


    ##%% Create new classification category descriptions

    if 'classification_category_descriptions' in d:

        assert len(d['classification_category_descriptions']) == \
               len(d['classification_categories'])

        # Sort descriptions by count overall, so we can sort by description within categories later
        description_to_count = defaultdict(int)
        for im in d['images']:
            if 'detections' not in im or im['detections'] is None:
                continue
            for det in im['detections']:
                if 'classifications' not in det or det['classifications'] is None:
                    continue
                conf = det['classifications'][0][1]
                if conf < classification_threshold:
                    continue
                input_category_id = det['classifications'][0][0]
                input_category_description = d['classification_category_descriptions'][input_category_id]
                description_to_count[input_category_description] += 1
            # ...for each detection
        # ...for each image

        # This is just a debug convenience
        description_to_count = sort_dictionary_by_value(description_to_count,
                                                        reverse=True)

        # Create descriptions for the output categories
        output_category_id_to_descriptions = defaultdict(list)

        for input_category_id in input_category_id_to_output_category_id:
            output_category_id = input_category_id_to_output_category_id[input_category_id]
            description = d['classification_category_descriptions'][input_category_id]
            output_category_id_to_descriptions[output_category_id].append(description)

        output_classification_category_descriptions = {}

        for output_category_id in output_category_id_to_descriptions:
            descriptions = output_category_id_to_descriptions[output_category_id]
            if len(descriptions) > 1:
                # Sort "descriptions" in descending order by the corresponding values
                # in description_to_count
                descriptions.sort(key=lambda x: description_to_count[x], reverse=True)
            output_classification_category_descriptions[output_category_id] = \
                '|'.join(descriptions)
        # ...for each category

        d['classification_category_descriptions'] = output_classification_category_descriptions

    # ...if we have to manage descriptions

    d['classification_categories'] = invert_dictionary(output_category_name_to_id)

    # Remap classifications
    for im in d['images']:
        if 'detections' not in im or im['detections'] is None:
            continue
        for det in im['detections']:
            if 'classifications' not in det or det['classifications'] is None:
                continue
            for i_class in range(0,len(det['classifications'])):
                input_category_id = det['classifications'][i_class][0]
                output_category_id = \
                    input_category_id_to_output_category_id[input_category_id]
                det['classifications'][i_class][0] = output_category_id
            # ...for each classification
        # ...for each detection
    # ...for each image

    if output_file is not None:
        write_json(output_file,d)

    return d

# ...def combine_redundant_classification_categories(...)
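
# Usage sketch for this function (filenames are hypothetical):
#
#     d = combine_redundant_classification_categories(
#         input_file='speciesnet-results-restricted.json',
#         output_file='speciesnet-results-combined.json')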