megadetector-10.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +702 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +528 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +187 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +663 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +876 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2159 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1494 -0
  81. megadetector/detection/run_tiled_inference.py +1038 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1752 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2077 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +224 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2832 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1759 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1940 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +479 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.13.dist-info/METADATA +134 -0
  144. megadetector-10.0.13.dist-info/RECORD +147 -0
  145. megadetector-10.0.13.dist-info/WHEEL +5 -0
  146. megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.13.dist-info/top_level.txt +1 -0
megadetector/postprocessing/classification_postprocessing.py
@@ -0,0 +1,1752 @@
+ """
+
+ classification_postprocessing.py
+
+ Functions for postprocessing species classification results, particularly:
+
+ * Smoothing results within an image (an image with 700 cows and one deer is really just 701
+   cows)
+ * Smoothing results within a sequence (a sequence that looks like deer/deer/deer/elk/deer/deer
+   is really just a deer)
+
+ """
+
+ #%% Constants and imports
+
+ import json
+ import copy
+ import pandas as pd
+
+ from collections import defaultdict
+ from tqdm import tqdm
+
+ from megadetector.utils.ct_utils import is_list_sorted
+ from megadetector.utils.ct_utils import is_empty
+ from megadetector.utils.ct_utils import sort_dictionary_by_value
+ from megadetector.utils.ct_utils import sort_dictionary_by_key
+ from megadetector.utils.ct_utils import invert_dictionary
+ from megadetector.utils.ct_utils import write_json
+
+ from megadetector.utils.wi_taxonomy_utils import clean_taxonomy_string
+ from megadetector.utils.wi_taxonomy_utils import taxonomy_level_index
+ from megadetector.utils.wi_taxonomy_utils import taxonomy_level_string_to_index
+
+ from megadetector.utils.wi_taxonomy_utils import human_prediction_string
+ from megadetector.utils.wi_taxonomy_utils import animal_prediction_string
+ from megadetector.utils.wi_taxonomy_utils import is_taxonomic_prediction_string
+ from megadetector.utils.wi_taxonomy_utils import blank_prediction_string # noqa
+
+
+ #%% Options classes
+
+ class ClassificationSmoothingOptions:
+     """
+     Options used to parameterize smooth_classification_results_image_level()
+     and smooth_classification_results_sequence_level()
+     """
+
+     def __init__(self):
+
+         #: How many detections do we need in a dominant category to overwrite
+         #: non-dominant classifications? This is irrelevant if
+         #: max_detections_nondominant_class <= 1.
+         self.min_detections_to_overwrite_secondary = 4
+
+         #: Even if we have a dominant class, if a non-dominant class has at least
+         #: this many classifications in an image, leave them alone.
+         #:
+         #: If this is <= 1, we won't replace non-dominant, non-other classes
+         #: with the dominant class, even if there are 900 cows and 1 deer.
+         self.max_detections_nondominant_class = 1
+
+         #: How many detections do we need in a dominant category to overwrite
+         #: non-dominant classifications in the same family? If this is <= 0,
+         #: we'll skip this step. This option doesn't mean anything if
+         #: max_detections_nondominant_class_same_family <= 1.
+         self.min_detections_to_overwrite_secondary_same_family = 2
+
+         #: If we have this many classifications of a nondominant category,
+         #: we won't do same-family overwrites. <= 1 means "even if there are
+         #: a million deer, if there are two million moose, call all the deer
+         #: moose". This option doesn't mean anything if
+         #: min_detections_to_overwrite_secondary_same_family <= 0.
+         self.max_detections_nondominant_class_same_family = -1
+
+         #: If the dominant class has at least this many classifications, overwrite
+         #: "other" classifications with the dominant class
+         self.min_detections_to_overwrite_other = 2
+
+         #: Names to treat as "other" categories; can't be None, but can be empty
+         #:
+         #: "Other" classifications will be changed to the dominant category, regardless
+         #: of confidence, as long as there are at least min_detections_to_overwrite_other
+         #: examples of the dominant class. For example, cow/other will remain unchanged,
+         #: but cow/cow/other will become cow/cow/cow.
+         self.other_category_names = ['other','unknown','no cv result','animal','blank','mammal']
+
+         #: We're not even going to mess around with classifications below this threshold.
+         #:
+         #: We won't count them, we won't over-write them, they don't exist during the
+         #: within-image smoothing step.
+         self.classification_confidence_threshold = 0.5
+
+         #: We're not even going to mess around with detections below this threshold.
+         #:
+         #: We won't count them, we won't over-write them, they don't exist during the
+         #: within-image smoothing step.
+         self.detection_confidence_threshold = 0.15
+
+         #: If classification descriptions are present and appear to represent taxonomic
+         #: information, should we propagate classifications when lower-level taxa are more
+         #: common in an image? For example, if we see "carnivore/fox/fox/deer", should
+         #: we make that "fox/fox/fox/deer"?
+         self.propagate_classifications_through_taxonomy = True
+
+         #: When propagating classifications down through taxonomy levels, we have to
+         #: decide whether we prefer more frequent categories or more specific categories.
+         #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
+         #: balance levels against counts in this process.
+         self.taxonomy_propagation_level_weight = 1.0
+
+         #: When propagating classifications down through taxonomy levels, we have to
+         #: decide whether we prefer more frequent categories or more specific categories.
+         #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
+         #: balance levels against counts in this process.
+         #:
+         #: With a very low default value, this just breaks ties.
+         self.taxonomy_propagation_count_weight = 0.01
+
+         #: Should we record information about the state of labels prior to smoothing?
+         self.add_pre_smoothing_description = True
+
+         #: When a dict (rather than a file) is passed to either smoothing function,
+         #: if this is False, we'll make a copy of the input dict before modifying.
+         self.modify_in_place = False
+
+         #: Only include these categories in the smoothing process (None to use all categories)
+         self.detection_category_names_to_smooth = ['animal']
+
+         #: Debug options
+         self.break_at_image = None
+
+         ## Populated internally
+
+         #: Detection category IDs corresponding to detection_category_names_to_smooth
+         self._detection_category_ids_to_smooth = None
+
+
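As a quick reference, these options are plain attributes on a single object; here's a minimal editorial sketch (not taken from the diff, with arbitrary values) of overriding a few defaults before smoothing:

    from megadetector.postprocessing.classification_postprocessing import \
        ClassificationSmoothingOptions

    options = ClassificationSmoothingOptions()
    options.min_detections_to_overwrite_other = 3       # default is 2
    options.classification_confidence_threshold = 0.6   # default is 0.5
    options.other_category_names = ['other','unknown']  # trim the default list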
+ #%% Utility functions
+
+ def _results_for_sequence(images_this_sequence,filename_to_results):
+     """
+     Fetch MD results for every image in this sequence, based on the 'file_name' field
+     """
+
+     results_this_sequence = []
+     for im in images_this_sequence:
+         fn = im['file_name']
+         results_this_image = filename_to_results[fn]
+         assert isinstance(results_this_image,dict)
+         results_this_sequence.append(results_this_image)
+
+     return results_this_sequence
+
+
+ def _sort_images_by_time(images):
+     """
+     Returns a copy of [images], sorted by the 'datetime' field (ascending).
+     """
+     return sorted(images, key = lambda im: im['datetime'])
+
+
+ def _detection_is_relevant_for_smoothing(det,options):
+     """
+     Determine whether [det] has classifications that might be meaningful for smoothing.
+     """
+
+     if ('classifications' not in det) or \
+        (det['conf'] < options.detection_confidence_threshold):
+         return False
+
+     # Ignore non-smoothed categories
+     if (options._detection_category_ids_to_smooth is not None) and \
+        (det['category'] not in options._detection_category_ids_to_smooth):
+         return False
+
+     return True
+
+
+ def count_detections_by_classification_category(detections,options=None):
+     """
+     Count the number of instances of each classification category in the detections list
+     [detections], considering only above-threshold detections. Sort results in descending
+     order by count. Returns a dict mapping category ID --> count. If no detections
+     are above threshold, returns an empty dict.
+
+     Only processes the top classification for each detection.
+
+     Args:
+         detections (list of dict): detections list
+         options (ClassificationSmoothingOptions, optional): see ClassificationSmoothingOptions
+
+     Returns:
+         dict mapping above-threshold category IDs to counts
+     """
+
+     if detections is None or len(detections) == 0:
+         return {}
+
+     if options is None:
+         options = ClassificationSmoothingOptions()
+
+     category_to_count = defaultdict(int)
+
+     for det in detections:
+
+         if not _detection_is_relevant_for_smoothing(det,options):
+             continue
+
+         c = det['classifications'][0]
+         if c[1] >= options.classification_confidence_threshold:
+             category_to_count[c[0]] += 1
+
+     category_to_count = {k: v for k, v in sorted(category_to_count.items(),
+                                                  key=lambda item: item[1],
+                                                  reverse=True)}
+
+     return category_to_count
+
+
+ def get_classification_description_string(category_to_count,classification_descriptions):
+     """
+     Return a string summarizing the image content according to [category_to_count].
+
+     Args:
+         category_to_count (dict): a dict mapping category IDs to counts
+         classification_descriptions (dict): a dict mapping category IDs to description strings
+
+     Returns:
+         string: a description of this image's content, e.g. "rabbit (4), human (1)"
+     """
+
+     category_strings = []
+     # category_id = next(iter(category_to_count))
+     for category_id in category_to_count:
+         category_description = classification_descriptions[category_id]
+         tokens = category_description.split(';')
+         assert len(tokens) == 7
+         category_name = tokens[-1]
+         if len(category_name) == 0:
+             category_name = 'undefined category'
+         count = category_to_count[category_id]
+         category_string = '{} ({})'.format(category_name,count)
+         category_strings.append(category_string)
+
+     return ', '.join(category_strings)
+
+
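A minimal sketch of how the two helpers above compose, using made-up category IDs, confidences, and 7-token taxonomy strings (the GUIDs here are placeholders):

    from megadetector.postprocessing.classification_postprocessing import \
        count_detections_by_classification_category, \
        get_classification_description_string

    # Three above-threshold detections: two classified as category '1', one as '2'
    detections = [
        {'category': '1', 'conf': 0.9, 'classifications': [['1', 0.95]]},
        {'category': '1', 'conf': 0.8, 'classifications': [['1', 0.90]]},
        {'category': '1', 'conf': 0.7, 'classifications': [['2', 0.85]]}
    ]

    category_to_count = count_detections_by_classification_category(detections)
    # category_to_count == {'1': 2, '2': 1}, sorted in descending order by count

    descriptions = {
        '1': 'guid-1;mammalia;cetartiodactyla;cervidae;odocoileus;virginianus;white-tailed deer',
        '2': 'guid-2;mammalia;cetartiodactyla;cervidae;cervus;canadensis;elk'
    }
    print(get_classification_description_string(category_to_count, descriptions))
    # white-tailed deer (2), elk (1)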
+ def _print_counts_with_names(category_to_count,classification_descriptions):
+     """
+     Print a list of classification categories with counts, based on the ID --> count
+     dict [category_to_count]
+     """
+
+     for category_id in category_to_count:
+         category_name = classification_descriptions[category_id]
+         count = category_to_count[category_id]
+         print('{}: {} ({})'.format(category_id,category_name,count))
+
+
+ def _prepare_results_for_smoothing(input_file,options):
+     """
+     Load results from [input_file] if necessary, and prepare category descriptions
+     for smoothing. Adds pre-smoothing descriptions to every image if the options
+     say we're supposed to do that.
+
+     May modify some fields in [options].
+     """
+
+     if isinstance(input_file,str):
+         with open(input_file,'r') as f:
+             print('Loading results from:\n{}'.format(input_file))
+             d = json.load(f)
+     else:
+         assert isinstance(input_file,dict)
+         if options.modify_in_place:
+             d = input_file
+         else:
+             print('modify_in_place is False, copying the input before modifying')
+             d = copy.deepcopy(input_file)
+
+
+     ## Category processing
+
+     category_name_to_id = {d['classification_categories'][k]:k for k in d['classification_categories']}
+     other_category_ids = []
+     for s in options.other_category_names:
+         if s in category_name_to_id:
+             other_category_ids.append(category_name_to_id[s])
+
+     # Possibly update the list of category IDs we should smooth
+     if options.detection_category_names_to_smooth is None:
+         options._detection_category_ids_to_smooth = None
+     else:
+         detection_category_id_to_name = d['detection_categories']
+         detection_category_name_to_id = invert_dictionary(detection_category_id_to_name)
+         options._detection_category_ids_to_smooth = []
+         for category_name in options.detection_category_names_to_smooth:
+             options._detection_category_ids_to_smooth.append(detection_category_name_to_id[category_name])
+
+     # Before we do anything else, get rid of everything but the top classification
+     # for each detection, and remove the 'classifications' field from detections with
+     # no classifications.
+     for im in tqdm(d['images']):
+
+         if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
+             continue
+
+         detections = im['detections']
+
+         for det in detections:
+
+             if 'classifications' not in det:
+                 continue
+             if len(det['classifications']) == 0:
+                 del det['classifications']
+                 continue
+
+             classification_confidence_values = [c[1] for c in det['classifications']]
+             assert is_list_sorted(classification_confidence_values,reverse=True)
+             det['classifications'] = [det['classifications'][0]]
+
+         # ...for each detection in this image
+
+     # ...for each image
+
+
+     ## Clean up classification descriptions...
+
+     # ...so we can test taxonomic relationships by substring testing.
+
+     classification_descriptions_clean = None
+     classification_descriptions = None
+
+     if 'classification_category_descriptions' in d:
+         classification_descriptions = d['classification_category_descriptions']
+         classification_descriptions_clean = {}
+         # category_id = next(iter(classification_descriptions))
+         for category_id in classification_descriptions:
+             classification_descriptions_clean[category_id] = \
+                 clean_taxonomy_string(classification_descriptions[category_id]).strip(';').lower()
+
+
+     ## Optionally add pre-smoothing descriptions to every image
+
+     if options.add_pre_smoothing_description and (classification_descriptions is not None):
+
+         for im in tqdm(d['images']):
+
+             if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
+                 continue
+
+             detections = im['detections']
+             category_to_count = count_detections_by_classification_category(detections, options)
+
+             im['pre_smoothing_description'] = \
+                 get_classification_description_string(category_to_count, classification_descriptions)
+
+
+     return {
+         'd':d,
+         'other_category_ids':other_category_ids,
+         'classification_descriptions_clean':classification_descriptions_clean,
+         'classification_descriptions':classification_descriptions
+     }
+
+ # ...def _prepare_results_for_smoothing(...)
+
+
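The "clean" descriptions computed here are what make the parent/child tests in the smoothing steps below possible: once GUIDs and common names are stripped, a parent taxon's string is contained in each of its descendants' strings. A small sketch with hypothetical cleaned strings:

    # Hypothetical cleaned taxonomy strings (GUID and common name already stripped)
    family_description  = 'mammalia;carnivora;canidae'
    species_description = 'mammalia;carnivora;canidae;vulpes;vulpes'

    # The parent/child test used throughout the smoothing code is substring containment
    assert family_description in species_description       # canidae is an ancestor
    assert species_description not in family_description   # ...but not a descendant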
+ def _smooth_classifications_for_list_of_detections(detections,
+                                                    options,
+                                                    other_category_ids,
+                                                    classification_descriptions,
+                                                    classification_descriptions_clean):
+     """
+     Smooth classifications for a list of detections, which may have come from a single
+     image, or may represent an entire sequence.
+
+     Returns None if no changes are made, else a dict.
+
+     classification_descriptions_clean should be semicolon-delimited taxonomic strings
+     from which common names and GUIDs have already been removed.
+
+     Assumes there is only one classification per detection, i.e. that non-top classifications
+     have already been removed.
+     """
+
+     ## Count the number of instances of each category in this image
+
+     category_to_count = count_detections_by_classification_category(detections, options)
+     # _print_counts_with_names(category_to_count,classification_descriptions)
+     # get_classification_description_string(category_to_count, classification_descriptions)
+
+     if len(category_to_count) <= 1:
+         return None
+
+     keys = list(category_to_count.keys())
+
+     # Handle a quirky special case: if the most common category is "other" and
+     # it's "tied" with the second-most-common category, swap them
+     if (len(keys) > 1) and \
+        (keys[0] in other_category_ids) and \
+        (keys[1] not in other_category_ids) and \
+        (category_to_count[keys[0]] == category_to_count[keys[1]]):
+         keys[1], keys[0] = keys[0], keys[1]
+
+     max_count = category_to_count[keys[0]]
+     most_common_category = keys[0]
+     del keys
+
+
+     ## Debug tools
+
+     verbose_debug_enabled = False
+
+     if options.break_at_image is not None:
+         for det in detections:
+             if 'image_filename' in det and \
+                det['image_filename'] == options.break_at_image:
+                 verbose_debug_enabled = True
+                 break
+
+     if verbose_debug_enabled:
+         _print_counts_with_names(category_to_count,classification_descriptions)
+         # from IPython import embed; embed()
+
+
+     ## Possibly change "other" classifications to the most common category
+
+     # ...if the dominant category is not an "other" category.
+
+     n_other_classifications_changed_this_image = 0
+
+     # If we have at least *min_detections_to_overwrite_other* in a category that isn't
+     # "other", change all "other" classifications to that category
+     if (max_count >= options.min_detections_to_overwrite_other) and \
+        (most_common_category not in other_category_ids):
+
+         for det in detections:
+
+             if not _detection_is_relevant_for_smoothing(det,options):
+                 continue
+
+             assert len(det['classifications']) == 1
+             c = det['classifications'][0]
+
+             if (c[1] >= options.classification_confidence_threshold) and \
+                (c[0] in other_category_ids):
+
+                 if verbose_debug_enabled:
+                     print('Replacing {} with {}'.format(
+                         classification_descriptions[c[0]],
+                         most_common_category))
+
+                 n_other_classifications_changed_this_image += 1
+                 c[0] = most_common_category
+
+             # ...if there are classifications for this detection
+
+         # ...for each detection
+
+     # ...if we should overwrite all "other" classifications
+
+     if verbose_debug_enabled:
+         print('Made {} other changes'.format(n_other_classifications_changed_this_image))
+
+
+     ## Re-count
+
+     category_to_count = count_detections_by_classification_category(detections, options)
+     # _print_counts_with_names(category_to_count,classification_descriptions)
+     keys = list(category_to_count.keys())
+     max_count = category_to_count[keys[0]]
+     most_common_category = keys[0]
+     del keys
+
+
+     ## Possibly change some non-dominant classifications to the dominant category
+
+     process_taxonomic_rules = \
+         (classification_descriptions_clean is not None) and \
+         (len(classification_descriptions_clean) > 0) and \
+         (len(category_to_count) > 1)
+
+     n_detections_flipped_this_image = 0
+
+     # Don't do this if the most common category is an "other" category, or
+     # if we don't have enough of the most common category
+     if (most_common_category not in other_category_ids) and \
+        (max_count >= options.min_detections_to_overwrite_secondary):
+
+         # i_det = 0; det = detections[i_det]
+         for i_det,det in enumerate(detections):
+
+             if not _detection_is_relevant_for_smoothing(det,options):
+                 continue
+
+             assert len(det['classifications']) == 1
+             c = det['classifications'][0]
+
+             # Don't over-write the most common category with itself
+             if c[0] == most_common_category:
+                 continue
+
+             # Don't bother with below-threshold classifications
+             if c[1] < options.classification_confidence_threshold:
+                 continue
+
+             # If we're doing taxonomic processing, at this stage, don't turn children
+             # into parents; we'll likely turn parents into children in the next stage.
+             if process_taxonomic_rules:
+
+                 most_common_category_description = \
+                     classification_descriptions_clean[most_common_category]
+
+                 category_id_this_classification = c[0]
+                 assert category_id_this_classification in category_to_count
+
+                 category_description_this_classification = \
+                     classification_descriptions_clean[category_id_this_classification]
+
+                 # An empty description corresponds to the "animal" category. We don't handle
+                 # "animal" here as a parent category; that would be handled in the "other
+                 # smoothing" step above.
+                 if len(category_description_this_classification) == 0:
+                     continue
+
+                 most_common_category_is_parent_of_this_category = \
+                     most_common_category_description in category_description_this_classification
+
+                 if most_common_category_is_parent_of_this_category:
+                     continue
+
+             # If we have fewer of this category than the most common category,
+             # but not *too* many, flip it to the most common category.
+             if (max_count > category_to_count[c[0]]) and \
+                (category_to_count[c[0]] <= options.max_detections_nondominant_class):
+
+                 c[0] = most_common_category
+                 n_detections_flipped_this_image += 1
+
+         # ...for each detection
+
+     # ...if the dominant category is legit
+
+     if verbose_debug_enabled:
+         print('Made {} non-dominant --> dominant changes'.format(
+             n_detections_flipped_this_image))
+
+
+     ## Re-count
+
+     category_to_count = count_detections_by_classification_category(detections, options)
+     # _print_counts_with_names(category_to_count,classification_descriptions)
+     keys = list(category_to_count.keys())
+     max_count = category_to_count[keys[0]]
+     most_common_category = keys[0]
+     del keys
+
+
+     ## Possibly collapse higher-level taxonomic predictions down to lower levels
+
+     n_taxonomic_changes_this_image = 0
+
+     process_taxonomic_rules = \
+         (classification_descriptions_clean is not None) and \
+         (len(classification_descriptions_clean) > 0) and \
+         (len(category_to_count) > 1)
+
+     if process_taxonomic_rules and options.propagate_classifications_through_taxonomy:
+
+         # det = detections[3]
+         for det in detections:
+
+             if not _detection_is_relevant_for_smoothing(det,options):
+                 continue
+
+             assert len(det['classifications']) == 1
+             c = det['classifications'][0]
+
+             # Don't bother with any classifications below the confidence threshold
+             if c[1] < options.classification_confidence_threshold:
+                 continue
+
+             category_id_this_classification = c[0]
+             assert category_id_this_classification in category_to_count
+
+             category_description_this_classification = \
+                 classification_descriptions_clean[category_id_this_classification]
+
+             # An empty description corresponds to the "animal" category. We don't handle
+             # "animal" here as a parent category; that would be handled in the "other
+             # smoothing" step above.
+             if len(category_description_this_classification) == 0:
+                 continue
+
+             # We may have multiple child categories to choose from; this keeps track of
+             # the "best" we've seen so far. "Best" is based on the level (species is better
+             # than genus) and number.
+             child_category_to_score = defaultdict(float)
+
+             for category_id_of_candidate_child in category_to_count.keys():
+
+                 # A category is never its own child
+                 if category_id_of_candidate_child == category_id_this_classification:
+                     continue
+
+                 # Is this candidate a child of the current classification?
+                 category_description_candidate_child = \
+                     classification_descriptions_clean[category_id_of_candidate_child]
+
+                 # An empty description corresponds to "animal", which can never
+                 # be a child of another category.
+                 if len(category_description_candidate_child) == 0:
+                     continue
+
+                 # This handles a case that doesn't come up with "pure" SpeciesNet results;
+                 # if two categories have different IDs but the same "clean" description, this
+                 # means they're different common names for the same species, which we use
+                 # for things like "white-tailed deer buck" and "white-tailed deer fawn".
+                 #
+                 # Currently we don't support smoothing those predictions, because it's not a
+                 # parent/child relationship.
+                 if category_description_candidate_child == \
+                    category_description_this_classification:
+                     continue
+
+                 # As long as we're using "clean" descriptions, parent/child taxonomic
+                 # relationships are defined by a substring relationship
+                 is_child = category_description_this_classification in \
+                     category_description_candidate_child
+
+                 if not is_child:
+                     continue
+
+                 # How many instances of this child category are there?
+                 child_category_count = category_to_count[category_id_of_candidate_child]
+
+                 # What taxonomy level is this child category defined at?
+                 child_category_level = taxonomy_level_index(
+                     classification_descriptions[category_id_of_candidate_child])
+
+                 child_category_to_score[category_id_of_candidate_child] = \
+                     child_category_level * options.taxonomy_propagation_level_weight + \
+                     child_category_count * options.taxonomy_propagation_count_weight
+
+             # ...for each category we are considering reducing this classification to
+
+             # Did we find a category we want to change this classification to?
+             if len(child_category_to_score) > 0:
+
+                 # Find the child category with the highest score
+                 child_category_to_score = sort_dictionary_by_value(
+                     child_category_to_score,reverse=True)
+                 best_child_category = next(iter(child_category_to_score.keys()))
+
+                 if verbose_debug_enabled:
+                     old_category_name = \
+                         classification_descriptions_clean[c[0]]
+                     new_category_name = \
+                         classification_descriptions_clean[best_child_category]
+                     print('Replacing {} with {}'.format(
+                         old_category_name,new_category_name))
+
+                 c[0] = best_child_category
+                 n_taxonomic_changes_this_image += 1
+
+         # ...for each detection
+
+     # ...if we have taxonomic information available
+
+
+     ## Re-count
+
+     category_to_count = count_detections_by_classification_category(detections, options)
+     # _print_counts_with_names(category_to_count,classification_descriptions)
+     keys = list(category_to_count.keys())
+     max_count = category_to_count[keys[0]]
+     most_common_category = keys[0]
+     del keys
+
+
+     ## Possibly do within-family smoothing
+
+     n_within_family_smoothing_changes = 0
+
+     # min_detections_to_overwrite_secondary_same_family = -1
+     # max_detections_nondominant_class_same_family = 1
+     family_level = taxonomy_level_string_to_index('family')
+
+     if process_taxonomic_rules:
+
+         category_description_most_common_category = \
+             classification_descriptions[most_common_category]
+         most_common_category_taxonomic_level = \
+             taxonomy_level_index(category_description_most_common_category)
+         n_most_common_category = category_to_count[most_common_category]
+         tokens = category_description_most_common_category.split(';')
+         assert len(tokens) == 7
+         most_common_category_family = tokens[3]
+         most_common_category_genus = tokens[4]
+
+         # Only consider remapping to genus or species level, and only when we have
+         # a high enough count in the most common category
+         if process_taxonomic_rules and \
+            (options.min_detections_to_overwrite_secondary_same_family > 0) and \
+            (most_common_category not in other_category_ids) and \
+            (most_common_category_taxonomic_level > family_level) and \
+            (n_most_common_category >= options.min_detections_to_overwrite_secondary_same_family):
+
+             # det = detections[0]
+             for det in detections:
+
+                 if not _detection_is_relevant_for_smoothing(det,options):
+                     continue
+
+                 assert len(det['classifications']) == 1
+                 c = det['classifications'][0]
+
+                 # Don't over-write the most common category with itself
+                 if c[0] == most_common_category:
+                     continue
+
+                 # Don't bother with below-threshold classifications
+                 if c[1] < options.classification_confidence_threshold:
+                     continue
+
+                 n_candidate_flip_category = category_to_count[c[0]]
+
+                 # Do we have too many of the non-dominant category to do this kind of swap?
+                 if n_candidate_flip_category > \
+                    options.max_detections_nondominant_class_same_family:
+                     continue
+
+                 # Don't flip classes when it's a tie
+                 if n_candidate_flip_category == n_most_common_category:
+                     continue
+
+                 category_description_candidate_flip = \
+                     classification_descriptions[c[0]]
+                 tokens = category_description_candidate_flip.split(';')
+                 assert len(tokens) == 7
+                 candidate_flip_category_family = tokens[3]
+                 candidate_flip_category_genus = tokens[4]
+                 candidate_flip_category_taxonomic_level = \
+                     taxonomy_level_index(category_description_candidate_flip)
+
+                 # Only proceed if we have valid family strings
+                 if (len(candidate_flip_category_family) == 0) or \
+                    (len(most_common_category_family) == 0):
+                     continue
+
+                 # Only proceed if the candidate and the most common category are in the same family
+                 if candidate_flip_category_family != most_common_category_family:
+                     continue
+
+                 # Don't flip from a species to the genus level in the same genus
+                 if (candidate_flip_category_genus == most_common_category_genus) and \
+                    (candidate_flip_category_taxonomic_level > \
+                     most_common_category_taxonomic_level):
+                     continue
+
+                 old_category_name = classification_descriptions_clean[c[0]]
+                 new_category_name = classification_descriptions_clean[most_common_category]
+
+                 c[0] = most_common_category
+                 n_within_family_smoothing_changes += 1
+
+             # ...for each detection
+
+     # ...if the dominant category is legit and we have taxonomic information available
+
+     return {'n_other_classifications_changed_this_image':n_other_classifications_changed_this_image,
+             'n_detections_flipped_this_image':n_detections_flipped_this_image,
+             'n_taxonomic_changes_this_image':n_taxonomic_changes_this_image,
+             'n_within_family_smoothing_changes':n_within_family_smoothing_changes}
+
+ # ...def _smooth_classifications_for_list_of_detections(...)
+
+
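For intuition about the taxonomy-propagation scoring above: with the default weights (level weight 1.0, count weight 0.01), the taxonomic level dominates and the count effectively just breaks ties. A worked sketch, assuming hypothetical level indices where higher means more specific (e.g. genus below species):

    level_weight = 1.0   # default taxonomy_propagation_level_weight
    count_weight = 0.01  # default taxonomy_propagation_count_weight

    # Candidate child A: genus-level (hypothetical level index 4), seen 3 times
    score_a = 4 * level_weight + 3 * count_weight  # 4.03

    # Candidate child B: species-level (hypothetical level index 5), seen once
    score_b = 5 * level_weight + 1 * count_weight  # 5.01

    # The more specific candidate wins despite its lower count
    assert score_b > score_a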
+ def _smooth_single_image(im,
+                          options,
+                          other_category_ids,
+                          classification_descriptions,
+                          classification_descriptions_clean):
+     """
+     Smooth classifications for a single image. Returns None if no changes are made,
+     else a dict.
+
+     classification_descriptions_clean should be semicolon-delimited taxonomic strings
+     from which common names and GUIDs have already been removed.
+
+     Assumes there is only one classification per detection, i.e. that non-top classifications
+     have already been removed.
+     """
+
+     if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
+         return
+
+     detections = im['detections']
+
+     # Simplify debugging
+     for det in detections:
+         det['image_filename'] = im['file']
+
+     to_return = _smooth_classifications_for_list_of_detections(detections,
+                     options=options,
+                     other_category_ids=other_category_ids,
+                     classification_descriptions=classification_descriptions,
+                     classification_descriptions_clean=classification_descriptions_clean)
+
+     # Clean out debug information
+     for det in detections:
+         del det['image_filename']
+
+     return to_return
+
+ # ...def _smooth_single_image(...)
+
+
+ #%% Image-level smoothing
+
+ def smooth_classification_results_image_level(input_file,output_file=None,options=None):
+     """
+     Smooth classifications at the image level for all results in the MD-formatted results
+     file [input_file], optionally writing a new set of results to [output_file].
+
+     This function generally expresses the notion that an image with 700 cows and one deer
+     is really just 701 cows.
+
+     Only detections with a classification confidence above
+     [options.classification_confidence_threshold] are counted, which in practice means
+     we're only looking at one category per detection.
+
+     If an image has at least [options.min_detections_to_overwrite_secondary] such detections
+     in the most common category, and no more than [options.max_detections_nondominant_class]
+     in the second-most-common category, flip all detections to the most common
+     category.
+
+     Some classes can optionally be treated as particularly unreliable; this is typically
+     used to overwrite an "other" class.
+
+     This function also removes everything but the top classification for each detection.
+
+     Args:
+         input_file (str): MegaDetector-formatted classification results file to smooth. Can
+             also be an already-loaded results dict.
+         output_file (str, optional): .json file to write smoothed results
+         options (ClassificationSmoothingOptions, optional): see
+             ClassificationSmoothingOptions for details.
+
+     Returns:
+         dict: MegaDetector-results-formatted dict, identical to what's written to
+         [output_file] if [output_file] is not None.
+     """
+
+     ## Input validation
+
+     if options is None:
+         options = ClassificationSmoothingOptions()
+
+     r = _prepare_results_for_smoothing(input_file, options)
+     d = r['d']
+     other_category_ids = r['other_category_ids']
+     classification_descriptions_clean = r['classification_descriptions_clean']
+     classification_descriptions = r['classification_descriptions']
+
+
+     ## Smoothing
+
+     n_other_classifications_changed = 0
+     n_other_images_changed = 0
+     n_taxonomic_images_changed = 0
+
+     n_detections_flipped = 0
+     n_images_changed = 0
+     n_taxonomic_classification_changes = 0
+
+     # im = d['images'][0]
+     for im in tqdm(d['images']):
+
+         r = _smooth_single_image(im,
+                                  options,
+                                  other_category_ids,
+                                  classification_descriptions=classification_descriptions,
+                                  classification_descriptions_clean=classification_descriptions_clean)
+
+         if r is None:
+             continue
+
+         n_detections_flipped_this_image = r['n_detections_flipped_this_image']
+         n_other_classifications_changed_this_image = \
+             r['n_other_classifications_changed_this_image']
+         n_taxonomic_changes_this_image = r['n_taxonomic_changes_this_image']
+
+         n_detections_flipped += n_detections_flipped_this_image
+         n_other_classifications_changed += n_other_classifications_changed_this_image
+         n_taxonomic_classification_changes += n_taxonomic_changes_this_image
+
+         if n_detections_flipped_this_image > 0:
+             n_images_changed += 1
+         if n_other_classifications_changed_this_image > 0:
+             n_other_images_changed += 1
+         if n_taxonomic_changes_this_image > 0:
+             n_taxonomic_images_changed += 1
+
+     # ...for each image
+
+     print('Classification smoothing: changed {} detections on {} images'.format(
+         n_detections_flipped,n_images_changed))
+
+     print('"Other" smoothing: changed {} detections on {} images'.format(
+         n_other_classifications_changed,n_other_images_changed))
+
+     print('Taxonomic smoothing: changed {} detections on {} images'.format(
+         n_taxonomic_classification_changes,n_taxonomic_images_changed))
+
+
+     ## Write output
+
+     if output_file is not None:
+         print('Writing results after image-level smoothing to:\n{}'.format(output_file))
+         write_json(output_file,d)
+
+     return d
+
+ # ...def smooth_classification_results_image_level(...)
+
+
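A minimal usage sketch for image-level smoothing (file names are hypothetical):

    from megadetector.postprocessing.classification_postprocessing import \
        ClassificationSmoothingOptions, smooth_classification_results_image_level

    options = ClassificationSmoothingOptions()
    smoothed_results = smooth_classification_results_image_level(
        input_file='md-results.json',
        output_file='md-results-image-smoothed.json',
        options=options)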
+ #%% Sequence-level smoothing
+
+ def smooth_classification_results_sequence_level(input_file,
+                                                  cct_sequence_information,
+                                                  output_file=None,
+                                                  options=None):
+     """
+     Smooth classifications at the sequence level for all results in the MD-formatted results
+     file [input_file], optionally writing a new set of results to [output_file].
+
+     This function generally expresses the notion that a sequence that looks like
+     deer/deer/deer/elk/deer/deer/deer/deer is really just a deer.
+
+     Args:
+         input_file (str or dict): MegaDetector-formatted classification results file to smooth
+             (or already-loaded results). If you supply a dict, it's copied by default, but
+             in-place modification is supported via options.modify_in_place.
+         cct_sequence_information (str, dict, or list): COCO Camera Traps file containing sequence IDs for
+             each image (or an already-loaded CCT-formatted dict, or just the 'images' list from a CCT dict).
+         output_file (str, optional): .json file to write smoothed results
+         options (ClassificationSmoothingOptions, optional): see
+             ClassificationSmoothingOptions for details.
+
+     Returns:
+         dict: MegaDetector-results-formatted dict, identical to what's written to
+         [output_file] if [output_file] is not None.
+     """
+
+     ## Input validation
+
+     if options is None:
+         options = ClassificationSmoothingOptions()
+
+     r = _prepare_results_for_smoothing(input_file, options)
+     d = r['d']
+     other_category_ids = r['other_category_ids']
+     classification_descriptions_clean = r['classification_descriptions_clean']
+     classification_descriptions = r['classification_descriptions']
+
+
+     ## Make a list of images appearing in each sequence
+
+     if isinstance(cct_sequence_information,list):
+         image_info = cct_sequence_information
+     elif isinstance(cct_sequence_information,str):
+         print('Loading sequence information from {}'.format(cct_sequence_information))
+         with open(cct_sequence_information,'r') as f:
+             cct_sequence_information = json.load(f)
+         image_info = cct_sequence_information['images']
+     else:
+         assert isinstance(cct_sequence_information,dict)
+         image_info = cct_sequence_information['images']
+
+     sequence_to_image_filenames = defaultdict(list)
+
+     # im = image_info[0]
+     for im in tqdm(image_info):
+         sequence_to_image_filenames[im['seq_id']].append(im['file_name'])
+     del image_info
+
+     image_fn_to_classification_results = {}
+     for im in d['images']:
+         fn = im['file']
+         assert fn not in image_fn_to_classification_results
+         image_fn_to_classification_results[fn] = im
+
+
+     ## Smoothing
+
+     n_other_classifications_changed = 0
+     n_other_sequences_changed = 0
+     n_taxonomic_sequences_changed = 0
+     n_within_family_sequences_changed = 0
+
+     n_detections_flipped = 0
+     n_sequences_changed = 0
+     n_taxonomic_classification_changes = 0
+     n_within_family_changes = 0
+
+     # sequence_id = list(sequence_to_image_filenames.keys())[0]
+     for sequence_id in sequence_to_image_filenames.keys():
+
+         image_filenames_this_sequence = sequence_to_image_filenames[sequence_id]
+
+         # if 'file' in image_filenames_this_sequence:
+         #     from IPython import embed; embed()
+
+         detections_this_sequence = []
+         for image_filename in image_filenames_this_sequence:
+             if image_filename not in image_fn_to_classification_results:
+                 print('Warning: {} in sequence list but not in results'.format(
+                     image_filename))
+                 continue
+             im = image_fn_to_classification_results[image_filename]
+             if 'detections' not in im or im['detections'] is None:
+                 continue
+             detections_this_sequence.extend(im['detections'])
+
+             # Temporarily add image filenames to every detection,
+             # for debugging
+             for det in im['detections']:
+                 det['image_filename'] = im['file']
+
+         if len(detections_this_sequence) == 0:
+             continue
+
+         r = _smooth_classifications_for_list_of_detections(
+             detections=detections_this_sequence,
+             options=options,
+             other_category_ids=other_category_ids,
+             classification_descriptions=classification_descriptions,
+             classification_descriptions_clean=classification_descriptions_clean)
+
+         if r is None:
+             continue
+
+         n_detections_flipped_this_sequence = r['n_detections_flipped_this_image']
+         n_other_classifications_changed_this_sequence = \
+             r['n_other_classifications_changed_this_image']
+         n_taxonomic_changes_this_sequence = r['n_taxonomic_changes_this_image']
+         n_within_family_changes_this_sequence = r['n_within_family_smoothing_changes']
+
+         n_detections_flipped += n_detections_flipped_this_sequence
+         n_other_classifications_changed += n_other_classifications_changed_this_sequence
+         n_taxonomic_classification_changes += n_taxonomic_changes_this_sequence
+         n_within_family_changes += n_within_family_changes_this_sequence
+
+         if n_detections_flipped_this_sequence > 0:
+             n_sequences_changed += 1
+         if n_other_classifications_changed_this_sequence > 0:
+             n_other_sequences_changed += 1
+         if n_taxonomic_changes_this_sequence > 0:
+             n_taxonomic_sequences_changed += 1
+         if n_within_family_changes_this_sequence > 0:
+             n_within_family_sequences_changed += 1
+
+     # ...for each sequence
+
+     print('Classification smoothing: changed {} detections in {} sequences'.format(
+         n_detections_flipped,n_sequences_changed))
+
+     print('"Other" smoothing: changed {} detections in {} sequences'.format(
+         n_other_classifications_changed,n_other_sequences_changed))
+
+     print('Taxonomic smoothing: changed {} detections in {} sequences'.format(
+         n_taxonomic_classification_changes,n_taxonomic_sequences_changed))
+
+     print('Within-family smoothing: changed {} detections in {} sequences'.format(
+         n_within_family_changes,n_within_family_sequences_changed))
+
+
+     ## Clean up debug information
+
+     for im in d['images']:
+         if 'detections' not in im or im['detections'] is None:
+             continue
+         for det in im['detections']:
+             if 'image_filename' in det:
+                 del det['image_filename']
+
+
+     ## Write output
+
+     if output_file is not None:
+         print('Writing sequence-smoothed classification results to {}'.format(
+             output_file))
+         write_json(output_file,d)
+
+     return d
+
+ # ...def smooth_classification_results_sequence_level(...)
+
+
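A minimal usage sketch for sequence-level smoothing (file names are hypothetical); the CCT file supplies 'seq_id' and 'file_name' for each image:

    from megadetector.postprocessing.classification_postprocessing import \
        smooth_classification_results_sequence_level

    smoothed_results = smooth_classification_results_sequence_level(
        input_file='md-results.json',
        cct_sequence_information='sequences-cct.json',
        output_file='md-results-sequence-smoothed.json')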
+ def remove_classifications_from_non_animal_detections(input_file,
+                                                       output_file,
+                                                       animal_category_names=None):
+     """
+     Remove classifications from non-animal detections in an MD-formatted .json file,
+     optionally writing the results to a new .json file.
+
+     Args:
+         input_file (str): the MD-formatted .json file to process
+         output_file (str, optional): the output file to write the modified results
+         animal_category_names (list, optional): the detection category names that should
+             be treated as animals (defaults to just 'animal').
+
+     Returns:
+         dict: the modified results
+     """
+
+     if animal_category_names is None:
+         animal_category_names = ['animal']
+     animal_category_names = set(animal_category_names)
+
+     with open(input_file,'r') as f:
+         d = json.load(f)
+
+     category_id_to_name = d['detection_categories']
+
+     n_classifications_removed = 0
+     n_detections = 0
+
+     # im = d['images'][0]
+     for im in d['images']:
+
+         if ('detections' not in im) or (im['detections'] is None):
+             continue
+
+         n_detections += len(im['detections'])
+
+         for det in im['detections']:
+
+             if 'classifications' not in det:
+                 continue
+             category_id = det['category']
+             category_name = category_id_to_name[category_id]
+             if category_name not in animal_category_names:
+                 del det['classifications']
+                 n_classifications_removed += 1
+                 continue
+
+         # ...for each detection
+
+     # ...for each image
+
+     print('Removed classifications from {} of {} detections'.format(
+         n_classifications_removed,n_detections))
+
+     if output_file is not None:
+         write_json(output_file,d)
+
+     return d
+
+ # ...def remove_classifications_from_non_animal_detections(...)
+
+
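A minimal usage sketch (file names are hypothetical); this strips classifier results from, e.g., 'person' and 'vehicle' detections:

    from megadetector.postprocessing.classification_postprocessing import \
        remove_classifications_from_non_animal_detections

    results = remove_classifications_from_non_animal_detections(
        input_file='md-results.json',
        output_file='md-results-animal-classifications-only.json',
        animal_category_names=['animal'])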
1165
+ def restrict_to_taxa_list(taxa_list,
1166
+ speciesnet_taxonomy_file,
1167
+ input_file,
1168
+ output_file,
1169
+ allow_walk_down=False,
1170
+ add_pre_filtering_description=True,
1171
+ allow_redundant_latin_names=True,
1172
+ protected_common_names=None,
1173
+ use_original_common_names_if_available=True,
1174
+ verbose=True):
1175
+ """
1176
+ Given a prediction file in MD .json format, likely without having had
1177
+ a geofence applied, apply a custom taxa list.
1178
+
1179
+ Args:
1180
+ taxa_list (str): .csv file with at least the columns "latin" and "common".
1181
+ speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for
1182
+ model release (with 7-token taxonomy entries)
1183
+ input_file (str): .json file to read, in MD format. This can be None, in which
1184
+ case this function just validates [taxa_list].
1185
+ output_file (str): .json file to write, in MD format
1186
+ allow_walk_down (bool, optional): should we walk down the taxonomy tree
1187
+ when making mappings if a parent has only a single allowable child?
1188
+ For example, if only a single felid species is allowed, should other
1189
+ felid predictions be mapped to that species, as opposed to being mapped
1190
+ to the family?
1191
+ add_pre_filtering_description (bool, optional): should we add a new metadata
1192
+ field that summarizes each image's classifications prior to taxonomic
1193
+ restriction?
1194
+ allow_redundant_latin_names (bool, optional): if False, we'll raise an Exception
1195
+ if the same latin name appears twice in the taxonomy list; if True, we'll
1196
+ just print a warning and ignore all entries other than the first for this
1197
+ latin name
1198
+ protected_common_names (list, optional): these categories should be
1199
+ unmodified, even if they aren't used, or have the same taxonomic
1200
+ description as other categories
1201
+ use_original_common_names_if_available (bool, optional): if an "original_common"
1202
+ column is present in [taxa_list], use those common names instead of the ones
1203
+ in the taxonomy file
1204
+ verbose (bool, optional): enable additional debug output
1205
+ """
+
+     ##%% Read target taxa list
+
+     taxa_list_df = pd.read_csv(taxa_list)
+
+     required_columns = ('latin','common')
+     for s in required_columns:
+         assert s in taxa_list_df.columns, \
+             'Required column {} missing from taxonomy list file {}'.format(
+                 s,taxa_list)
+
+     # Convert the "latin" and "common" columns in taxa_list_df to lowercase
+     taxa_list_df['latin'] = taxa_list_df['latin'].str.lower()
+     taxa_list_df['common'] = taxa_list_df['common'].str.lower()
+
+     # Remove rows from taxa_list_df where the "latin" column is NaN,
+     # printing a warning for each row (with a string representation of the whole row)
+     for i_row,row in taxa_list_df.iterrows():
+         if pd.isna(row['latin']):
+             if verbose:
+                 print('Warning: Skipping row with empty "latin" column in {}:\n{}\n'.format(
+                     taxa_list,str(row.to_dict())))
+             taxa_list_df.drop(index=i_row, inplace=True)
+
+     # Convert all NaN values in the "common" column to empty strings
+     taxa_list_df['common'] = taxa_list_df['common'].fillna('')
+
+     # Create a dictionary mapping source Latin names to target common names
+     target_latin_to_common = {}
+
+     for i_row,row in taxa_list_df.iterrows():
+
+         latin = row['latin']
+         common = row['common']
+
+         if use_original_common_names_if_available and \
+            ('original_common' in row) and \
+            (not is_empty(row['original_common'])):
+             common = row['original_common'].strip().lower()
+
+         # Valid latin names have either one token (e.g. "canidae"),
+         # two tokens (e.g. "bos taurus"), or three tokens (e.g. "canis lupus familiaris")
+         assert len(latin.split(' ')) in (1,2,3), \
+             'Illegal scientific name {} in taxonomy list {}'.format(
+                 latin,taxa_list)
+
+         if latin in target_latin_to_common:
+             error_string = \
+                 'scientific name {} appears multiple times in the taxonomy list'.format(
+                     latin)
+             if allow_redundant_latin_names:
+                 if verbose:
+                     print('Warning: {}'.format(error_string))
+                 # Ignore everything but the first entry for this latin name
+                 continue
+             else:
+                 raise ValueError(error_string)
+
+         target_latin_to_common[latin] = common
+
+     # ...for each row in the custom taxonomy list
+
+
+     ##%% Read taxonomy file
+
+     with open(speciesnet_taxonomy_file,'r') as f:
+         speciesnet_taxonomy_list = f.readlines()
+     speciesnet_taxonomy_list = [s.strip() for s in \
+                                 speciesnet_taxonomy_list if len(s.strip()) > 0]
+
+     # Maps the latin name of every taxon to the corresponding full taxon string
+     #
+     # For species, the key is a binomial name
+     speciesnet_latin_name_to_taxon_string = {}
+     speciesnet_common_name_to_taxon_string = {}
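+
+     # Each taxonomy entry is a 7-token, semicolon-delimited string of the form
+     # guid;class;order;family;genus;species;common_name, e.g. (hypothetical GUID
+     # and common name, illustrative only):
+     #
+     #     00000000-0000-0000-0000-000000000000;mammalia;carnivora;felidae;puma;concolor;puma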
+
+     def _insert_taxonomy_string(s):
+
+         tokens = s.split(';')
+         assert len(tokens) == 7, 'Illegal taxonomy string {}'.format(s)
+
+         guid = tokens[0] # noqa
+         class_name = tokens[1]
+         order = tokens[2]
+         family = tokens[3]
+         genus = tokens[4]
+         species = tokens[5]
+         common_name = tokens[6]
+
+         if len(class_name) == 0:
+             assert common_name in ('animal','vehicle','blank'), \
+                 'Illegal common name {}'.format(common_name)
+             return
+
+         if len(species) > 0:
+             assert all([len(s) > 0 for s in [genus,family,order]]), \
+                 'Higher-level taxa missing for {}: {},{},{}'.format(s,genus,family,order)
+             binomial_name = genus + ' ' + species
+             if binomial_name not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[binomial_name] = s
+         elif len(genus) > 0:
+             assert all([len(s) > 0 for s in [family,order]]), \
+                 'Higher-level taxa missing for {}: {},{}'.format(s,family,order)
+             if genus not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[genus] = s
+         elif len(family) > 0:
+             assert len(order) > 0, \
+                 'Higher-level taxa missing for {}: {}'.format(s,order)
+             if family not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[family] = s
+         elif len(order) > 0:
+             if order not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[order] = s
+         else:
+             if class_name not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[class_name] = s
+
+         if len(common_name) > 0:
+             if common_name not in speciesnet_common_name_to_taxon_string:
+                 speciesnet_common_name_to_taxon_string[common_name] = s
+
+     for s in speciesnet_taxonomy_list:
+
+         _insert_taxonomy_string(s)
+
+
+     ##%% Make sure all parent taxa are represented in the taxonomy
+
+     # In theory any taxon that appears as the parent of another taxon should
+     # also be in the taxonomy, but this isn't always true, so we fix it here.
+     new_taxon_string_to_missing_tokens = defaultdict(list)
+
+     # While we're making this loop, also see whether we need to store any custom
+     # common name mappings based on the taxonomy list.
+     speciesnet_latin_name_to_output_common_name = {}
+
+     # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
+     for latin_name in speciesnet_latin_name_to_taxon_string.keys():
+
+         if latin_name in target_latin_to_common:
+             speciesnet_latin_name_to_output_common_name[latin_name] = \
+                 target_latin_to_common[latin_name]
+
+         if 'no cv result' in latin_name:
+             continue
+
+         taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+         tokens = taxon_string.split(';')
+
+         # Don't process GUID, species, or common name
+         # i_token = 6
+         for i_token in range(1,len(tokens)-2):
+
+             test_token = tokens[i_token]
+             if len(test_token) == 0:
+                 continue
+
+             # Do we need to make up a taxon for this token?
+             if test_token not in speciesnet_latin_name_to_taxon_string:
+
+                 new_tokens = [''] * 7
+                 new_tokens[0] = 'fake_guid'
+                 for i_copy_token in range(1,i_token+1):
+                     new_tokens[i_copy_token] = tokens[i_copy_token]
+                 new_tokens[-1] = test_token + ' species'
+                 assert new_tokens[-2] == '', \
+                     'Illegal taxonomy string {}'.format(taxon_string)
+                 new_taxon_string = ';'.join(new_tokens)
+                 # assert new_taxon_string not in new_taxon_strings
+                 new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
+
+         # ...for each token
+
+     # ...for each taxon
+
+     new_taxon_string_to_missing_tokens = \
+         sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
+
+     if verbose:
+
+         print(f'Found {len(new_taxon_string_to_missing_tokens)} taxa that need to be inserted to ' + \
+               'make the taxonomy valid, showing only mammals and birds here:\n')
+
+         for taxon_string in new_taxon_string_to_missing_tokens:
+             if 'mammalia' not in taxon_string and 'aves' not in taxon_string:
+                 continue
+             missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
+             print('{} ({})'.format(taxon_string,missing_taxa))
+
+     for new_taxon_string in new_taxon_string_to_missing_tokens:
+         _insert_taxonomy_string(new_taxon_string)
+
+
+     ##%% Make sure all taxa on the allow-list are in the taxonomy
+
+     n_failed_mappings = 0
+
+     for target_taxon_latin_name in target_latin_to_common.keys():
+         if target_taxon_latin_name not in speciesnet_latin_name_to_taxon_string:
+             common_name = target_latin_to_common[target_taxon_latin_name]
+             s = '{} ({}) not in speciesnet taxonomy'.format(
+                 target_taxon_latin_name,common_name)
+             if common_name in speciesnet_common_name_to_taxon_string:
+                 s += ' (common name maps to {})'.format(
+                     speciesnet_common_name_to_taxon_string[common_name])
+             print(s)
+             n_failed_mappings += 1
+
+     if n_failed_mappings > 0:
+         raise ValueError('Cannot continue with taxonomic restriction')
+
+
+     ##%% For the allow-list, map each parent taxon to a set of allowable child taxa
+
+     # Maps parent names to all allowed child names, or None if this is the
+     # lowest-level allowable taxon on this path
+     allowed_parent_taxon_to_child_taxa = defaultdict(set)
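+
+     # For example (hypothetical allow-list): if the only allowed taxa were
+     # "puma concolor" and "lynx rufus", this dict would end up as:
+     #
+     #     {'carnivora': {'felidae'},
+     #      'felidae': {'lynx', 'puma'},
+     #      'lynx': {'lynx rufus'},
+     #      'lynx rufus': {None},
+     #      'mammalia': {'carnivora'},
+     #      'puma': {'puma concolor'},
+     #      'puma concolor': {None}}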
+
+     # latin_name = next(iter(target_latin_to_common.keys()))
+     for latin_name in target_latin_to_common:
+
+         taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+         tokens = taxon_string.split(';')
+         assert len(tokens) == 7, \
+             'Illegal taxonomy string {}'.format(taxon_string)
+
+         # Remove GUID and common name
+         #
+         # This is now always class/order/family/genus/species
+         tokens = tokens[1:-1]
+
+         child_taxon = None
+
+         # If this is a species
+         if len(tokens[-1]) > 0:
+             binomial_name = tokens[-2] + ' ' + tokens[-1]
+             assert binomial_name == latin_name, \
+                 'Binomial/latin mismatch: {} vs {}'.format(binomial_name,latin_name)
+             # If this already exists, it should only allow "None"
+             if binomial_name in allowed_parent_taxon_to_child_taxa:
+                 assert len(allowed_parent_taxon_to_child_taxa[binomial_name]) == 1, \
+                     'Species-level entry {} has multiple children'.format(binomial_name)
+                 assert None in allowed_parent_taxon_to_child_taxa[binomial_name], \
+                     'Species-level entry {} has non-None children'.format(binomial_name)
+             allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
+             child_taxon = binomial_name
+
+         # The first level that can ever be a parent taxon is the genus level
+         parent_token_index = len(tokens) - 2
+
+         # Walk up from genus to class
+         while(parent_token_index >= 0):
+
+             # "None" is our leaf node marker, we should never have ''
+             if child_taxon is not None:
+                 assert len(child_taxon) > 0
+
+             parent_taxon = tokens[parent_token_index]
+
+             # Don't create entries for blank taxa
+             if (len(parent_taxon) > 0):
+
+                 create_child = True
+
+                 # This is the lowest-level taxon in this entry
+                 if (child_taxon is None):
+
+                     # ...but we don't want to remove existing children from any parents
+                     if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                        (len(allowed_parent_taxon_to_child_taxa[parent_taxon]) > 0):
+                         if verbose:
+                             existing_children_string = str(allowed_parent_taxon_to_child_taxa[parent_taxon])
+                             print('Not creating empty child for parent {} (already has children {})'.format(
+                                 parent_taxon,existing_children_string))
+                         create_child = False
+
+                 # If we're adding a new child entry, clear out any leaf node markers
+                 else:
+
+                     if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                        (None in allowed_parent_taxon_to_child_taxa[parent_taxon]):
+
+                         assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                             'Illegal parent/child configuration'
+
+                         if verbose:
+                             print('Un-marking parent {} as a leaf node because of child {}'.format(
+                                 parent_taxon,child_taxon))
+
+                         allowed_parent_taxon_to_child_taxa[parent_taxon] = set()
+
+                 if create_child:
+                     allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
+
+                 # Only update "child_taxon" once we've hit a non-empty taxon
+                 assert len(parent_taxon) > 0
+                 child_taxon = parent_taxon
+
+             # ...if we have a non-empty taxon
+
+             parent_token_index -= 1
+
+         # ...for each taxonomic level
+
+     # ...for each allowed latin name
+
+     allowed_parent_taxon_to_child_taxa = \
+         sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
+
+     for parent_taxon in allowed_parent_taxon_to_child_taxa:
+         # "None" should only ever appear alone; this marks a leaf node with no children
+         if None in allowed_parent_taxon_to_child_taxa[parent_taxon]:
+             assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                 '"None" should only appear alone in a child taxon list'
+
+
+     ##%% If we were just validating the custom taxa file, we're done
+
+     if input_file is None:
+         print('Finished validating custom taxonomy list')
+         return
+
+
+     ##%% Map all predictions that exist in this dataset...
+
+     # ...to the prediction we should generate.
+
+     with open(input_file,'r') as f:
+         input_data = json.load(f)
+
+     input_category_id_to_common_name = input_data['classification_categories'] #noqa
+     input_category_id_to_taxonomy_string = \
+         input_data['classification_category_descriptions']
+
+     input_category_id_to_output_taxon_string = {}
+
+     # input_category_id = next(iter(input_category_id_to_taxonomy_string.keys()))
+     for input_category_id in input_category_id_to_taxonomy_string.keys():
+
+         input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
+         input_taxon_tokens = input_taxon_string.split(';')
+         assert len(input_taxon_tokens) == 7, \
+             'Illegal taxonomy string: {}'.format(input_taxon_string)
+
+         # Don't mess with blank/no-cv-result/human (or "animal", which is really "unknown")
+         if (not is_taxonomic_prediction_string(input_taxon_string)) or \
+            (input_taxon_string == human_prediction_string):
+             if verbose:
+                 print('Not messing with non-taxonomic category {}'.format(input_taxon_string))
+             input_category_id_to_output_taxon_string[input_category_id] = \
+                 input_taxon_string
+             continue
+
+         # Don't mess with protected categories
+         common_name = input_taxon_tokens[-1]
+
+         if (protected_common_names is not None) and \
+            (common_name in protected_common_names):
+             if verbose:
+                 print('Not messing with protected category {}:\n{}'.format(
+                     common_name,input_taxon_string))
+             input_category_id_to_output_taxon_string[input_category_id] = \
+                 input_taxon_string
+             continue
+
+         # Remove GUID and common name
+
+         # This is always class/order/family/genus/species
+         input_taxon_tokens = input_taxon_tokens[1:-1]
+         assert len(input_taxon_tokens) == 5
+
+         # Start at the species level (the last element in input_taxon_tokens),
+         # and see whether each taxon is allowed
+         test_index = len(input_taxon_tokens) - 1
+         target_taxon = None
+
+         while((test_index >= 0) and (target_taxon is None)):
+
+             # Species are represented as binomial names, i.e. when test_index is 4,
+             # test_taxon_name will have two tokens (e.g. "canis lupus"), otherwise
+             # test_taxon_name will have one token (e.g. "canis", or "aves")
+             if (test_index == (len(input_taxon_tokens) - 1)) and \
+                (len(input_taxon_tokens[-1]) > 0):
+                 test_taxon_name = \
+                     input_taxon_tokens[-2] + ' ' + input_taxon_tokens[-1]
+             else:
+                 test_taxon_name = input_taxon_tokens[test_index]
+
+             # If we haven't yet found the level at which this taxon is non-empty,
+             # keep going up
+             if len(test_taxon_name) == 0:
+                 test_index -= 1
+                 continue
+
+             assert test_taxon_name in speciesnet_latin_name_to_taxon_string, \
+                 '{} not found in taxonomy table'.format(test_taxon_name)
+
+             # Is this taxon allowed according to the custom species list?
+             if test_taxon_name in allowed_parent_taxon_to_child_taxa:
+
+                 allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
+                 assert allowed_child_taxa is not None, \
+                     'allowed_child_taxa should not be None: {}'.format(test_taxon_name)
+
+                 # If this is the lowest-level allowable token or there is not a
+                 # unique child, don't walk any further, even if walking down
+                 # is enabled.
+                 if None in allowed_child_taxa:
+                     assert len(allowed_child_taxa) == 1, \
+                         '"None" should not be listed as a child taxon alongside other child taxa'
+
+                 if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
+                     target_taxon = test_taxon_name
+                 elif not allow_walk_down:
+                     target_taxon = test_taxon_name
+                 else:
+                     # If there's a unique child, walk back *down* the allowable
+                     # taxa until we run out of unique children
+                     while ((next(iter(allowed_child_taxa)) is not None) and \
+                            (len(allowed_child_taxa) == 1)):
+                         candidate_taxon = next(iter(allowed_child_taxa))
+                         assert candidate_taxon in allowed_parent_taxon_to_child_taxa, \
+                             '{} should be in {}'.format(
+                                 candidate_taxon,allowed_parent_taxon_to_child_taxa)
+                         assert candidate_taxon in speciesnet_latin_name_to_taxon_string, \
+                             '{} should be in {}'.format(
+                                 candidate_taxon,speciesnet_latin_name_to_taxon_string)
+                         allowed_child_taxa = \
+                             allowed_parent_taxon_to_child_taxa[candidate_taxon]
+                         target_taxon = candidate_taxon
+
+             # ...if this is an allowed taxon
+
+             test_index -= 1
+
+         # ...for each token
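+
+         # Illustration of the walk-down behavior (hypothetical allow-list): if the
+         # only allowed felid is "puma concolor" and the input prediction is "felidae",
+         # then with allow_walk_down=True the prediction maps down to "puma concolor";
+         # with allow_walk_down=False it stays at "felidae".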
+
+         if target_taxon is None:
+             output_taxon_string = animal_prediction_string
+         else:
+             output_taxon_string = speciesnet_latin_name_to_taxon_string[target_taxon]
+         input_category_id_to_output_taxon_string[input_category_id] = output_taxon_string
+
+     # ...for each category (mapping input category IDs to output taxon strings)
+
+
+     ##%% Map input category IDs to output category IDs
+
+     speciesnet_taxon_string_to_latin_name = \
+         invert_dictionary(speciesnet_latin_name_to_taxon_string)
+
+     input_category_id_to_output_category_id = {}
+     output_taxon_string_to_category_id = {}
+     output_category_id_to_common_name = {}
+
+     for input_category_id in input_category_id_to_output_taxon_string:
+
+         output_taxon_string = \
+             input_category_id_to_output_taxon_string[input_category_id]
+
+         output_common_name = output_taxon_string.split(';')[-1]
+
+         # Possibly substitute a custom common name
+         if output_taxon_string in speciesnet_taxon_string_to_latin_name:
+
+             speciesnet_latin_name = speciesnet_taxon_string_to_latin_name[output_taxon_string]
+
+             if speciesnet_latin_name in speciesnet_latin_name_to_output_common_name:
+                 custom_common_name = speciesnet_latin_name_to_output_common_name[speciesnet_latin_name]
+                 if custom_common_name != output_common_name:
+                     if verbose:
+                         print('Substituting common name {} for {}'.format(custom_common_name,output_common_name))
+                     output_common_name = custom_common_name
+
+         # Do we need to create a new output category?
+         if output_taxon_string not in output_taxon_string_to_category_id:
+             output_category_id = str(len(output_taxon_string_to_category_id))
+             output_taxon_string_to_category_id[output_taxon_string] = \
+                 output_category_id
+             output_category_id_to_common_name[output_category_id] = \
+                 output_common_name
+         else:
+             output_category_id = \
+                 output_taxon_string_to_category_id[output_taxon_string]
+
+         input_category_id_to_output_category_id[input_category_id] = \
+             output_category_id
+
+         # Sometimes-useful debug printouts
+         if False:
+             original_common_name = \
+                 input_category_id_to_common_name[input_category_id]
+             original_taxon_string = \
+                 input_category_id_to_taxonomy_string[input_category_id]
+             print('Mapping {} ({}) to:\n{} ({})\n'.format(
+                 original_common_name,original_taxon_string,
+                 output_common_name,output_taxon_string))
+
+     # ...for each category (mapping input category IDs to output category IDs)
+
+
+     ##%% Remap all category labels
+
+     assert len(set(output_taxon_string_to_category_id.keys())) == \
+            len(set(output_taxon_string_to_category_id.values())), \
+            'Category ID/value non-uniqueness error'
+
+     output_category_id_to_taxon_string = \
+         invert_dictionary(output_taxon_string_to_category_id)
+
+     with open(input_file,'r') as f:
+         output_data = json.load(f)
+
+     classification_descriptions = None
+     if 'classification_category_descriptions' in output_data:
+         classification_descriptions = output_data['classification_category_descriptions']
+
+     for im in tqdm(output_data['images']):
+
+         if 'detections' not in im or im['detections'] is None:
+             continue
+
+         # Possibly prepare a pre-filtering description
+         pre_filtering_description = None
+         if classification_descriptions is not None and add_pre_filtering_description:
+             category_to_count = count_detections_by_classification_category(im['detections'])
+             pre_filtering_description = \
+                 get_classification_description_string(category_to_count,classification_descriptions)
+             im['pre_filtering_description'] = pre_filtering_description
+
+         for det in im['detections']:
+             if 'classifications' in det:
+                 for classification in det['classifications']:
+                     classification[0] = \
+                         input_category_id_to_output_category_id[classification[0]]
+
+     # ...for each image
+
+     output_data['classification_categories'] = output_category_id_to_common_name
+     output_data['classification_category_descriptions'] = \
+         output_category_id_to_taxon_string
+
+
+     ##%% Write output
+
+     write_json(output_file,output_data)
+
+ # ...def restrict_to_taxa_list(...)
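+
+ # A minimal usage sketch for restrict_to_taxa_list (hypothetical filenames;
+ # illustrative only):
+ #
+ # restrict_to_taxa_list(taxa_list='custom_taxa.csv',
+ #                       speciesnet_taxonomy_file='taxonomy_release.txt',
+ #                       input_file='predictions.json',
+ #                       output_file='predictions_restricted.json',
+ #                       allow_walk_down=False)
+ #
+ # Passing input_file=None just validates 'custom_taxa.csv' against the taxonomy.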