megadetector-10.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
megadetector/postprocessing/classification_postprocessing.py
@@ -0,0 +1,1943 @@
"""

classification_postprocessing.py

Functions for postprocessing species classification results, particularly:

* Smoothing results within an image (an image with 700 cows and one deer is really just 701
  cows)
* Smoothing results within a sequence (a sequence that looks like deer/deer/deer/elk/deer/deer
  is really just a deer)

"""

#%% Constants and imports

import os
import json
import copy
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

from megadetector.utils.ct_utils import is_list_sorted
from megadetector.utils.ct_utils import is_empty
from megadetector.utils.ct_utils import sort_dictionary_by_value
from megadetector.utils.ct_utils import sort_dictionary_by_key
from megadetector.utils.ct_utils import invert_dictionary
from megadetector.utils.ct_utils import write_json

from megadetector.utils.wi_taxonomy_utils import clean_taxonomy_string
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_index
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_string_to_index

from megadetector.utils.wi_taxonomy_utils import human_prediction_string
from megadetector.utils.wi_taxonomy_utils import animal_prediction_string
from megadetector.utils.wi_taxonomy_utils import is_taxonomic_prediction_string
from megadetector.utils.wi_taxonomy_utils import blank_prediction_string # noqa

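# Editor's illustration (not part of the released file): a minimal sketch of the
# MD-formatted results dict that the functions in this module operate on. Field
# names follow the MegaDetector batch output format used throughout this module;
# all IDs, file names, and confidence values here are hypothetical.

_example_md_results = {
    'detection_categories': {'1': 'animal', '2': 'person', '3': 'vehicle'},
    'classification_categories': {'0': 'deer', '1': 'cow', '2': 'other'},
    'images': [
        {
            'file': 'camera01/image0001.jpg',
            'detections': [
                # Each classification is a [category_id, confidence] pair; the
                # list is sorted in descending order by confidence
                {'category': '1',
                 'conf': 0.95,
                 'bbox': [0.1, 0.1, 0.3, 0.4],
                 'classifications': [['1', 0.90], ['0', 0.05]]}
            ]
        }
    ]
}
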
#%% Options classes

class ClassificationSmoothingOptions:
    """
    Options used to parameterize smooth_classification_results_image_level()
    and smooth_classification_results_sequence_level()
    """

    def __init__(self):

        #: How many detections do we need in a dominant category to overwrite
        #: non-dominant classifications? This is irrelevant if
        #: max_detections_nondominant_class <= 1.
        self.min_detections_to_overwrite_secondary = 4

        #: Even if we have a dominant class, if a non-dominant class has at least
        #: this many classifications in an image, leave them alone.
        #:
        #: If this is <= 1, we won't replace non-dominant, non-other classes
        #: with the dominant class, even if there are 900 cows and 1 deer.
        self.max_detections_nondominant_class = 1

        #: How many detections do we need in a dominant category to overwrite
        #: non-dominant classifications in the same family? If this is <= 0,
        #: we'll skip this step. This option doesn't mean anything if
        #: max_detections_nondominant_class_same_family <= 1.
        self.min_detections_to_overwrite_secondary_same_family = 2

        #: If we have this many classifications of a nondominant category,
        #: we won't do same-family overwrites. <= 1 means "even if there are
        #: a million deer, if there are two million moose, call all the deer
        #: moose". This option doesn't mean anything if
        #: min_detections_to_overwrite_secondary_same_family <= 0.
        self.max_detections_nondominant_class_same_family = -1

        #: If the dominant class has at least this many classifications, overwrite
        #: "other" classifications with the dominant class
        self.min_detections_to_overwrite_other = 2

        #: Names to treat as "other" categories; can't be None, but can be empty
        #:
        #: "Other" classifications will be changed to the dominant category, regardless
        #: of confidence, as long as there are at least min_detections_to_overwrite_other
        #: examples of the dominant class. For example, cow/other will remain unchanged,
        #: but cow/cow/other will become cow/cow/cow.
        self.other_category_names = ['other','unknown','no cv result','animal','blank','mammal']

        #: We're not even going to mess around with classifications below this threshold.
        #:
        #: We won't count them, we won't over-write them, they don't exist during the
        #: within-image smoothing step.
        self.classification_confidence_threshold = 0.5

        #: We're not even going to mess around with detections below this threshold.
        #:
        #: We won't count them, we won't over-write them, they don't exist during the
        #: within-image smoothing step.
        self.detection_confidence_threshold = 0.15

        #: If classification descriptions are present and appear to represent taxonomic
        #: information, should we propagate classifications when lower-level taxa are more
        #: common in an image? For example, if we see "carnivore/fox/fox/deer", should
        #: we make that "fox/fox/fox/deer"?
        self.propagate_classifications_through_taxonomy = True

        #: When propagating classifications down through taxonomy levels, we have to
        #: decide whether we prefer more frequent categories or more specific categories.
        #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
        #: balance levels against counts in this process.
        self.taxonomy_propagation_level_weight = 1.0

        #: When propagating classifications down through taxonomy levels, we have to
        #: decide whether we prefer more frequent categories or more specific categories.
        #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
        #: balance levels against counts in this process.
        #:
        #: With a very low default value, this just breaks ties.
        self.taxonomy_propagation_count_weight = 0.01

        #: Should we record information about the state of labels prior to smoothing?
        self.add_pre_smoothing_description = True

        #: When a dict (rather than a file) is passed to either smoothing function,
        #: if this is False, we'll make a copy of the input dict before modifying;
        #: if True, we'll modify the input dict in place.
        self.modify_in_place = False

        #: Only include these detection categories in the smoothing process (None to
        #: use all categories)
        self.detection_category_names_to_smooth = ['animal']

        #: Debug options
        self.break_at_image = None

        ## Populated internally

        #: Detection category IDs corresponding to detection_category_names_to_smooth;
        #: populated during smoothing.
        self._detection_category_ids_to_smooth = None

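# Editor's illustration (not part of the released file): a typical way to
# parameterize smoothing. The specific values below are hypothetical; see the
# attribute comments above for the semantics of each option.

_example_options = ClassificationSmoothingOptions()

# Require only three detections in the dominant category before overwriting...
_example_options.min_detections_to_overwrite_secondary = 3

# ...and tolerate up to two detections in a non-dominant category
_example_options.max_detections_nondominant_class = 2

# Ignore classifications below 0.6 confidence entirely
_example_options.classification_confidence_threshold = 0.6
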
#%% Utility functions

def _results_for_sequence(images_this_sequence,filename_to_results):
    """
    Fetch MD results for every image in this sequence, based on the 'file_name' field
    """

    results_this_sequence = []
    for im in images_this_sequence:
        fn = im['file_name']
        results_this_image = filename_to_results[fn]
        assert isinstance(results_this_image,dict)
        results_this_sequence.append(results_this_image)

    return results_this_sequence


def _sort_images_by_time(images):
    """
    Returns a copy of [images], sorted by the 'datetime' field (ascending).
    """
    return sorted(images, key = lambda im: im['datetime'])


def _detection_is_relevant_for_smoothing(det,options):
    """
    Determine whether [det] has classifications that might be meaningful for smoothing.
    """

    if ('classifications' not in det) or \
       (det['conf'] < options.detection_confidence_threshold):
        return False

    # Ignore non-smoothed categories
    if (options._detection_category_ids_to_smooth is not None) and \
       (det['category'] not in options._detection_category_ids_to_smooth):
        return False

    return True

def count_detections_by_classification_category(detections,options=None):
    """
    Count the number of instances of each classification category in the detections list
    [detections] that have an above-threshold detection. Sort results in descending
    order by count. Returns a dict mapping category ID --> count. If no detections
    are above threshold, returns an empty dict.

    Only processes the top classification for each detection.

    Args:
        detections (list of dict): detections list
        options (ClassificationSmoothingOptions, optional): see ClassificationSmoothingOptions

    Returns:
        dict mapping above-threshold category IDs to counts
    """

    if detections is None or len(detections) == 0:
        return {}

    if options is None:
        options = ClassificationSmoothingOptions()

    category_to_count = defaultdict(int)

    for det in detections:

        if not _detection_is_relevant_for_smoothing(det,options):
            continue

        c = det['classifications'][0]
        if c[1] >= options.classification_confidence_threshold:
            category_to_count[c[0]] += 1

    category_to_count = {k: v for k, v in sorted(category_to_count.items(),
                                                 key=lambda item: item[1],
                                                 reverse=True)}

    return category_to_count

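# Editor's illustration (not part of the released file): counting top
# classifications for a synthetic detections list; category IDs and confidence
# values are hypothetical. With default options (detection threshold 0.15,
# classification threshold 0.5), this returns {'0': 2, '1': 1}.

_example_detections = [
    {'category': '1', 'conf': 0.90, 'classifications': [['0', 0.80]]},
    {'category': '1', 'conf': 0.85, 'classifications': [['0', 0.70]]},
    {'category': '1', 'conf': 0.90, 'classifications': [['1', 0.60]]},
]
assert count_detections_by_classification_category(_example_detections) == \
    {'0': 2, '1': 1}
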
def get_classification_description_string(category_to_count,classification_descriptions):
    """
    Return a string summarizing the image content according to [category_to_count].

    Args:
        category_to_count (dict): a dict mapping category IDs to counts
        classification_descriptions (dict): a dict mapping category IDs to description strings

    Returns:
        string: a description of this image's content, e.g. "rabbit (4), human (1)"
    """

    category_strings = []
    # category_id = next(iter(category_to_count))
    for category_id in category_to_count:
        category_description = classification_descriptions[category_id]
        tokens = category_description.split(';')
        assert len(tokens) == 7
        category_name = tokens[-1]
        if len(category_name) == 0:
            category_name = 'undefined category'
        count = category_to_count[category_id]
        category_string = '{} ({})'.format(category_name,count)
        category_strings.append(category_string)

    return ', '.join(category_strings)

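# Editor's illustration (not part of the released file): formatting a count dict
# using hypothetical 7-token descriptions. The token layout assumed here is
# GUID;class;order;family;genus;species;common name, consistent with the
# indexing used elsewhere in this module (family at index 3, genus at index 4,
# common name last).

_example_descriptions = {
    '0': 'guid0;mammalia;cetartiodactyla;cervidae;odocoileus;virginianus;white-tailed deer',
    '1': 'guid1;mammalia;cetartiodactyla;bovidae;bos;taurus;domestic cattle'
}
assert get_classification_description_string({'0': 2, '1': 1}, _example_descriptions) == \
    'white-tailed deer (2), domestic cattle (1)'
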
def _print_counts_with_names(category_to_count,classification_descriptions):
    """
    Print a list of classification categories with counts, based on the
    category ID --> count dict [category_to_count]
    """

    for category_id in category_to_count:
        category_name = classification_descriptions[category_id]
        count = category_to_count[category_id]
        print('{}: {} ({})'.format(category_id,category_name,count))

def _prepare_results_for_smoothing(input_file,options):
    """
    Load results from [input_file] if necessary, and prepare category descriptions
    for smoothing. Adds pre-smoothing descriptions to every image if the options
    say we're supposed to do that.

    May modify some fields in [options].
    """

    if isinstance(input_file,str):
        with open(input_file,'r') as f:
            print('Loading results from:\n{}'.format(input_file))
            d = json.load(f)
    else:
        assert isinstance(input_file,dict)
        if options.modify_in_place:
            d = input_file
        else:
            print('modify_in_place is False, copying the input before modifying')
            d = copy.deepcopy(input_file)


    ## Category processing

    category_name_to_id = {d['classification_categories'][k]:k for k in d['classification_categories']}
    other_category_ids = []
    for s in options.other_category_names:
        if s in category_name_to_id:
            other_category_ids.append(category_name_to_id[s])

    # Possibly update the list of category IDs we should smooth
    if options.detection_category_names_to_smooth is None:
        options._detection_category_ids_to_smooth = None
    else:
        detection_category_id_to_name = d['detection_categories']
        detection_category_name_to_id = invert_dictionary(detection_category_id_to_name)
        options._detection_category_ids_to_smooth = []
        for category_name in options.detection_category_names_to_smooth:
            options._detection_category_ids_to_smooth.append(detection_category_name_to_id[category_name])

    # Before we do anything else, get rid of everything but the top classification
    # for each detection, and remove the 'classifications' field from detections with
    # no classifications.
    for im in tqdm(d['images']):

        if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
            continue

        detections = im['detections']

        for det in detections:

            if 'classifications' not in det:
                continue
            if len(det['classifications']) == 0:
                del det['classifications']
                continue

            classification_confidence_values = [c[1] for c in det['classifications']]
            assert is_list_sorted(classification_confidence_values,reverse=True)
            det['classifications'] = [det['classifications'][0]]

        # ...for each detection in this image

    # ...for each image


    ## Clean up classification descriptions...

    # ...so we can test taxonomic relationships by substring testing.

    classification_descriptions_clean = None
    classification_descriptions = None

    if 'classification_category_descriptions' in d:
        classification_descriptions = d['classification_category_descriptions']
        classification_descriptions_single = {}
        # We use "|" to delimit multiple descriptions; just use the first
        # for smoothing. This isn't perfect or "correct", but it's reasonable.
        for k in classification_descriptions.keys():
            v = classification_descriptions[k]
            v = v.split('|')[0]
            classification_descriptions_single[k] = v
        classification_descriptions = classification_descriptions_single
        classification_descriptions_clean = {}
        # category_id = next(iter(classification_descriptions))
        for category_id in classification_descriptions:
            classification_descriptions_clean[category_id] = \
                clean_taxonomy_string(classification_descriptions[category_id]).strip(';').lower()


    ## Optionally add pre-smoothing descriptions to every image

    if options.add_pre_smoothing_description and (classification_descriptions is not None):

        for im in tqdm(d['images']):

            if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
                continue

            detections = im['detections']
            category_to_count = count_detections_by_classification_category(detections, options)

            im['pre_smoothing_description'] = \
                get_classification_description_string(category_to_count, classification_descriptions)


    return {
        'd':d,
        'other_category_ids':other_category_ids,
        'classification_descriptions_clean':classification_descriptions_clean,
        'classification_descriptions':classification_descriptions
    }

# ...def _prepare_results_for_smoothing(...)

def _smooth_classifications_for_list_of_detections(detections,
                                                   options,
                                                   other_category_ids,
                                                   classification_descriptions,
                                                   classification_descriptions_clean):
    """
    Smooth classifications for a list of detections, which may have come from a single
    image, or may represent an entire sequence.

    Returns None if no changes are made, else a dict.

    classification_descriptions_clean should be semicolon-delimited taxonomic strings
    from which common names and GUIDs have already been removed.

    Assumes there is only one classification per detection, i.e. that non-top classifications
    have already been removed.
    """

    ## Count the number of instances of each category in this image

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    # get_classification_description_string(category_to_count, classification_descriptions)

    if len(category_to_count) <= 1:
        return None

    keys = list(category_to_count.keys())

    # Handle a quirky special case: if the most common category is "other" and
    # it's "tied" with the second-most-common category, swap them
    if (len(keys) > 1) and \
       (keys[0] in other_category_ids) and \
       (keys[1] not in other_category_ids) and \
       (category_to_count[keys[0]] == category_to_count[keys[1]]):
        keys[1], keys[0] = keys[0], keys[1]

    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Debug tools

    verbose_debug_enabled = False

    if options.break_at_image is not None:
        for det in detections:
            if 'image_filename' in det and \
               det['image_filename'] == options.break_at_image:
                verbose_debug_enabled = True
                break

    if verbose_debug_enabled:
        _print_counts_with_names(category_to_count,classification_descriptions)
        # from IPython import embed; embed()

    ## Possibly change "other" classifications to the most common category

    # ...if the dominant category is not an "other" category.

    n_other_classifications_changed_this_image = 0

    # If we have at least *min_detections_to_overwrite_other* in a category that isn't
    # "other", change all "other" classifications to that category
    if (max_count >= options.min_detections_to_overwrite_other) and \
       (most_common_category not in other_category_ids):

        for det in detections:

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            if (c[1] >= options.classification_confidence_threshold) and \
               (c[0] in other_category_ids):

                if verbose_debug_enabled:
                    print('Replacing {} with {}'.format(
                        classification_descriptions[c[0]],
                        most_common_category))

                n_other_classifications_changed_this_image += 1
                c[0] = most_common_category

            # ...if there are classifications for this detection

        # ...for each detection

    # ...if we should overwrite all "other" classifications

    if verbose_debug_enabled:
        print('Made {} other changes'.format(n_other_classifications_changed_this_image))

    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly change some non-dominant classifications to the dominant category

    process_taxonomic_rules = \
        (classification_descriptions_clean is not None) and \
        (len(classification_descriptions_clean) > 0) and \
        (len(category_to_count) > 1)

    n_detections_flipped_this_image = 0

    # Don't do this if the most common category is an "other" category, or
    # if we don't have enough of the most common category
    if (most_common_category not in other_category_ids) and \
       (max_count >= options.min_detections_to_overwrite_secondary):

        # i_det = 0; det = detections[i_det]
        for i_det,det in enumerate(detections):

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            # Don't over-write the most common category with itself
            if c[0] == most_common_category:
                continue

            # Don't bother with below-threshold classifications
            if c[1] < options.classification_confidence_threshold:
                continue

            # If we're doing taxonomic processing, at this stage, don't turn children
            # into parents; we'll likely turn parents into children in the next stage.
            if process_taxonomic_rules:

                most_common_category_description = \
                    classification_descriptions_clean[most_common_category]

                category_id_this_classification = c[0]
                assert category_id_this_classification in category_to_count

                category_description_this_classification = \
                    classification_descriptions_clean[category_id_this_classification]

                # An empty description corresponds to the "animal" category. We don't handle
                # "animal" here as a parent category; that would be handled in the "other
                # smoothing" step above.
                if len(category_description_this_classification) == 0:
                    continue

                most_common_category_is_parent_of_this_category = \
                    most_common_category_description in category_description_this_classification

                if most_common_category_is_parent_of_this_category:
                    continue

            # If we have fewer of this category than the most common category,
            # but not *too* many, flip it to the most common category.
            if (max_count > category_to_count[c[0]]) and \
               (category_to_count[c[0]] <= options.max_detections_nondominant_class):

                c[0] = most_common_category
                n_detections_flipped_this_image += 1

        # ...for each detection

    # ...if the dominant category is legit

    if verbose_debug_enabled:
        print('Made {} non-dominant --> dominant changes'.format(
            n_detections_flipped_this_image))

    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly collapse higher-level taxonomic predictions down to lower levels

    n_taxonomic_changes_this_image = 0

    process_taxonomic_rules = \
        (classification_descriptions_clean is not None) and \
        (len(classification_descriptions_clean) > 0) and \
        (len(category_to_count) > 1)

    if process_taxonomic_rules and options.propagate_classifications_through_taxonomy:

        # det = detections[3]
        for det in detections:

            if not _detection_is_relevant_for_smoothing(det,options):
                continue

            assert len(det['classifications']) == 1
            c = det['classifications'][0]

            # Don't bother with any classifications below the confidence threshold
            if c[1] < options.classification_confidence_threshold:
                continue

            category_id_this_classification = c[0]
            assert category_id_this_classification in category_to_count

            category_description_this_classification = \
                classification_descriptions_clean[category_id_this_classification]

            # An empty description corresponds to the "animal" category. We don't handle
            # "animal" here as a parent category; that would be handled in the "other
            # smoothing" step above.
            if len(category_description_this_classification) == 0:
                continue

            # We may have multiple child categories to choose from; this keeps track of
            # the "best" we've seen so far. "Best" is based on the level (species is better
            # than genus) and count.
            child_category_to_score = defaultdict(float)

            for category_id_of_candidate_child in category_to_count.keys():

                # A category is never its own child
                if category_id_of_candidate_child == category_id_this_classification:
                    continue

                # Is this candidate a child of the current classification?
                category_description_candidate_child = \
                    classification_descriptions_clean[category_id_of_candidate_child]

                # An empty description corresponds to "animal", which can never
                # be a child of another category.
                if len(category_description_candidate_child) == 0:
                    continue

                # This handles a case that doesn't come up with "pure" SpeciesNet results;
                # if two categories have different IDs but the same "clean" description, this
                # means they're different common names for the same species, which we use
                # for things like "white-tailed deer buck" and "white-tailed deer fawn".
                #
                # Currently we don't support smoothing those predictions, because it's not a
                # parent/child relationship.
                if category_description_candidate_child == \
                   category_description_this_classification:
                    continue

                # As long as we're using "clean" descriptions, parent/child taxonomic
                # relationships are defined by a substring relationship
                is_child = category_description_this_classification in \
                    category_description_candidate_child

                if not is_child:
                    continue

                # How many instances of this child category are there?
                child_category_count = category_to_count[category_id_of_candidate_child]

                # What taxonomy level is this child category defined at?
                child_category_level = taxonomy_level_index(
                    classification_descriptions[category_id_of_candidate_child])

                child_category_to_score[category_id_of_candidate_child] = \
                    child_category_level * options.taxonomy_propagation_level_weight + \
                    child_category_count * options.taxonomy_propagation_count_weight

            # ...for each category we are considering reducing this classification to

            # Did we find a category we want to change this classification to?
            if len(child_category_to_score) > 0:

                # Find the child category with the highest score
                child_category_to_score = sort_dictionary_by_value(
                    child_category_to_score,reverse=True)
                best_child_category = next(iter(child_category_to_score.keys()))

                if verbose_debug_enabled:
                    old_category_name = \
                        classification_descriptions_clean[c[0]]
                    new_category_name = \
                        classification_descriptions_clean[best_child_category]
                    print('Replacing {} with {}'.format(
                        old_category_name,new_category_name))

                c[0] = best_child_category
                n_taxonomic_changes_this_image += 1

        # ...for each detection

    # ...if we have taxonomic information available

    ## Re-count

    category_to_count = count_detections_by_classification_category(detections, options)
    # _print_counts_with_names(category_to_count,classification_descriptions)
    keys = list(category_to_count.keys())
    max_count = category_to_count[keys[0]]
    most_common_category = keys[0]
    del keys


    ## Possibly do within-family smoothing

    n_within_family_smoothing_changes = 0

    # min_detections_to_overwrite_secondary_same_family = -1
    # max_detections_nondominant_class_same_family = 1
    family_level = taxonomy_level_string_to_index('family')

    if process_taxonomic_rules:

        category_description_most_common_category = \
            classification_descriptions[most_common_category]
        most_common_category_taxonomic_level = \
            taxonomy_level_index(category_description_most_common_category)
        n_most_common_category = category_to_count[most_common_category]
        tokens = category_description_most_common_category.split(';')
        assert len(tokens) == 7
        most_common_category_family = tokens[3]
        most_common_category_genus = tokens[4]

        # Only consider remapping to genus or species level, and only when we have
        # a high enough count in the most common category
        if process_taxonomic_rules and \
           (options.min_detections_to_overwrite_secondary_same_family > 0) and \
           (most_common_category not in other_category_ids) and \
           (most_common_category_taxonomic_level > family_level) and \
           (n_most_common_category >= options.min_detections_to_overwrite_secondary_same_family):

            # det = detections[0]
            for det in detections:

                if not _detection_is_relevant_for_smoothing(det,options):
                    continue

                assert len(det['classifications']) == 1
                c = det['classifications'][0]

                # Don't over-write the most common category with itself
                if c[0] == most_common_category:
                    continue

                # Don't bother with below-threshold classifications
                if c[1] < options.classification_confidence_threshold:
                    continue

                n_candidate_flip_category = category_to_count[c[0]]

                # Do we have too many of the non-dominant category to do this kind of swap?
                if n_candidate_flip_category > \
                   options.max_detections_nondominant_class_same_family:
                    continue

                # Don't flip classes when it's a tie
                if n_candidate_flip_category == n_most_common_category:
                    continue

                category_description_candidate_flip = \
                    classification_descriptions[c[0]]
                tokens = category_description_candidate_flip.split(';')
                assert len(tokens) == 7
                candidate_flip_category_family = tokens[3]
                candidate_flip_category_genus = tokens[4]
                candidate_flip_category_taxonomic_level = \
                    taxonomy_level_index(category_description_candidate_flip)

                # Only proceed if we have valid family strings
                if (len(candidate_flip_category_family) == 0) or \
                   (len(most_common_category_family) == 0):
                    continue

                # Only proceed if the candidate and the most common category are in the same family
                if candidate_flip_category_family != most_common_category_family:
                    continue

                # Don't flip from a species to the genus level in the same genus
                if (candidate_flip_category_genus == most_common_category_genus) and \
                   (candidate_flip_category_taxonomic_level > \
                    most_common_category_taxonomic_level):
                    continue

                old_category_name = classification_descriptions_clean[c[0]]
                new_category_name = classification_descriptions_clean[most_common_category]

                c[0] = most_common_category
                n_within_family_smoothing_changes += 1

            # ...for each detection

        # ...if the dominant category is legit and we have taxonomic information available

    return {'n_other_classifications_changed_this_image':n_other_classifications_changed_this_image,
            'n_detections_flipped_this_image':n_detections_flipped_this_image,
            'n_taxonomic_changes_this_image':n_taxonomic_changes_this_image,
            'n_within_family_smoothing_changes':n_within_family_smoothing_changes}

# ...def _smooth_classifications_for_list_of_detections(...)

def _smooth_single_image(im,
                         options,
                         other_category_ids,
                         classification_descriptions,
                         classification_descriptions_clean):
    """
    Smooth classifications for a single image. Returns None if no changes are made,
    else a dict.

    classification_descriptions_clean should be semicolon-delimited taxonomic strings
    from which common names and GUIDs have already been removed.

    Assumes there is only one classification per detection, i.e. that non-top classifications
    have already been removed.
    """

    if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
        return

    detections = im['detections']

    # Simplify debugging
    for det in detections:
        det['image_filename'] = im['file']

    to_return = _smooth_classifications_for_list_of_detections(detections,
        options=options,
        other_category_ids=other_category_ids,
        classification_descriptions=classification_descriptions,
        classification_descriptions_clean=classification_descriptions_clean)

    # Clean out debug information
    for det in detections:
        del det['image_filename']

    return to_return

# ...def _smooth_single_image(...)

#%% Image-level smoothing

def smooth_classification_results_image_level(input_file,output_file=None,options=None):
    """
    Smooth classifications at the image level for all results in the MD-formatted results
    file [input_file], optionally writing a new set of results to [output_file].

    This function generally expresses the notion that an image with 700 cows and one deer
    is really just 701 cows.

    Only detections with a classification confidence above
    [options.classification_confidence_threshold] are counted, which in practice means
    we're only looking at one category per detection.

    If an image has at least [options.min_detections_to_overwrite_secondary] such detections
    in the most common category, and no more than [options.max_detections_nondominant_class]
    in the second-most-common category, flip all detections to the most common
    category.

    Optionally treat some classes as particularly unreliable, typically used to overwrite an
    "other" class.

    This function also removes everything but the top classification for each detection.

    Args:
        input_file (str): MegaDetector-formatted classification results file to smooth. Can
            also be an already-loaded results dict.
        output_file (str, optional): .json file to write smoothed results
        options (ClassificationSmoothingOptions, optional): see
            ClassificationSmoothingOptions for details.

    Returns:
        dict: MegaDetector-results-formatted dict, identical to what's written to
        [output_file] if [output_file] is not None.
    """

    ## Input validation

    if options is None:
        options = ClassificationSmoothingOptions()

    r = _prepare_results_for_smoothing(input_file, options)
    d = r['d']
    other_category_ids = r['other_category_ids']
    classification_descriptions_clean = r['classification_descriptions_clean']
    classification_descriptions = r['classification_descriptions']


    ## Smoothing

    n_other_classifications_changed = 0
    n_other_images_changed = 0
    n_taxonomic_images_changed = 0

    n_detections_flipped = 0
    n_images_changed = 0
    n_taxonomic_classification_changes = 0

    # im = d['images'][0]
    for im in tqdm(d['images']):

        r = _smooth_single_image(im,
                                 options,
                                 other_category_ids,
                                 classification_descriptions=classification_descriptions,
                                 classification_descriptions_clean=classification_descriptions_clean)

        if r is None:
            continue

        n_detections_flipped_this_image = r['n_detections_flipped_this_image']
        n_other_classifications_changed_this_image = \
            r['n_other_classifications_changed_this_image']
        n_taxonomic_changes_this_image = r['n_taxonomic_changes_this_image']

        n_detections_flipped += n_detections_flipped_this_image
        n_other_classifications_changed += n_other_classifications_changed_this_image
        n_taxonomic_classification_changes += n_taxonomic_changes_this_image

        if n_detections_flipped_this_image > 0:
            n_images_changed += 1
        if n_other_classifications_changed_this_image > 0:
            n_other_images_changed += 1
        if n_taxonomic_changes_this_image > 0:
            n_taxonomic_images_changed += 1

    # ...for each image

    print('Classification smoothing: changed {} detections on {} images'.format(
        n_detections_flipped,n_images_changed))

    print('"Other" smoothing: changed {} detections on {} images'.format(
        n_other_classifications_changed,n_other_images_changed))

    print('Taxonomic smoothing: changed {} detections on {} images'.format(
        n_taxonomic_classification_changes,n_taxonomic_images_changed))


    ## Write output

    if output_file is not None:
        print('Writing results after image-level smoothing to:\n{}'.format(output_file))
        write_json(output_file,d)

    return d

# ...def smooth_classification_results_image_level(...)

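def _example_image_level_smoothing():
    """
    Editor's illustration (not part of the released file): image-level smoothing
    on a small in-memory results dict. Category IDs, names, and confidences are
    hypothetical. With default options, the lone "deer" classification below is
    flipped to "cow": there are at least four above-threshold "cow"
    classifications (min_detections_to_overwrite_secondary) and at most one
    "deer" classification (max_detections_nondominant_class).
    """

    results = {
        'detection_categories': {'1': 'animal'},
        'classification_categories': {'0': 'deer', '1': 'cow'},
        'images': [{
            'file': 'camera01/image0001.jpg',
            'detections': [
                {'category': '1', 'conf': 0.9, 'classifications': [['1', 0.8]]},
                {'category': '1', 'conf': 0.9, 'classifications': [['1', 0.8]]},
                {'category': '1', 'conf': 0.9, 'classifications': [['1', 0.8]]},
                {'category': '1', 'conf': 0.9, 'classifications': [['1', 0.8]]},
                {'category': '1', 'conf': 0.9, 'classifications': [['0', 0.7]]}
            ]
        }]
    }

    # The input dict is copied, not modified, because modify_in_place
    # defaults to False
    return smooth_classification_results_image_level(input_file=results)
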
#%% Sequence-level smoothing

def smooth_classification_results_sequence_level(input_file,
                                                 cct_sequence_information,
                                                 output_file=None,
                                                 options=None):
    """
    Smooth classifications at the sequence level for all results in the MD-formatted results
    file [input_file], optionally writing a new set of results to [output_file].

    This function generally expresses the notion that a sequence that looks like
    deer/deer/deer/elk/deer/deer/deer/deer is really just a deer.

    Args:
        input_file (str or dict): MegaDetector-formatted classification results file to smooth
            (or already-loaded results). If you supply a dict, it's copied by default, but
            in-place modification is supported via options.modify_in_place.
        cct_sequence_information (str, dict, or list): COCO Camera Traps file containing sequence IDs for
            each image (or an already-loaded CCT-formatted dict, or just the 'images' list from a CCT dict).
        output_file (str, optional): .json file to write smoothed results
        options (ClassificationSmoothingOptions, optional): see
            ClassificationSmoothingOptions for details.

    Returns:
        dict: MegaDetector-results-formatted dict, identical to what's written to
        [output_file] if [output_file] is not None.
    """

    ## Input validation

    if options is None:
        options = ClassificationSmoothingOptions()

    r = _prepare_results_for_smoothing(input_file, options)
    d = r['d']
    other_category_ids = r['other_category_ids']
    classification_descriptions_clean = r['classification_descriptions_clean']
    classification_descriptions = r['classification_descriptions']


    ## Make a list of images appearing in each sequence

    if isinstance(cct_sequence_information,list):
        image_info = cct_sequence_information
    elif isinstance(cct_sequence_information,str):
        print('Loading sequence information from {}'.format(cct_sequence_information))
        with open(cct_sequence_information,'r') as f:
            cct_sequence_information = json.load(f)
        image_info = cct_sequence_information['images']
    else:
        assert isinstance(cct_sequence_information,dict)
        image_info = cct_sequence_information['images']

    sequence_to_image_filenames = defaultdict(list)

    # im = image_info[0]
    for im in tqdm(image_info):
        sequence_to_image_filenames[im['seq_id']].append(im['file_name'])
    del image_info

    image_fn_to_classification_results = {}
    for im in d['images']:
        fn = im['file']
        assert fn not in image_fn_to_classification_results
        image_fn_to_classification_results[fn] = im


    ## Smoothing

    n_other_classifications_changed = 0
    n_other_sequences_changed = 0
    n_taxonomic_sequences_changed = 0
    n_within_family_sequences_changed = 0

    n_detections_flipped = 0
    n_sequences_changed = 0
    n_taxonomic_classification_changes = 0
    n_within_family_changes = 0

    # sequence_id = list(sequence_to_image_filenames.keys())[0]
    for sequence_id in sequence_to_image_filenames.keys():

        image_filenames_this_sequence = sequence_to_image_filenames[sequence_id]

        # if 'file' in image_filenames_this_sequence:
        #     from IPython import embed; embed()

        detections_this_sequence = []
        for image_filename in image_filenames_this_sequence:
            if image_filename not in image_fn_to_classification_results:
                print('Warning: {} in sequence list but not in results'.format(
                    image_filename))
                continue
            im = image_fn_to_classification_results[image_filename]
            if 'detections' not in im or im['detections'] is None:
                continue
            detections_this_sequence.extend(im['detections'])

            # Temporarily add image filenames to every detection,
            # for debugging
            for det in im['detections']:
                det['image_filename'] = im['file']

        if len(detections_this_sequence) == 0:
            continue

        r = _smooth_classifications_for_list_of_detections(
            detections=detections_this_sequence,
            options=options,
            other_category_ids=other_category_ids,
            classification_descriptions=classification_descriptions,
            classification_descriptions_clean=classification_descriptions_clean)

        if r is None:
            continue

        n_detections_flipped_this_sequence = r['n_detections_flipped_this_image']
        n_other_classifications_changed_this_sequence = \
            r['n_other_classifications_changed_this_image']
        n_taxonomic_changes_this_sequence = r['n_taxonomic_changes_this_image']
        n_within_family_changes_this_sequence = r['n_within_family_smoothing_changes']

        n_detections_flipped += n_detections_flipped_this_sequence
        n_other_classifications_changed += n_other_classifications_changed_this_sequence
        n_taxonomic_classification_changes += n_taxonomic_changes_this_sequence
        n_within_family_changes += n_within_family_changes_this_sequence

        if n_detections_flipped_this_sequence > 0:
            n_sequences_changed += 1
        if n_other_classifications_changed_this_sequence > 0:
            n_other_sequences_changed += 1
        if n_taxonomic_changes_this_sequence > 0:
            n_taxonomic_sequences_changed += 1
        if n_within_family_changes_this_sequence > 0:
            n_within_family_sequences_changed += 1

    # ...for each sequence

    print('Classification smoothing: changed {} detections in {} sequences'.format(
        n_detections_flipped,n_sequences_changed))

    print('"Other" smoothing: changed {} detections in {} sequences'.format(
        n_other_classifications_changed,n_other_sequences_changed))

    print('Taxonomic smoothing: changed {} detections in {} sequences'.format(
        n_taxonomic_classification_changes,n_taxonomic_sequences_changed))

    print('Within-family smoothing: changed {} detections in {} sequences'.format(
        n_within_family_changes,n_within_family_sequences_changed))


    ## Clean up debug information

    for im in d['images']:
        if 'detections' not in im or im['detections'] is None:
            continue
        for det in im['detections']:
            if 'image_filename' in det:
                del det['image_filename']


    ## Write output

    if output_file is not None:
        print('Writing sequence-smoothed classification results to {}'.format(
            output_file))
        write_json(output_file,d)

    return d

# ...def smooth_classification_results_sequence_level(...)

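def _example_sequence_level_smoothing():
    """
    Editor's illustration (not part of the released file): sequence-level smoothing
    driven by a COCO Camera Traps .json file that provides a seq_id and file_name
    for each image. All file names are hypothetical.
    """

    return smooth_classification_results_sequence_level(
        input_file='md_results_image_smoothed.json',
        cct_sequence_information='sequence_info_cct.json',
        output_file='md_results_sequence_smoothed.json')
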
def remove_classifications_from_non_animal_detections(input_file,
                                                      output_file,
                                                      animal_category_names=None):
    """
    Remove classifications from non-animal detections in an MD .json file,
    optionally writing the results to a new .json file.

    Args:
        input_file (str): the MD-formatted .json file to process
        output_file (str, optional): the output file to write the modified results
        animal_category_names (list, optional): the detection category names that should
            be treated as animals (defaults to just 'animal').

    Returns:
        dict: the modified results
    """

    if animal_category_names is None:
        animal_category_names = ['animal']
    animal_category_names = set(animal_category_names)

    with open(input_file,'r') as f:
        d = json.load(f)

    category_id_to_name = d['detection_categories']

    n_classifications_removed = 0
    n_detections = 0

    # im = d['images'][0]
    for im in d['images']:

        if ('detections' not in im) or (im['detections'] is None):
            continue

        n_detections += len(im['detections'])

        for det in im['detections']:

            if 'classifications' not in det:
                continue
            category_id = det['category']
            category_name = category_id_to_name[category_id]
            if category_name not in animal_category_names:
                del det['classifications']
                n_classifications_removed += 1
                continue

        # ...for each detection

    # ...for each image

    print('Removed classifications from {} of {} detections'.format(
        n_classifications_removed,n_detections))

    if output_file is not None:
        write_json(output_file,d)

    return d

# ...def remove_classifications_from_non_animal_detections(...)

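def _example_remove_non_animal_classifications():
    """
    Editor's illustration (not part of the released file): stripping classifications
    from person/vehicle detections so only animal detections carry species labels.
    File names are hypothetical.
    """

    return remove_classifications_from_non_animal_detections(
        input_file='md_results.json',
        output_file='md_results_animal_classifications_only.json',
        animal_category_names=['animal'])
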
1174
+ def restrict_to_taxa_list(taxa_list,
1175
+ speciesnet_taxonomy_file,
1176
+ input_file,
1177
+ output_file,
1178
+ allow_walk_down=False,
1179
+ add_pre_filtering_description=True,
1180
+ add_post_filtering_description=True,
1181
+ allow_redundant_latin_names=True,
1182
+ protected_common_names=None,
1183
+ use_original_common_names_if_available=True,
1184
+ verbose=True,
1185
+ classification_threshold=None,
1186
+ combine_redundant_categories=True):
1187
+ """
1188
+ Given a prediction file in MD .json format, likely without having had
1189
+ a geofence applied, apply a custom taxa list.
1190
+
1191
+ Args:
1192
+ taxa_list (str): .csv file with at least the columns "latin" and "common"
1193
+ speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for
1194
+ model release (with 7-token taxonomy entries)
1195
+ input_file (str): .json file to read, in MD format. This can be None, in which
1196
+ case this function just validates [taxa_list].
1197
+ output_file (str): .json file to write, in MD format
1198
+ allow_walk_down (bool, optional): should we walk down the taxonomy tree
1199
+ when making mappings if a parent has only a single allowable child?
1200
+ For example, if only a single felid species is allowed, should other
1201
+ felid predictions be mapped to that species, as opposed to being mapped
1202
+ to the family?
1203
+ add_pre_filtering_description (bool, optional): should we add a new metadata
1204
+ field that summarizes each image's classifications prior to taxonomic
1205
+ restriction?
1206
+ add_post_filtering_description (bool, optional): should we add a new metadata
1207
+ field that summarizes each image's classifications after taxonomic
1208
+ restriction?
1209
+ allow_redundant_latin_names (bool, optional): if False, we'll raise an Exception
1210
+ if the same latin name appears twice in the taxonomy list; if True, we'll
1211
+ just print a warning and ignore all entries other than the first for this
1212
+ latin name
1213
+ protected_common_names (list, optional): these categories should be
1214
+ unmodified, even if they aren't used, or have the same taxonomic
1215
+ description as other categories
1216
+ use_original_common_names_if_available (bool, optional): if an "original_common"
1217
+ column is present in [taxa_list], use those common names instead of the ones
1218
+ in the taxonomy file
1219
+ verbose (bool, optional): enable additional debug output
1220
+ classification_threshold (float, optional): only relevant for the pre/post filtering
1221
+ descriptions
1222
+ combine_redundant_categories (bool, optional): whether to combine categories with the
1223
+ same common name
1224
+ """
1225
+
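+     # For illustration, a hypothetical [taxa_list] .csv might look like this
+     # (lowercase is not required; names are lowercased on load):
+     #
+     #   latin,common
+     #   lynx rufus,bobcat
+     #   canis latrans,coyote
+     #   felidae,cat family
+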
+     ##%% Read target taxa list
+
+     taxa_list_df = pd.read_csv(taxa_list)
+
+     required_columns = ('latin','common')
+     for s in required_columns:
+         assert s in taxa_list_df.columns, \
+             'Required column {} missing from taxonomy list file {}'.format(
+                 s,taxa_list)
+
+     # Convert the "latin" and "common" columns in taxa_list_df to lowercase
+     taxa_list_df['latin'] = taxa_list_df['latin'].str.lower()
+     taxa_list_df['common'] = taxa_list_df['common'].str.lower()
+
+     # Remove rows from taxa_list_df where the "latin" column is NaN, printing a
+     # warning for each such row (with a string representation of the whole row)
+     for i_row,row in taxa_list_df.iterrows():
+         if pd.isna(row['latin']):
+             if verbose:
+                 print('Warning: Skipping row with empty "latin" column in {}:\n{}\n'.format(
+                     taxa_list,str(row.to_dict())))
+             taxa_list_df.drop(index=i_row, inplace=True)
+
+     # Convert all NaN values in the "common" column to empty strings
+     taxa_list_df['common'] = taxa_list_df['common'].fillna('')
+
+     # Create a dictionary mapping source Latin names to target common names
+     target_latin_to_common = {}
+
+     for i_row,row in taxa_list_df.iterrows():
+
+         latin = row['latin']
+         common = row['common']
+
+         if use_original_common_names_if_available and \
+            ('original_common' in row) and \
+            (not is_empty(row['original_common'])):
+             common = row['original_common'].strip().lower()
+
+         # Valid latin names have either one token (e.g. "canidae"),
+         # two tokens (e.g. "bos taurus"), or three tokens (e.g. "canis lupus familiaris")
+         assert len(latin.split(' ')) in (1,2,3), \
+             'Illegal latin name {} in taxonomy list {}'.format(
+                 latin,taxa_list)
+
+         if latin in target_latin_to_common:
+             error_string = \
+                 'scientific name {} appears multiple times in the taxonomy list'.format(
+                     latin)
+             if allow_redundant_latin_names:
+                 if verbose:
+                     print('Warning: {}'.format(error_string))
+                 # Keep only the first entry for this Latin name
+                 continue
+             else:
+                 raise ValueError(error_string)
+
+         target_latin_to_common[latin] = common
+
+     # ...for each row in the custom taxonomy list
+
+
+     ##%% Read taxonomy file
+
+     with open(speciesnet_taxonomy_file,'r') as f:
+         speciesnet_taxonomy_list = f.readlines()
+     speciesnet_taxonomy_list = [s.strip() for s in \
+                                 speciesnet_taxonomy_list if len(s.strip()) > 0]
+
+     # Maps the latin name of every taxon to the corresponding full taxon string
+     #
+     # For species, the key is a binomial name
+     speciesnet_latin_name_to_taxon_string = {}
+     speciesnet_common_name_to_taxon_string = {}
+
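+     # Each taxonomy entry has seven semicolon-delimited tokens:
+     #
+     #   GUID;class;order;family;genus;species;common name
+     #
+     # ...e.g., with a made-up GUID:
+     #
+     #   0a1b2c3d;mammalia;carnivora;felidae;lynx;rufus;bobcat
+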
+     def _insert_taxonomy_string(s):
+
+         tokens = s.split(';')
+         assert len(tokens) == 7, 'Illegal taxonomy string {}'.format(s)
+
+         guid = tokens[0] # noqa
+         class_name = tokens[1]
+         order = tokens[2]
+         family = tokens[3]
+         genus = tokens[4]
+         species = tokens[5]
+         common_name = tokens[6]
+
+         if len(class_name) == 0:
+             assert common_name in ('animal','vehicle','blank'), \
+                 'Illegal common name {}'.format(common_name)
+             return
+
+         if len(species) > 0:
+             assert all([len(s) > 0 for s in [genus,family,order]]), \
+                 'Higher-level taxa missing for {}: {},{},{}'.format(s,genus,family,order)
+             binomial_name = genus + ' ' + species
+             if binomial_name not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[binomial_name] = s
+         elif len(genus) > 0:
+             assert all([len(s) > 0 for s in [family,order]]), \
+                 'Higher-level taxa missing for {}: {},{}'.format(s,family,order)
+             if genus not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[genus] = s
+         elif len(family) > 0:
+             assert len(order) > 0, \
+                 'Higher-level taxa missing for {}: {}'.format(s,order)
+             if family not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[family] = s
+         elif len(order) > 0:
+             if order not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[order] = s
+         else:
+             if class_name not in speciesnet_latin_name_to_taxon_string:
+                 speciesnet_latin_name_to_taxon_string[class_name] = s
+
+         if len(common_name) > 0:
+             if common_name not in speciesnet_common_name_to_taxon_string:
+                 speciesnet_common_name_to_taxon_string[common_name] = s
+
+     for s in speciesnet_taxonomy_list:
+
+         _insert_taxonomy_string(s)
+
+
+     ##%% Make sure all parent taxa are represented in the taxonomy
+
+     # In theory any taxon that appears as the parent of another taxon should
+     # also be in the taxonomy, but this isn't always true, so we fix it here.
+     new_taxon_string_to_missing_tokens = defaultdict(list)
+
+     # While we're making this loop, also see whether we need to store any custom
+     # common name mappings based on the taxonomy list.
+     speciesnet_latin_name_to_output_common_name = {}
+
+     # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
+     for latin_name in speciesnet_latin_name_to_taxon_string.keys():
+
+         if latin_name in target_latin_to_common:
+             speciesnet_latin_name_to_output_common_name[latin_name] = \
+                 target_latin_to_common[latin_name]
+
+         if 'no cv result' in latin_name:
+             continue
+
+         taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+         tokens = taxon_string.split(';')
+
+         # Don't process the GUID, species, or common name tokens
+         # i_token = 6
+         for i_token in range(1,len(tokens)-2):
+
+             test_token = tokens[i_token]
+             if len(test_token) == 0:
+                 continue
+
+             # Do we need to make up a taxon for this token?
+             if test_token not in speciesnet_latin_name_to_taxon_string:
+
+                 new_tokens = [''] * 7
+                 new_tokens[0] = 'fake_guid'
+                 for i_copy_token in range(1,i_token+1):
+                     new_tokens[i_copy_token] = tokens[i_copy_token]
+                 new_tokens[-1] = test_token + ' species'
+                 assert new_tokens[-2] == '', \
+                     'Illegal taxonomy string {}'.format(taxon_string)
+                 new_taxon_string = ';'.join(new_tokens)
+                 # assert new_taxon_string not in new_taxon_strings
+                 new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
+
+         # ...for each token
+
+     # ...for each taxon
+
+     new_taxon_string_to_missing_tokens = \
+         sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
+
+     if verbose:
+
+         print(f'Found {len(new_taxon_string_to_missing_tokens)} taxa that need to be inserted to ' + \
+               'make the taxonomy valid, showing only mammals and birds here:\n')
+
+         for taxon_string in new_taxon_string_to_missing_tokens:
+             if 'mammalia' not in taxon_string and 'aves' not in taxon_string:
+                 continue
+             missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
+             print('{} ({})'.format(taxon_string,missing_taxa))
+
+     for new_taxon_string in new_taxon_string_to_missing_tokens:
+         _insert_taxonomy_string(new_taxon_string)
+
+
+     ##%% Make sure all taxa on the allow-list are in the taxonomy
+
+     n_failed_mappings = 0
+
+     for target_taxon_latin_name in target_latin_to_common.keys():
+         if target_taxon_latin_name not in speciesnet_latin_name_to_taxon_string:
+             common_name = target_latin_to_common[target_taxon_latin_name]
+             s = '{} ({}) not in speciesnet taxonomy'.format(
+                 target_taxon_latin_name,common_name)
+             if common_name in speciesnet_common_name_to_taxon_string:
+                 s += ' (common name maps to {})'.format(
+                     speciesnet_common_name_to_taxon_string[common_name])
+             print(s)
+             n_failed_mappings += 1
+
+     if n_failed_mappings > 0:
+         raise ValueError('Cannot continue with taxonomic restriction')
+
+
+     ##%% For the allow-list, map each parent taxon to a set of allowable child taxa
+
+     # Maps parent names to all allowed child names, or None if this is the
+     # lowest-level allowable taxon on this path
+     allowed_parent_taxon_to_child_taxa = defaultdict(set)
+
+     # latin_name = next(iter(target_latin_to_common.keys()))
+     for latin_name in target_latin_to_common:
+
+         taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
+         tokens = taxon_string.split(';')
+         assert len(tokens) == 7, \
+             'Illegal taxonomy string {}'.format(taxon_string)
+
+         # Remove the GUID and common name
+         #
+         # This is now always class/order/family/genus/species
+         tokens = tokens[1:-1]
+
+         child_taxon = None
+
+         # If this is a species
+         if len(tokens[-1]) > 0:
+             binomial_name = tokens[-2] + ' ' + tokens[-1]
+             assert binomial_name == latin_name, \
+                 'Binomial/latin mismatch: {} vs {}'.format(binomial_name,latin_name)
+             # If this already exists, it should only allow "None"
+             if binomial_name in allowed_parent_taxon_to_child_taxa:
+                 assert len(allowed_parent_taxon_to_child_taxa[binomial_name]) == 1, \
+                     'Species-level entry {} has multiple children'.format(binomial_name)
+                 assert None in allowed_parent_taxon_to_child_taxa[binomial_name], \
+                     'Species-level entry {} has non-None children'.format(binomial_name)
+             allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
+             child_taxon = binomial_name
+
+         # The first level that can ever be a parent taxon is the genus level
+         parent_token_index = len(tokens) - 2
+
+         # Walk up from the genus level to the class level
+         while(parent_token_index >= 0):
+
+             # "None" is our leaf node marker; we should never have ''
+             if child_taxon is not None:
+                 assert len(child_taxon) > 0
+
+             parent_taxon = tokens[parent_token_index]
+
+             # Don't create entries for blank taxa
+             if (len(parent_taxon) > 0):
+
+                 create_child = True
+
+                 # This is the lowest-level taxon in this entry
+                 if (child_taxon is None):
+
+                     # ...but we don't want to remove existing children from any parents
+                     if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                        (len(allowed_parent_taxon_to_child_taxa[parent_taxon]) > 0):
+                         if verbose:
+                             existing_children_string = str(allowed_parent_taxon_to_child_taxa[parent_taxon])
+                             print('Not creating empty child for parent {} (already has children {})'.format(
+                                 parent_taxon,existing_children_string))
+                         create_child = False
+
+                 # If we're adding a new child entry, clear out any leaf node markers
+                 else:
+
+                     if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
+                        (None in allowed_parent_taxon_to_child_taxa[parent_taxon]):
+
+                         assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                             'Illegal parent/child configuration'
+
+                         if verbose:
+                             print('Un-marking parent {} as a leaf node because of child {}'.format(
+                                 parent_taxon,child_taxon))
+
+                         allowed_parent_taxon_to_child_taxa[parent_taxon] = set()
+
+                 if create_child:
+                     allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
+
+                 # If we haven't hit a non-empty taxon yet, don't update "child_taxon"
+                 assert len(parent_taxon) > 0
+                 child_taxon = parent_taxon
+
+             # ...if we have a non-empty taxon
+
+             parent_token_index -= 1
+
+         # ...for each taxonomic level
+
+     # ...for each allowed latin name
+
+     allowed_parent_taxon_to_child_taxa = \
+         sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
+
+     for parent_taxon in allowed_parent_taxon_to_child_taxa:
+         # "None" should only ever appear alone; this marks a leaf node with no children
+         if None in allowed_parent_taxon_to_child_taxa[parent_taxon]:
+             assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
+                 '"None" should only appear alone in a child taxon list'
+
+
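+     # For illustration: if the taxa list contained only "lynx rufus" and "felidae"
+     # (a hypothetical example), this mapping would look like:
+     #
+     #   carnivora  -> {felidae}
+     #   felidae    -> {lynx}
+     #   lynx       -> {lynx rufus}
+     #   lynx rufus -> {None}
+     #   mammalia   -> {carnivora}
+     #
+     # With allow_walk_down=True, a "felidae" prediction would then map all the way
+     # down to "lynx rufus", since each level has exactly one allowable child.
+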
+     ##%% If we were just validating the custom taxa file, we're done
+
+     if input_file is None:
+         print('Finished validating custom taxonomy list')
+         return
+
+
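+     # For reference, the two fields we consume below look schematically like
+     # this in the input .json file (category IDs and names here are hypothetical):
+     #
+     #   "classification_categories": {"0": "bobcat", "1": "coyote"},
+     #   "classification_category_descriptions":
+     #       {"0": "guid;mammalia;carnivora;felidae;lynx;rufus;bobcat", ...}
+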
+     ##%% Map all predictions that exist in this dataset...
+
+     # ...to the prediction we should generate.
+
+     with open(input_file,'r') as f:
+         input_data = json.load(f)
+
+     input_category_id_to_common_name = input_data['classification_categories'] #noqa
+     input_category_id_to_taxonomy_string = \
+         input_data['classification_category_descriptions']
+
+     input_category_id_to_output_taxon_string = {}
+
+     # input_category_id = next(iter(input_category_id_to_taxonomy_string.keys()))
+     for input_category_id in input_category_id_to_taxonomy_string.keys():
+
+         input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
+         input_taxon_tokens = input_taxon_string.split(';')
+         assert len(input_taxon_tokens) == 7, \
+             'Illegal taxonomy string: {}'.format(input_taxon_string)
+
+         # Don't mess with blank/no-cv-result/human (or "animal", which is really "unknown")
+         if (not is_taxonomic_prediction_string(input_taxon_string)) or \
+            (input_taxon_string == human_prediction_string):
+             if verbose:
+                 print('Not messing with non-taxonomic category {}'.format(input_taxon_string))
+             input_category_id_to_output_taxon_string[input_category_id] = \
+                 input_taxon_string
+             continue
+
+         # Don't mess with protected categories
+         common_name = input_taxon_tokens[-1]
+
+         if (protected_common_names is not None) and \
+            (common_name in protected_common_names):
+             if verbose:
+                 print('Not messing with protected category {}:\n{}'.format(
+                     common_name,input_taxon_string))
+             input_category_id_to_output_taxon_string[input_category_id] = \
+                 input_taxon_string
+             continue
+
+         # Remove the GUID and common name
+         #
+         # This is always class/order/family/genus/species
+         input_taxon_tokens = input_taxon_tokens[1:-1]
+         assert len(input_taxon_tokens) == 5
+
+         # Start at the species level (the last element in input_taxon_tokens),
+         # and see whether each taxon is allowed
+         test_index = len(input_taxon_tokens) - 1
+         target_taxon = None
+
+         while((test_index >= 0) and (target_taxon is None)):
+
+             # Species are represented as binomial names, i.e. when test_index is 4,
+             # test_taxon_name will have two tokens (e.g. "canis lupus"); otherwise
+             # test_taxon_name will have one token (e.g. "canis", or "aves")
+             if (test_index == (len(input_taxon_tokens) - 1)) and \
+                (len(input_taxon_tokens[-1]) > 0):
+                 test_taxon_name = \
+                     input_taxon_tokens[-2] + ' ' + input_taxon_tokens[-1]
+             else:
+                 test_taxon_name = input_taxon_tokens[test_index]
+
+             # If we haven't yet found the level at which this taxon is non-empty,
+             # keep going up
+             if len(test_taxon_name) == 0:
+                 test_index -= 1
+                 continue
+
+             assert test_taxon_name in speciesnet_latin_name_to_taxon_string, \
+                 '{} not found in taxonomy table'.format(test_taxon_name)
+
+             # Is this taxon allowed according to the custom species list?
+             if test_taxon_name in allowed_parent_taxon_to_child_taxa:
+
+                 allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
+                 assert allowed_child_taxa is not None, \
+                     'allowed_child_taxa should not be None: {}'.format(test_taxon_name)
+
+                 # If this is the lowest-level allowable token or there is not a
+                 # unique child, don't walk any further, even if walking down
+                 # is enabled.
+                 if None in allowed_child_taxa:
+                     assert len(allowed_child_taxa) == 1, \
+                         '"None" should not appear in a child taxon list alongside other children'
+
+                 if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
+                     target_taxon = test_taxon_name
+                 elif not allow_walk_down:
+                     target_taxon = test_taxon_name
+                 else:
+                     # If there's a unique child, walk back *down* the allowable
+                     # taxa until we run out of unique children
+                     while ((next(iter(allowed_child_taxa)) is not None) and \
+                            (len(allowed_child_taxa) == 1)):
+                         candidate_taxon = next(iter(allowed_child_taxa))
+                         assert candidate_taxon in allowed_parent_taxon_to_child_taxa, \
+                             '{} missing from allowed_parent_taxon_to_child_taxa'.format(
+                                 candidate_taxon)
+                         assert candidate_taxon in speciesnet_latin_name_to_taxon_string, \
+                             '{} missing from speciesnet_latin_name_to_taxon_string'.format(
+                                 candidate_taxon)
+                         allowed_child_taxa = \
+                             allowed_parent_taxon_to_child_taxa[candidate_taxon]
+                         target_taxon = candidate_taxon
+
+             # ...if this is an allowed taxon
+
+             test_index -= 1
+
+         # ...for each token
+
+         if target_taxon is None:
+             output_taxon_string = animal_prediction_string
+         else:
+             output_taxon_string = speciesnet_latin_name_to_taxon_string[target_taxon]
+         input_category_id_to_output_taxon_string[input_category_id] = output_taxon_string
+
+     # ...for each category (mapping input category IDs to output taxon strings)
+
+
+     ##%% Map input category IDs to output category IDs
+
+     speciesnet_taxon_string_to_latin_name = \
+         invert_dictionary(speciesnet_latin_name_to_taxon_string)
+
+     input_category_id_to_output_category_id = {}
+     output_taxon_string_to_category_id = {}
+     output_category_id_to_common_name = {}
+
+     for input_category_id in input_category_id_to_output_taxon_string:
+
+         output_taxon_string = \
+             input_category_id_to_output_taxon_string[input_category_id]
+
+         output_common_name = output_taxon_string.split(';')[-1]
+
+         # Possibly substitute a custom common name
+         if output_taxon_string in speciesnet_taxon_string_to_latin_name:
+
+             speciesnet_latin_name = speciesnet_taxon_string_to_latin_name[output_taxon_string]
+
+             if speciesnet_latin_name in speciesnet_latin_name_to_output_common_name:
+                 custom_common_name = speciesnet_latin_name_to_output_common_name[speciesnet_latin_name]
+                 if custom_common_name != output_common_name:
+                     if verbose:
+                         print('Substituting common name {} for {}'.format(custom_common_name,output_common_name))
+                     output_common_name = custom_common_name
+
+         # Do we need to create a new output category?
+         if output_taxon_string not in output_taxon_string_to_category_id:
+             output_category_id = str(len(output_taxon_string_to_category_id))
+             output_taxon_string_to_category_id[output_taxon_string] = \
+                 output_category_id
+             output_category_id_to_common_name[output_category_id] = \
+                 output_common_name
+         else:
+             output_category_id = \
+                 output_taxon_string_to_category_id[output_taxon_string]
+
+         input_category_id_to_output_category_id[input_category_id] = \
+             output_category_id
+
+         # Sometimes-useful debug printouts
+         if False:
+             original_common_name = \
+                 input_category_id_to_common_name[input_category_id]
+             original_taxon_string = \
+                 input_category_id_to_taxonomy_string[input_category_id]
+             print('Mapping {} ({}) to:\n{} ({})\n'.format(
+                 original_common_name,original_taxon_string,
+                 output_common_name,output_taxon_string))
+
+     # ...for each category (mapping input category IDs to output category IDs)
+
+
+     ##%% Remap all category labels
+
+     assert len(set(output_taxon_string_to_category_id.keys())) == \
+            len(set(output_taxon_string_to_category_id.values())), \
+         'Category ID/value non-uniqueness error'
+
+     output_category_id_to_taxon_string = \
+         invert_dictionary(output_taxon_string_to_category_id)
+
+     with open(input_file,'r') as f:
+         output_data = json.load(f)
+
+     classification_descriptions = None
+     if 'classification_category_descriptions' in output_data:
+         classification_descriptions = output_data['classification_category_descriptions']
+
+     for im in tqdm(output_data['images']):
+
+         if 'detections' not in im or im['detections'] is None:
+             continue
+
+         description_options = ClassificationSmoothingOptions()
+         if classification_threshold is not None:
+             description_options.classification_confidence_threshold = classification_threshold
+
+         # Possibly prepare a pre-filtering description
+         pre_filtering_description = None
+         if classification_descriptions is not None and add_pre_filtering_description:
+             category_to_count = \
+                 count_detections_by_classification_category(im['detections'],
+                                                             options=description_options)
+             pre_filtering_description = \
+                 get_classification_description_string(category_to_count,classification_descriptions)
+             im['pre_filtering_description'] = pre_filtering_description
+
+         # Each classification is a [category_id, confidence] pair; remap the IDs
+         for det in im['detections']:
+             if 'classifications' in det:
+                 for classification in det['classifications']:
+                     classification[0] = \
+                         input_category_id_to_output_category_id[classification[0]]
+
+         # Possibly prepare a post-filtering description
+         if classification_descriptions is not None and add_post_filtering_description:
+             category_to_count = \
+                 count_detections_by_classification_category(im['detections'],
+                                                             options=description_options)
+             post_filtering_description = \
+                 get_classification_description_string(category_to_count,output_category_id_to_taxon_string)
+             im['post_filtering_description'] = post_filtering_description
+
+     # ...for each image
+
+     output_data['classification_categories'] = output_category_id_to_common_name
+     output_data['classification_category_descriptions'] = \
+         output_category_id_to_taxon_string
+
+
+     ##%% Write output
+
+     write_json(output_file,output_data)
+
+     if combine_redundant_categories:
+         _ = combine_redundant_classification_categories(input_file=output_file,
+                                                         output_file=output_file)
+
+ # ...def restrict_to_taxa_list(...)
+
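+ # A minimal usage sketch for restrict_to_taxa_list (all filenames here are
+ # hypothetical):
+ #
+ # restrict_to_taxa_list(taxa_list='my-project-taxa.csv',
+ #                       speciesnet_taxonomy_file='speciesnet-taxonomy.csv',
+ #                       input_file='speciesnet-results.json',
+ #                       output_file='speciesnet-results-restricted.json')
+ #
+ # Passing input_file=None just validates the taxa list:
+ #
+ # restrict_to_taxa_list(taxa_list='my-project-taxa.csv',
+ #                       speciesnet_taxonomy_file='speciesnet-taxonomy.csv',
+ #                       input_file=None,
+ #                       output_file=None)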
+
+ def combine_redundant_classification_categories(input_file,
+                                                 output_file=None,
+                                                 classification_threshold=0.5):
+     """
+     Combine classification categories in a MD-formatted .json file that have
+     different category IDs but identical names, remapping all classifications
+     accordingly.
+
+     Args:
+         input_file (str): .json file to read, in MD format
+         output_file (str, optional): .json file to write, in MD format
+         classification_threshold (float, optional): only used when sorting
+             descriptions by count
+
+     Returns:
+         dict: remapped MD-formatted dict
+     """
+
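+     # For example (hypothetical categories), if the input contains:
+     #
+     #   "classification_categories": {"0": "bobcat", "1": "coyote", "2": "bobcat"}
+     #
+     # ...categories "0" and "2" will be merged into a single output category, and
+     # every classification that pointed to either ID will point to the merged ID.
+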
+     ##%% Read input file and list categories
+
+     assert os.path.isfile(input_file), \
+         'Input file {} not found'.format(input_file)
+
+     with open(input_file,'r') as f:
+         d = json.load(f)
+
+     input_category_name_to_ids = defaultdict(list)
+
+     for category_id in d['classification_categories']:
+         category_name = d['classification_categories'][category_id]
+         input_category_name_to_ids[category_name].append(category_id)
+
+
+     ##%% Return early if there are no redundant categories
+
+     # What's the largest number of IDs associated with a single category name?
+     max_count = 0
+     for category_name in input_category_name_to_ids:
+         c = len(input_category_name_to_ids[category_name])
+         if c > max_count:
+             max_count = c
+
+     if max_count == 1:
+         if output_file is not None:
+             print('No redundant categories, writing data unmodified to {}'.format(
+                 output_file))
+             write_json(output_file,d)
+         return d
+
+
+     ##%% Map input category IDs to output category IDs
+
+     input_category_id_to_output_category_id = {}
+
+     for i_category,category_name in enumerate(input_category_name_to_ids):
+         output_category_id = str(i_category)
+         for input_category_id in input_category_name_to_ids[category_name]:
+             input_category_id_to_output_category_id[input_category_id] = \
+                 output_category_id
+
+     n_input_categories = len(d['classification_categories'])
+     n_output_categories = len(input_category_name_to_ids)
+     assert n_output_categories < n_input_categories
+     print('Removing {} redundant categories'.format(
+         n_input_categories - n_output_categories))
+
+
+     ##%% Create a new category dict
+
+     output_category_name_to_id = {}
+
+     for input_category_id in input_category_id_to_output_category_id:
+         category_name = d['classification_categories'][input_category_id]
+         output_category_id = input_category_id_to_output_category_id[input_category_id]
+         if category_name in output_category_name_to_id:
+             assert output_category_name_to_id[category_name] == output_category_id
+         else:
+             output_category_name_to_id[category_name] = output_category_id
+
+
+     ##%% Create new classification category descriptions
+
+     if 'classification_category_descriptions' in d:
+
+         assert len(d['classification_category_descriptions']) == \
+                len(d['classification_categories'])
+
+         # Count above-threshold description occurrences overall, so we can sort
+         # descriptions by count within categories later
+         description_to_count = defaultdict(int)
+         for im in d['images']:
+             if 'detections' not in im or im['detections'] is None:
+                 continue
+             for det in im['detections']:
+                 if 'classifications' not in det or det['classifications'] is None:
+                     continue
+                 conf = det['classifications'][0][1]
+                 if conf < classification_threshold:
+                     continue
+                 input_category_id = det['classifications'][0][0]
+                 input_category_description = d['classification_category_descriptions'][input_category_id]
+                 description_to_count[input_category_description] += 1
+             # ...for each detection
+         # ...for each image
+
+         # This is just a debug convenience
+         description_to_count = sort_dictionary_by_value(description_to_count,
+                                                         reverse=True)
+
+         # Create descriptions for the output categories
+         output_category_id_to_descriptions = defaultdict(list)
+
+         for input_category_id in input_category_id_to_output_category_id:
+             output_category_id = input_category_id_to_output_category_id[input_category_id]
+             description = d['classification_category_descriptions'][input_category_id]
+             output_category_id_to_descriptions[output_category_id].append(description)
+
+         output_classification_category_descriptions = {}
+
+         for output_category_id in output_category_id_to_descriptions:
+             descriptions = output_category_id_to_descriptions[output_category_id]
+             if len(descriptions) > 1:
+                 # Sort "descriptions" in descending order by the corresponding values
+                 # in description_to_count, using .get() because descriptions that never
+                 # appeared above the confidence threshold won't be present there
+                 descriptions.sort(key=lambda x: description_to_count.get(x,0), reverse=True)
+             output_classification_category_descriptions[output_category_id] = \
+                 '|'.join(descriptions)
+         # ...for each category
+
+         d['classification_category_descriptions'] = output_classification_category_descriptions
+
+     # ...if we have to manage descriptions
+
+     d['classification_categories'] = invert_dictionary(output_category_name_to_id)
+
+     # Remap classifications
+     for im in d['images']:
+         if 'detections' not in im or im['detections'] is None:
+             continue
+         for det in im['detections']:
+             if 'classifications' not in det or det['classifications'] is None:
+                 continue
+             for i_class in range(0,len(det['classifications'])):
+                 input_category_id = det['classifications'][i_class][0]
+                 output_category_id = \
+                     input_category_id_to_output_category_id[input_category_id]
+                 det['classifications'][i_class][0] = output_category_id
+             # ...for each classification
+         # ...for each detection
+     # ...for each image
+
+     if output_file is not None:
+         write_json(output_file,d)
+
+     return d
+
+ # ...def combine_redundant_classification_categories(...)
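+
+ # A minimal usage sketch (hypothetical filenames):
+ #
+ # d = combine_redundant_classification_categories(
+ #     input_file='speciesnet-results-restricted.json',
+ #     output_file='speciesnet-results-restricted-combined.json')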