megadetector 5.0.25__py3-none-any.whl → 5.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic. Click here for more details.

Files changed (34) hide show
  1. megadetector/data_management/cct_json_utils.py +15 -2
  2. megadetector/data_management/coco_to_yolo.py +53 -31
  3. megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
  4. megadetector/data_management/databases/integrity_check_json_db.py +2 -2
  5. megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
  6. megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
  7. megadetector/data_management/remap_coco_categories.py +60 -11
  8. megadetector/data_management/yolo_to_coco.py +45 -15
  9. megadetector/postprocessing/classification_postprocessing.py +788 -524
  10. megadetector/postprocessing/create_crop_folder.py +95 -33
  11. megadetector/postprocessing/load_api_results.py +4 -1
  12. megadetector/postprocessing/md_to_coco.py +1 -1
  13. megadetector/postprocessing/postprocess_batch_results.py +156 -42
  14. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
  15. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  16. megadetector/postprocessing/separate_detections_into_folders.py +20 -4
  17. megadetector/postprocessing/subset_json_detector_output.py +180 -15
  18. megadetector/postprocessing/validate_batch_results.py +13 -5
  19. megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
  20. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
  21. megadetector/taxonomy_mapping/species_lookup.py +45 -2
  22. megadetector/utils/ct_utils.py +4 -2
  23. megadetector/utils/directory_listing.py +1 -1
  24. megadetector/utils/md_tests.py +2 -1
  25. megadetector/utils/path_utils.py +308 -19
  26. megadetector/utils/wi_utils.py +363 -186
  27. megadetector/visualization/visualization_utils.py +2 -1
  28. megadetector/visualization/visualize_db.py +1 -1
  29. megadetector/visualization/visualize_detector_output.py +1 -4
  30. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/METADATA +4 -3
  31. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/RECORD +34 -34
  32. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/WHEEL +1 -1
  33. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info/licenses}/LICENSE +0 -0
  34. {megadetector-5.0.25.dist-info → megadetector-5.0.26.dist-info}/top_level.txt +0 -0
@@ -4,10 +4,10 @@ classification_postprocessing.py
4
4
 
5
5
  Functions for postprocessing species classification results, particularly:
6
6
 
7
- * Smoothing results within a sequence (a sequence that looks like deer/deer/deer/elk/deer/deer
8
- is really just a deer)
9
7
  * Smoothing results within an image (an image with 700 cows and one deer is really just 701
10
8
  cows)
9
+ * Smoothing results within a sequence (a sequence that looks like deer/deer/deer/elk/deer/deer
10
+ is really just a deer)
11
11
 
12
12
  """
13
13
 
@@ -20,183 +20,219 @@ from collections import defaultdict
20
20
  from tqdm import tqdm
21
21
 
22
22
  from megadetector.utils.ct_utils import is_list_sorted
23
+ from megadetector.utils.wi_utils import clean_taxonomy_string
24
+ from megadetector.utils.wi_utils import taxonomy_level_index
25
+ from megadetector.utils.wi_utils import taxonomy_level_string_to_index
26
+ from megadetector.utils.ct_utils import sort_dictionary_by_value
23
27
 
24
28
 
25
29
  #%% Options classes
26
30
 
27
- class ClassificationSmoothingOptionsImageLevel:
31
+ class ClassificationSmoothingOptions:
28
32
  """
29
33
  Options used to parameterize smooth_classification_results_image_level()
34
+ and smooth_classification_results_sequence_level()
30
35
  """
31
36
 
32
37
  def __init__(self):
33
38
 
34
- #: How many detections do we need above the classification threshold to determine a dominant category
35
- #: for an image?
36
- self.min_detections_above_threshold = 4
39
+ #: How many detections do we need in a dominant category to overwrite
40
+ #: non-dominant classifications? This is irrelevant if
41
+ #: max_detections_nondominant_class <= 1.
42
+ self.min_detections_to_overwrite_secondary = 4
37
43
 
38
- #: Even if we have a dominant class, if a non-dominant class has at least this many classifications
39
- #: in an image, leave them alone.
40
- self.max_detections_secondary_class = 3
41
-
42
- #: If the dominant class has at least this many classifications, overwrite "other" classifications
44
+ #: Even if we have a dominant class, if a non-dominant class has at least
45
+ #: this many classifications in an image, leave them alone.
46
+ #:
47
+ #: If this is <= 1, we won't replace non-dominant, non-other classes
48
+ #: with the dominant class, even if there are 900 cows and 1 deer.
49
+ self.max_detections_nondominant_class = 1
50
+
51
+ #: How many detections do we need in a dominant category to overwrite
52
+ #: non-dominant classifications in the same family? If this is <= 0,
53
+ #: we'll skip this step. This option doesn't mean anything if
54
+ #: max_detections_nondominant_class_same_family <= 1.
55
+ self.min_detections_to_overwrite_secondary_same_family = 2
56
+
57
+ #: If we have this many classifications of a nondominant category,
58
+ #: we won't do same-family overwrites. <= 1 means "even if there are
59
+ #: a million deer, if there are two million moose, call all the deer
60
+ #: moose". This option doesn't mean anything if
61
+ #: min_detections_to_overwrite_secondary_same_family <= 0.
62
+ self.max_detections_nondominant_class_same_family = -1
63
+
64
+ #: If the dominant class has at least this many classifications, overwrite
65
+ #: "other" classifications with the dominant class
43
66
  self.min_detections_to_overwrite_other = 2
44
67
 
45
68
  #: Names to treat as "other" categories; can't be None, but can be empty
46
- self.other_category_names = ['other']
47
-
48
- #: What confidence threshold should we use for assessing the dominant category in an image?
49
- self.classification_confidence_threshold = 0.6
50
-
51
- #: Which classifications should we even bother over-writing?
52
- self.classification_overwrite_threshold = 0.3
53
-
54
- #: Detection confidence threshold for things we count when determining a dominant class
55
- self.detection_confidence_threshold = 0.2
56
-
57
- #: Which detections should we even bother over-writing?
58
- self.detection_overwrite_threshold = 0.05
59
-
60
-
61
- class ClassificationSmoothingOptionsSequenceLevel:
62
- """
63
- Options used to parameterize smooth_classification_results_sequence_level()
64
- """
65
-
66
- def __init__(self):
67
-
68
- #: Only process detections in this category
69
- self.animal_detection_category = '1'
70
-
71
- #: Treat category names on this list as "other", which can be flipped to common
72
- #: categories.
73
- self.other_category_names = set(['other'])
74
-
75
- #: These are the only classes to which we're going to switch "other" classifications.
76
- #:
77
- #: Example:
78
- #:
79
- #: ['deer','elk','cow','canid','cat','bird','bear']
80
- self.category_names_to_smooth_to = None
81
-
82
- #: Only switch classifications to the dominant class if we see the dominant class at least
83
- #: this many times
84
- self.min_dominant_class_classifications_above_threshold_for_class_smoothing = 5 # 2
85
-
86
- #: If we see more than this many of a class that are above threshold, don't switch those
87
- #: classifications to the dominant class.
88
- self.max_secondary_class_classifications_above_threshold_for_class_smoothing = 5
89
-
90
- #: If the ratio between a dominant class and a secondary class count is greater than this,
91
- #: regardless of the secondary class count, switch those classifications (i.e., ignore
92
- #: max_secondary_class_classifications_above_threshold_for_class_smoothing).
93
69
  #:
94
- #: This may be different for different dominant classes, e.g. if we see lots of cows, they really
95
- #: tend to be cows. Less so for canids, so we set a higher "override ratio" for canids.
70
+ #: "Other" classifications will be changed to the dominant category, regardless
71
+ #: of confidence, as long as there are at least min_detections_to_overwrite_other
72
+ #: examples of the dominant class. For example, cow/other will remain unchanged,
73
+ #: but cow/cow/other will become cow/cow/cow.
74
+ self.other_category_names = ['other','unknown','no cv result','animal','blank','mammal']
75
+
76
+ #: We're not even going to mess around with classifications below this threshold.
96
77
  #:
97
- #: Should always include a "None" category as the default ratio.
78
+ #: We won't count them, we won't over-write them, they don't exist during the
79
+ #: within-image smoothing step.
80
+ self.classification_confidence_threshold = 0.5
81
+
82
+ #: We're not even going to mess around with detections below this threshold.
98
83
  #:
99
- #: Example:
84
+ #: We won't count them, we won't over-write them, they don't exist during the
85
+ #: within-image smoothing step.
86
+ self.detection_confidence_threshold = 0.15
87
+
88
+ #: If classification descriptions are present and appear to represent taxonomic
89
+ #: information, should we propagate classifications when lower-level taxa are more
90
+ #: common in an image? For example, if we see "carnivore/fox/fox/deer", should
91
+ #: we make that "fox/fox/fox/deer"?
92
+ self.propagate_classifications_through_taxonomy = True
93
+
94
+ #: When propagating classifications down through taxonomy levels, we have to
95
+ #: decide whether we prefer more frequent categories or more specific categories.
96
+ #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
97
+ #: balance levels against counts in this process.
98
+ self.taxonomy_propagation_level_weight = 1.0
99
+
100
+ #: When propagating classifications down through taxonomy levels, we have to
101
+ #: decide whether we prefer more frequent categories or more specific categories.
102
+ #: taxonomy_propagation_level_weight and taxonomy_propagation_count_weight
103
+ #: balance levels against counts in this process.
100
104
  #:
101
- #: {'cow':2,None:3}
102
- self.min_dominant_class_ratio_for_secondary_override_table = {None:3}
103
-
104
- #: If there are at least this many classifications for the dominant class in a sequence,
105
- #: regardless of what that class is, convert all 'other' classifications (regardless of
106
- #: confidence) to that class.
107
- self.min_dominant_class_classifications_above_threshold_for_other_smoothing = 3 # 2
108
-
109
- #: If there are at least this many classifications for the dominant class in a sequence,
110
- #: regardless of what that class is, classify all previously-unclassified detections
111
- #: as that class.
112
- self.min_dominant_class_classifications_above_threshold_for_unclassified_smoothing = 3 # 2
105
+ #: With a very low default value, this just breaks ties.
106
+ self.taxonomy_propagation_count_weight = 0.01
113
107
 
114
- #: Only count classifications above this confidence level when determining the dominant
115
- #: class, and when deciding whether to switch other classifications.
116
- self.classification_confidence_threshold = 0.6
108
+ #: Should we record information about the state of labels prior to smoothing?
109
+ self.add_pre_smoothing_description = True
117
110
 
118
- #: Confidence values to use when we change a detection's classification (the
119
- #: original confidence value is irrelevant at that point) (for the "other" class)
120
- self.flipped_other_confidence_value = 0.6
111
+ #: When a dict (rather than a file) is passed to either smoothing function,
112
+ #: if this is True, we'll make a copy of the input dict before modifying.
113
+ self.modify_in_place = False
121
114
 
122
- #: Confidence values to use when we change a detection's classification (the
123
- #: original confidence value is irrelevant at that point) (for all non-other classes)
124
- self.flipped_class_confidence_value = 0.6
125
-
126
- #: Confidence values to use when we change a detection's classification (the
127
- #: original confidence value is irrelevant at that point) (for previously unclassified detections)
128
- self.flipped_unclassified_confidence_value = 0.6
129
-
130
- #: Only flip the class label unclassified detections if the detection confidence exceeds this threshold
131
- self.min_detection_confidence_for_unclassified_flipping = 0.15
132
-
133
- #: Only relevant when MegaDetector results are supplied as a dict rather than a file; determines
134
- #: whether smoothing happens in place.
135
- self.modify_in_place = True
136
-
137
- # ...class ClassificationSmoothingOptionsSequenceLevel()
115
+ #: Debug options
116
+ self.break_at_image = None
138
117
 
118
+
119
+ #%% Utility functions
120
+
121
+ def _results_for_sequence(images_this_sequence,filename_to_results):
122
+ """
123
+ Fetch MD results for every image in this sequence, based on the 'file_name' field
124
+ """
139
125
 
140
- #%% Image-level smoothing
126
+ results_this_sequence = []
127
+ for im in images_this_sequence:
128
+ fn = im['file_name']
129
+ results_this_image = filename_to_results[fn]
130
+ assert isinstance(results_this_image,dict)
131
+ results_this_sequence.append(results_this_image)
132
+
133
+ return results_this_sequence
134
+
135
+
136
+ def _sort_images_by_time(images):
137
+ """
138
+ Returns a copy of [images], sorted by the 'datetime' field (ascending).
139
+ """
140
+ return sorted(images, key = lambda im: im['datetime'])
141
141
 
142
- def smooth_classification_results_image_level(input_file,output_file=None,options=None):
142
+
143
+ def _count_detections_by_category(detections,options):
143
144
  """
144
- Smooth classifications at the image level for all results in the MD-formatted results
145
- file [input_file], optionally writing a new set of results to [output_file].
145
+ Count the number of instances of each category in the detections list
146
+ [detections] that have an above-threshold detection. Sort results in descending
147
+ order by count. Returns a dict mapping category ID --> count. If no detections
148
+ are above threshold, returns an empty dict.
146
149
 
147
- This function generally expresses the notion that an image with 700 cows and one deer
148
- is really just 701 cows.
150
+ Assumes that if the 'classifications' field is present for a detection, it has
151
+ length 1, i.e. that non-top classifications have already been removed.
152
+ """
149
153
 
150
- Only count detections with a classification confidence threshold above
151
- [options.classification_confidence_threshold], which in practice means we're only
152
- looking at one category per detection.
154
+ category_to_count = defaultdict(int)
153
155
 
154
- If an image has at least [options.min_detections_above_threshold] such detections
155
- in the most common category, and no more than [options.max_detections_secondary_class]
156
- in the second-most-common category, flip all detections to the most common
157
- category.
156
+ for det in detections:
157
+ if ('classifications' in det) and (det['conf'] >= options.detection_confidence_threshold):
158
+ assert len(det['classifications']) == 1
159
+ c = det['classifications'][0]
160
+ if c[1] >= options.classification_confidence_threshold:
161
+ category_to_count[c[0]] += 1
162
+
163
+ category_to_count = {k: v for k, v in sorted(category_to_count.items(),
164
+ key=lambda item: item[1],
165
+ reverse=True)}
158
166
 
159
- Optionally treat some classes as particularly unreliable, typically used to overwrite an
160
- "other" class.
167
+ return category_to_count
168
+
169
+
170
+ def _get_description_string(category_to_count,classification_descriptions):
171
+ """
172
+ Return a string summarizing the image content according to [category_to_count].
173
+ """
161
174
 
162
- This function also removes everything but the non-dominant classification for each detection.
175
+ category_strings = []
176
+ # category_id = next(iter(category_to_count))
177
+ for category_id in category_to_count:
178
+ category_description = classification_descriptions[category_id]
179
+ tokens = category_description.split(';')
180
+ assert len(tokens) == 7
181
+ category_name = tokens[-1]
182
+ if len(category_name) == 0:
183
+ category_name = 'undefined category'
184
+ count = category_to_count[category_id]
185
+ category_string = '{} ({})'.format(category_name,count)
186
+ category_strings.append(category_string)
163
187
 
164
- Args:
165
- input_file (str): MegaDetector-formatted classification results file to smooth
166
- output_file (str, optional): .json file to write smoothed results
167
- options (ClassificationSmoothingOptionsImageLevel, optional): see
168
- ClassificationSmoothingOptionsImageLevel for details.
169
-
170
- Returns:
171
- dict: MegaDetector-results-formatted dict, identical to what's written to
172
- [output_file] if [output_file] is not None.
188
+ return ', '.join(category_strings)
189
+
190
+
191
+ def _print_counts_with_names(category_to_count,classification_descriptions):
192
+ """
193
+ Print a list of classification categories with counts, based in the name --> count
194
+ dict [category_to_count]
173
195
  """
174
196
 
175
- if options is None:
176
- options = ClassificationSmoothingOptionsImageLevel()
177
-
178
- with open(input_file,'r') as f:
179
- print('Loading results from:\n{}'.format(input_file))
180
- d = json.load(f)
197
+ for category_id in category_to_count:
198
+ category_name = classification_descriptions[category_id]
199
+ count = category_to_count[category_id]
200
+ print('{}: {} ({})'.format(category_id,category_name,count))
201
+
202
+
203
+ def _prepare_results_for_smoothing(input_file,options):
204
+ """
205
+ Load results from [input_file] if necessary, prepare category descriptions
206
+ for smoothing. Adds pre-smoothing descriptions to every image if the options
207
+ say we're supposed to do that.
208
+ """
181
209
 
210
+ if isinstance(input_file,str):
211
+ with open(input_file,'r') as f:
212
+ print('Loading results from:\n{}'.format(input_file))
213
+ d = json.load(f)
214
+ else:
215
+ assert isinstance(input_file,dict)
216
+ if options.modify_in_place:
217
+ d = input_file
218
+ else:
219
+ print('modify_in_place is False, copying the input before modifying')
220
+ d = copy.deepcopy(input_file)
221
+
222
+
223
+ ## Category processing
224
+
182
225
  category_name_to_id = {d['classification_categories'][k]:k for k in d['classification_categories']}
183
226
  other_category_ids = []
184
227
  for s in options.other_category_names:
185
228
  if s in category_name_to_id:
186
229
  other_category_ids.append(category_name_to_id[s])
187
- else:
188
- print('Warning: "other" category {} not present in file {}'.format(
189
- s,input_file))
190
-
191
- n_other_classifications_changed = 0
192
- n_other_images_changed = 0
193
-
194
- n_detections_flipped = 0
195
- n_images_changed = 0
196
-
230
+
197
231
  # Before we do anything else, get rid of everything but the top classification
198
- # for each detection.
199
- for im in tqdm(d['images']):
232
+ # for each detection, and remove the 'classifications' field from detections with
233
+ # no classifications.
234
+ for im in tqdm(d['images']):
235
+
200
236
  if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
201
237
  continue
202
238
 
@@ -204,7 +240,10 @@ def smooth_classification_results_image_level(input_file,output_file=None,option
204
240
 
205
241
  for det in detections:
206
242
 
207
- if 'classifications' not in det or len(det['classifications']) == 0:
243
+ if 'classifications' not in det:
244
+ continue
245
+ if len(det['classifications']) == 0:
246
+ del det['classifications']
208
247
  continue
209
248
 
210
249
  classification_confidence_values = [c[1] for c in det['classifications']]
@@ -215,271 +254,565 @@ def smooth_classification_results_image_level(input_file,output_file=None,option
215
254
 
216
255
  # ...for each image
217
256
 
218
- # im = d['images'][0]
219
- for im in tqdm(d['images']):
257
+
258
+ ## Clean up classification descriptions so we can test taxonomic relationships
259
+ ## by substring testing.
260
+
261
+ classification_descriptions_clean = None
262
+ classification_descriptions = None
263
+
264
+ if 'classification_category_descriptions' in d:
265
+ classification_descriptions = d['classification_category_descriptions']
266
+ classification_descriptions_clean = {}
267
+ # category_id = next(iter(classification_descriptions))
268
+ for category_id in classification_descriptions:
269
+ classification_descriptions_clean[category_id] = \
270
+ clean_taxonomy_string(classification_descriptions[category_id]).strip(';').lower()
271
+
272
+
273
+ ## Optionally add pre-smoothing descriptions to every image
274
+
275
+ if options.add_pre_smoothing_description:
220
276
 
221
- if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
222
- continue
277
+ for im in tqdm(d['images']):
278
+
279
+ if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
280
+ continue
281
+
282
+ detections = im['detections']
283
+ category_to_count = _count_detections_by_category(detections, options)
284
+
285
+ im['pre_smoothing_description'] = \
286
+ _get_description_string(category_to_count, classification_descriptions)
287
+
288
+
289
+ return {
290
+ 'd':d,
291
+ 'other_category_ids':other_category_ids,
292
+ 'classification_descriptions_clean':classification_descriptions_clean,
293
+ 'classification_descriptions':classification_descriptions
294
+ }
295
+
296
+ # ...def _prepare_results_for_smoothing(...)
297
+
298
+
299
+ def _smooth_classifications_for_list_of_detections(detections,
300
+ options,
301
+ other_category_ids,
302
+ classification_descriptions,
303
+ classification_descriptions_clean):
304
+ """
305
+ Smooth classifications for a list of detections, which may have come from a single
306
+ image, or may represent an entire sequence.
307
+
308
+ Returns None if no changes are made, else a dict.
309
+
310
+ classification_descriptions_clean should be semicolon-delimited taxonomic strings
311
+ from which common names and GUIDs have already been removed.
312
+
313
+ Assumes there is only one classification per detection, i.e. that non-top classifications
314
+ have already been removed.
315
+ """
316
+
317
+ ## Count the number of instances of each category in this image
318
+
319
+ category_to_count = _count_detections_by_category(detections, options)
320
+ # _print_counts_with_names(category_to_count,classification_descriptions)
321
+ # _get_description_string(category_to_count, classification_descriptions)
223
322
 
224
- detections = im['detections']
323
+ if len(category_to_count) <= 1:
324
+ return None
325
+
326
+ keys = list(category_to_count.keys())
225
327
 
226
- category_to_count = defaultdict(int)
328
+ # Handle a quirky special case: if the most common category is "other" and
329
+ # it's "tied" with the second-most-common category, swap them
330
+ if (len(keys) > 1) and \
331
+ (keys[0] in other_category_ids) and \
332
+ (keys[1] not in other_category_ids) and \
333
+ (category_to_count[keys[0]] == category_to_count[keys[1]]):
334
+ keys[1], keys[0] = keys[0], keys[1]
335
+
336
+ max_count = category_to_count[keys[0]]
337
+ most_common_category = keys[0]
338
+ del keys
339
+
340
+
341
+ ## Debug tools
342
+
343
+ verbose_debug_enabled = False
344
+
345
+ if options.break_at_image is not None:
227
346
  for det in detections:
228
- if ('classifications' in det) and (det['conf'] >= options.detection_confidence_threshold):
229
- for c in det['classifications']:
230
- if c[1] >= options.classification_confidence_threshold:
231
- category_to_count[c[0]] += 1
232
- # ...for each classification
233
- # ...if there are classifications for this detection
234
- # ...for each detection
235
-
236
- if len(category_to_count) <= 1:
237
- continue
238
-
239
- category_to_count = {k: v for k, v in sorted(category_to_count.items(),
240
- key=lambda item: item[1],
241
- reverse=True)}
242
-
243
- keys = list(category_to_count.keys())
244
-
245
- # Handle a quirky special case: if the most common category is "other" and
246
- # it's "tied" with the second-most-common category, swap them
247
- if (len(keys) > 1) and \
248
- (keys[0] in other_category_ids) and \
249
- (keys[1] not in other_category_ids) and \
250
- (category_to_count[keys[0]] == category_to_count[keys[1]]):
251
- keys[1], keys[0] = keys[0], keys[1]
252
-
253
- max_count = category_to_count[keys[0]]
254
- # secondary_count = category_to_count[keys[1]]
255
- # The 'secondary count' is the most common non-other class
256
- secondary_count = 0
257
- for i_key in range(1,len(keys)):
258
- if keys[i_key] not in other_category_ids:
259
- secondary_count = category_to_count[keys[i_key]]
347
+ if 'image_filename' in det and \
348
+ det['image_filename'] == options.break_at_image:
349
+ verbose_debug_enabled = True
260
350
  break
261
-
262
- most_common_category = keys[0]
263
-
264
- assert max_count >= secondary_count
351
+
352
+ if verbose_debug_enabled:
353
+ _print_counts_with_names(category_to_count,classification_descriptions)
354
+ import pdb; pdb.set_trace()
355
+
356
+
357
+ ## Possibly change "other" classifications to the most common category
358
+
359
+ # ...if the dominant category is not an "other" category.
360
+
361
+ n_other_classifications_changed_this_image = 0
362
+
363
+ # If we have at least *min_detections_to_overwrite_other* in a category that isn't
364
+ # "other", change all "other" classifications to that category
365
+ if (max_count >= options.min_detections_to_overwrite_other) and \
366
+ (most_common_category not in other_category_ids):
265
367
 
266
- # If we have at least *min_detections_to_overwrite_other* in a category that isn't
267
- # "other", change all "other" classifications to that category
268
- if max_count >= options.min_detections_to_overwrite_other and \
269
- most_common_category not in other_category_ids:
368
+ for det in detections:
270
369
 
271
- other_change_made = False
370
+ if ('classifications' not in det) or \
371
+ (det['conf'] < options.detection_confidence_threshold):
372
+ continue
272
373
 
273
- for det in detections:
374
+ assert len(det['classifications']) == 1
375
+ c = det['classifications'][0]
274
376
 
275
- if ('classifications' in det) and \
276
- (det['conf'] >= options.detection_overwrite_threshold):
377
+ if (c[1] >= options.classification_confidence_threshold) and \
378
+ (c[0] in other_category_ids):
277
379
 
278
- for c in det['classifications']:
380
+ n_other_classifications_changed_this_image += 1
381
+ c[0] = most_common_category
382
+
383
+ # ...if there are classifications for this detection
384
+
385
+ # ...for each detection
386
+
387
+ # ...if we should overwrite all "other" classifications
388
+
389
+
390
+ ## Re-count
391
+
392
+ category_to_count = _count_detections_by_category(detections, options)
393
+ # _print_counts_with_names(category_to_count,classification_descriptions)
394
+ keys = list(category_to_count.keys())
395
+ max_count = category_to_count[keys[0]]
396
+ most_common_category = keys[0]
397
+ del keys
398
+
399
+
400
+ ## Possibly change some non-dominant classifications to the dominant category
401
+
402
+ n_detections_flipped_this_image = 0
403
+
404
+ # Don't do this if the most common category is an "other" category, or
405
+ # if we don't have enough of the most common category
406
+ if (most_common_category not in other_category_ids) and \
407
+ (max_count >= options.min_detections_to_overwrite_secondary):
408
+
409
+ # i_det = 0; det = detections[i_det]
410
+ for i_det,det in enumerate(detections):
279
411
 
280
- if c[1] >= options.classification_overwrite_threshold and \
281
- c[0] in other_category_ids:
282
-
283
- n_other_classifications_changed += 1
284
- other_change_made = True
285
- c[0] = most_common_category
286
-
287
- # ...for each classification
288
-
289
- # ...if there are classifications for this detection
412
+ if ('classifications' not in det) or \
413
+ (det['conf'] < options.detection_confidence_threshold):
414
+ continue
290
415
 
291
- # ...for each detection
416
+ assert len(det['classifications']) == 1
417
+ c = det['classifications'][0]
292
418
 
293
- if other_change_made:
294
- n_other_images_changed += 1
419
+ # Don't over-write the most common category with itself
420
+ if c[0] == most_common_category:
421
+ continue
422
+
423
+ # Don't bother with below-threshold classifications
424
+ if c[1] < options.classification_confidence_threshold:
425
+ continue
426
+
427
+ # If we have fewer of this category than the most common category,
428
+ # but not *too* many, flip it to the most common category.
429
+ if (max_count > category_to_count[c[0]]) and \
430
+ (category_to_count[c[0]] <= options.max_detections_nondominant_class):
431
+
432
+ c[0] = most_common_category
433
+ n_detections_flipped_this_image += 1
295
434
 
296
- # ...if we should overwrite all "other" classifications
435
+ # ...for each detection
436
+
437
+ # ...if the dominant category is legit
297
438
 
298
- if max_count < options.min_detections_above_threshold:
299
- continue
300
-
301
- if secondary_count >= options.max_detections_secondary_class:
302
- continue
303
-
304
- # At this point, we know we have a dominant category; change all other above-threshold
305
- # classifications to that category. That category may have been "other", in which
306
- # case we may have already made the relevant changes.
307
-
308
- n_detections_flipped_this_image = 0
309
-
310
- # det = detections[0]
439
+
440
+ ## Re-count
441
+
442
+ category_to_count = _count_detections_by_category(detections, options)
443
+ # _print_counts_with_names(category_to_count,classification_descriptions)
444
+ keys = list(category_to_count.keys())
445
+ max_count = category_to_count[keys[0]]
446
+ most_common_category = keys[0]
447
+ del keys
448
+
449
+
450
+ ## Possibly collapse higher-level taxonomic predictions down to lower levels
451
+
452
+ # ...when the most common class is a child of a less common class.
453
+
454
+ n_taxonomic_changes_this_image = 0
455
+
456
+ process_taxonomic_rules = \
457
+ (classification_descriptions_clean is not None) and \
458
+ (len(classification_descriptions_clean) > 0) and \
459
+ (len(category_to_count) > 1)
460
+
461
+ if process_taxonomic_rules and options.propagate_classifications_through_taxonomy:
462
+
463
+ # det = detections[3]
311
464
  for det in detections:
312
465
 
313
- if ('classifications' in det) and \
314
- (det['conf'] >= options.detection_overwrite_threshold):
315
-
316
- for c in det['classifications']:
317
- if c[1] >= options.classification_overwrite_threshold and \
318
- c[0] != most_common_category:
466
+ if ('classifications' not in det) or \
467
+ (det['conf'] < options.detection_confidence_threshold):
468
+ continue
319
469
 
320
- c[0] = most_common_category
321
- n_detections_flipped += 1
322
- n_detections_flipped_this_image += 1
470
+ assert len(det['classifications']) == 1
471
+ c = det['classifications'][0]
472
+
473
+ # Don't bother with any classifications below the confidence threshold
474
+ if c[1] < options.classification_confidence_threshold:
475
+ continue
476
+
477
+ category_id_this_classification = c[0]
478
+ assert category_id_this_classification in category_to_count
479
+
480
+ category_description_this_classification = \
481
+ classification_descriptions_clean[category_id_this_classification]
482
+
483
+ # An empty description corresponds to the "animal" category. We don't handle
484
+ # "animal" here as a parent category, that would be handled in the "other smoothing"
485
+ # step above.
486
+ if len(category_description_this_classification) == 0:
487
+ continue
488
+
489
+ # We may have multiple child categories to choose from; this keeps track of
490
+ # the "best" we've seen so far. "Best" is based on the level (species is better
491
+ # than genus) and number.
492
+ child_category_to_score = defaultdict(float)
493
+
494
+ for category_id_of_candidate_child in category_to_count.keys():
495
+
496
+ # A category is never its own child
497
+ if category_id_of_candidate_child == category_id_this_classification:
498
+ continue
323
499
 
324
- # ...for each classification
500
+ # Is this candidate a child of the current classification?
501
+ category_description_candidate_child = \
502
+ classification_descriptions_clean[category_id_of_candidate_child]
325
503
 
326
- # ...if there are classifications for this detection
504
+ # An empty description corresponds to "animal", which can never
505
+ # be a child of another category.
506
+ if len(category_description_candidate_child) == 0:
507
+ continue
508
+
509
+ # As long as we're using "clean" descriptions, parent/child taxonomic
510
+ # relationships are defined by a substring relationship
511
+ is_child = category_description_this_classification in \
512
+ category_description_candidate_child
513
+ if not is_child:
514
+ continue
515
+
516
+ # How many instances of this child category are there?
517
+ child_category_count = category_to_count[category_id_of_candidate_child]
518
+
519
+ # What taxonomy level is this child category defined at?
520
+ child_category_level = taxonomy_level_index(
521
+ classification_descriptions[category_id_of_candidate_child])
522
+
523
+ child_category_to_score[category_id_of_candidate_child] = \
524
+ child_category_level * options.taxonomy_propagation_level_weight + \
525
+ child_category_count * options.taxonomy_propagation_count_weight
526
+
527
+ # ...for each category we are considering reducing this classification to
528
+
529
+ # Did we find a category we want to change this classification to?
530
+ if len(child_category_to_score) > 0:
531
+
532
+ # Find the child category with the highest score
533
+ child_category_to_score = sort_dictionary_by_value(
534
+ child_category_to_score,reverse=True)
535
+ best_child_category = next(iter(child_category_to_score.keys()))
536
+
537
+ if verbose_debug_enabled:
538
+ old_category_name = \
539
+ classification_descriptions_clean[c[0]]
540
+ new_category_name = \
541
+ classification_descriptions_clean[best_child_category]
542
+ print('Replacing {} with {}'.format(
543
+ old_category_name,new_category_name))
544
+
545
+ c[0] = best_child_category
546
+ n_taxonomic_changes_this_image += 1
327
547
 
328
548
  # ...for each detection
329
549
 
330
- if n_detections_flipped_this_image > 0:
331
- n_images_changed += 1
550
+ # ...if we have taxonomic information available
332
551
 
333
- # ...for each image
334
552
 
335
- print('Classification smoothing: changed {} detections on {} images'.format(
336
- n_detections_flipped,n_images_changed))
553
+ ## Re-count
337
554
 
338
- print('"Other" smoothing: changed {} detections on {} images'.format(
339
- n_other_classifications_changed,n_other_images_changed))
340
-
341
- if output_file is not None:
342
- print('Writing results after image-level smoothing to:\n{}'.format(output_file))
343
- with open(output_file,'w') as f:
344
- json.dump(d,f,indent=1)
345
-
346
- return d
347
-
348
- # ...def smooth_classification_results_image_level(...)
349
-
350
-
351
- #%% Sequence-level smoothing
352
-
353
- def _results_for_sequence(images_this_sequence,filename_to_results):
354
- """
355
- Fetch MD results for every image in this sequence, based on the 'file_name' field
356
- """
555
+ category_to_count = _count_detections_by_category(detections, options)
556
+ # _print_counts_with_names(category_to_count,classification_descriptions)
557
+ keys = list(category_to_count.keys())
558
+ max_count = category_to_count[keys[0]]
559
+ most_common_category = keys[0]
560
+ del keys
357
561
 
358
- results_this_sequence = []
359
- for im in images_this_sequence:
360
- fn = im['file_name']
361
- results_this_image = filename_to_results[fn]
362
- assert isinstance(results_this_image,dict)
363
- results_this_sequence.append(results_this_image)
364
-
365
- return results_this_sequence
366
-
367
562
 
368
- def _top_classifications_for_sequence(images_this_sequence,filename_to_results,options):
369
- """
370
- Return all top-1 animal classifications for every detection in this
371
- sequence, regardless of confidence
372
-
373
- May modify [images_this_sequence] (removing non-top-1 classifications)
374
- """
563
+ ## Possibly do within-family smoothing
375
564
 
376
- classifications_this_sequence = []
377
-
378
- # im = images_this_sequence[0]
379
- for im in images_this_sequence:
380
-
381
- fn = im['file_name']
382
- results_this_image = filename_to_results[fn]
565
+ n_within_family_smoothing_changes = 0
566
+
567
+ # min_detections_to_overwrite_secondary_same_family = -1
568
+ # max_detections_nondominant_class_same_family = 1
569
+ family_level = taxonomy_level_string_to_index('family')
570
+
571
+ if process_taxonomic_rules:
383
572
 
384
- if results_this_image['detections'] is None:
385
- continue
573
+ category_description_most_common_category = \
574
+ classification_descriptions[most_common_category]
575
+ most_common_category_taxonomic_level = \
576
+ taxonomy_level_index(category_description_most_common_category)
577
+ n_most_common_category = category_to_count[most_common_category]
578
+ tokens = category_description_most_common_category.split(';')
579
+ assert len(tokens) == 7
580
+ most_common_category_family = tokens[3]
581
+ most_common_category_genus = tokens[4]
582
+
583
+ # Only consider remapping to genus or species level, and only when we have
584
+ # a high enough count in the most common category
585
+ if process_taxonomic_rules and \
586
+ (options.min_detections_to_overwrite_secondary_same_family > 0) and \
587
+ (most_common_category not in other_category_ids) and \
588
+ (most_common_category_taxonomic_level > family_level) and \
589
+ (n_most_common_category >= options.min_detections_to_overwrite_secondary_same_family):
590
+
591
+ # det = detections[0]
592
+ for det in detections:
593
+
594
+ if ('classifications' not in det) or \
595
+ (det['conf'] < options.detection_confidence_threshold):
596
+ continue
597
+
598
+ assert len(det['classifications']) == 1
599
+ c = det['classifications'][0]
600
+
601
+ # Don't over-write the most common category with itself
602
+ if c[0] == most_common_category:
603
+ continue
386
604
 
387
- # det = results_this_image['detections'][0]
388
- for det in results_this_image['detections']:
605
+ # Don't bother with below-threshold classifications
606
+ if c[1] < options.classification_confidence_threshold:
607
+ continue
608
+
609
+ n_candidate_flip_category = category_to_count[c[0]]
610
+
611
+ # Do we have too many of the non-dominant category to do this kind of swap?
612
+ if n_candidate_flip_category > \
613
+ options.max_detections_nondominant_class_same_family:
614
+ continue
615
+
616
+ # Don't flip classes when it's a tie
617
+ if n_candidate_flip_category == n_most_common_category:
618
+ continue
619
+
620
+ category_description_candidate_flip = \
621
+ classification_descriptions[c[0]]
622
+ tokens = category_description_candidate_flip.split(';')
623
+ assert len(tokens) == 7
624
+ candidate_flip_category_family = tokens[3]
625
+ candidate_flip_category_genus = tokens[4]
626
+ candidate_flip_category_taxonomic_level = \
627
+ taxonomy_level_index(category_description_candidate_flip)
389
628
 
390
- # Only process animal detections
391
- if det['category'] != options.animal_detection_category:
629
+ # Only proceed if we have valid family strings
630
+ if (len(candidate_flip_category_family) == 0) or \
631
+ (len(most_common_category_family) == 0):
392
632
  continue
393
633
 
394
- # Only process detections with classification information
395
- if 'classifications' not in det:
634
+ # Only proceed if the candidate and the most common category are in the same family
635
+ if candidate_flip_category_family != most_common_category_family:
396
636
  continue
397
637
 
398
- # We only care about top-1 classifications, remove everything else
399
- if len(det['classifications']) > 1:
400
-
401
- # Make sure the list of classifications is already sorted by confidence
402
- classification_confidence_values = [c[1] for c in det['classifications']]
403
- assert is_list_sorted(classification_confidence_values,reverse=True)
404
-
405
- # ...and just keep the first one
406
- det['classifications'] = [det['classifications'][0]]
407
-
408
- # Confidence values should be sorted within a detection; verify this, and ignore
409
- top_classification = det['classifications'][0]
638
+ # Don't flip from a species to the genus level in the same genus
639
+ if (candidate_flip_category_genus == most_common_category_genus) and \
640
+ (candidate_flip_category_taxonomic_level > \
641
+ most_common_category_taxonomic_level):
642
+ continue
643
+
644
+ old_category_name = classification_descriptions_clean[c[0]]
645
+ new_category_name = classification_descriptions_clean[most_common_category]
410
646
 
411
- classifications_this_sequence.append(top_classification)
412
-
413
- # ...for each detection in this image
647
+ c[0] = most_common_category
648
+ n_within_family_smoothing_changes += 1
649
+
650
+ # ...for each detection
414
651
 
415
- # ...for each image in this sequence
416
-
417
- return classifications_this_sequence
652
+ # ...if the dominant category is legit and we have taxonomic information available
653
+
654
+
655
+ return {'n_other_classifications_changed_this_image':n_other_classifications_changed_this_image,
656
+ 'n_detections_flipped_this_image':n_detections_flipped_this_image,
657
+ 'n_taxonomic_changes_this_image':n_taxonomic_changes_this_image,
658
+ 'n_within_family_smoothing_changes':n_within_family_smoothing_changes}
418
659
 
419
- # ..._top_classifications_for_sequence()
660
+ # ...def _smooth_classifications_for_list_of_detections(...)
420
661
 
421
662
 
422
- def _count_above_threshold_classifications(classifications_this_sequence,options):
663
+ def _smooth_single_image(im,
664
+ options,
665
+ other_category_ids,
666
+ classification_descriptions,
667
+ classification_descriptions_clean):
423
668
  """
424
- Given a list of classification objects (tuples), return a dict mapping
425
- category IDs to the count of above-threshold classifications.
669
+ Smooth classifications for a single image. Returns None if no changes are made,
670
+ else a dict.
671
+
672
+ classification_descriptions_clean should be semicolon-delimited taxonomic strings
673
+ from which common names and GUIDs have already been removed.
426
674
 
427
- This dict's keys will be sorted in descending order by frequency.
675
+ Assumes there is only one classification per detection, i.e. that non-top classifications
676
+ have already been remoevd.
428
677
  """
429
678
 
430
- # Count above-threshold classifications in this sequence
431
- category_to_count = defaultdict(int)
432
- for c in classifications_this_sequence:
433
- if c[1] >= options.classification_confidence_threshold:
434
- category_to_count[c[0]] += 1
679
+ if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
680
+ return
435
681
 
436
- # Sort the dictionary in descending order by count
437
- category_to_count = {k: v for k, v in sorted(category_to_count.items(),
438
- key=lambda item: item[1],
439
- reverse=True)}
682
+ detections = im['detections']
440
683
 
441
- keys_sorted_by_frequency = list(category_to_count.keys())
684
+ # Simplify debugging
685
+ for det in detections:
686
+ det['image_filename'] = im['file']
442
687
 
443
- # Handle a quirky special case: if the most common category is "other" and
444
- # it's "tied" with the second-most-common category, swap them.
445
- if (options.other_category_names is not None) and (len(options.other_category_names) > 0):
446
- if (len(keys_sorted_by_frequency) > 1) and \
447
- (keys_sorted_by_frequency[0] in options.other_category_names) and \
448
- (keys_sorted_by_frequency[1] not in options.other_category_names) and \
449
- (category_to_count[keys_sorted_by_frequency[0]] == \
450
- category_to_count[keys_sorted_by_frequency[1]]):
451
- keys_sorted_by_frequency[1], keys_sorted_by_frequency[0] = \
452
- keys_sorted_by_frequency[0], keys_sorted_by_frequency[1]
453
-
454
- sorted_category_to_count = {}
455
- for k in keys_sorted_by_frequency:
456
- sorted_category_to_count[k] = category_to_count[k]
457
-
458
- return sorted_category_to_count
459
-
460
- # ...def _count_above_threshold_classifications()
461
-
462
-
463
- def _sort_images_by_time(images):
688
+ to_return = _smooth_classifications_for_list_of_detections(detections,
689
+ options=options,
690
+ other_category_ids=other_category_ids,
691
+ classification_descriptions=classification_descriptions,
692
+ classification_descriptions_clean=classification_descriptions_clean)
693
+
694
+ # Clean out debug information
695
+ for det in detections:
696
+ del det['image_filename']
697
+
698
+ return to_return
699
+
700
+ # ...def smooth_single_image
701
+
702
+
703
+ #%% Image-level smoothing
704
+
705
+ def smooth_classification_results_image_level(input_file,output_file=None,options=None):
464
706
  """
465
- Returns a copy of [images], sorted by the 'datetime' field (ascending).
707
+ Smooth classifications at the image level for all results in the MD-formatted results
708
+ file [input_file], optionally writing a new set of results to [output_file].
709
+
710
+ This function generally expresses the notion that an image with 700 cows and one deer
711
+ is really just 701 cows.
712
+
713
+ Only count detections with a classification confidence threshold above
714
+ [options.classification_confidence_threshold], which in practice means we're only
715
+ looking at one category per detection.
716
+
717
+ If an image has at least [options.min_detections_to_overwrite_secondary] such detections
718
+ in the most common category, and no more than [options.max_detections_nondominant_class]
719
+ in the second-most-common category, flip all detections to the most common
720
+ category.
721
+
722
+ Optionally treat some classes as particularly unreliable, typically used to overwrite an
723
+ "other" class.
724
+
725
+ This function also removes everything but the non-dominant classification for each detection.
726
+
727
+ Args:
728
+ input_file (str): MegaDetector-formatted classification results file to smooth. Can
729
+ also be an already-loaded results dict.
730
+ output_file (str, optional): .json file to write smoothed results
731
+ options (ClassificationSmoothingOptions, optional): see
732
+ ClassificationSmoothingOptions for details.
733
+
734
+ Returns:
735
+ dict: MegaDetector-results-formatted dict, identical to what's written to
736
+ [output_file] if [output_file] is not None.
466
737
  """
467
- return sorted(images, key = lambda im: im['datetime'])
468
738
 
739
+ ## Input validation
740
+
741
+ if options is None:
742
+ options = ClassificationSmoothingOptions()
743
+
744
+ r = _prepare_results_for_smoothing(input_file, options)
745
+ d = r['d']
746
+ other_category_ids = r['other_category_ids']
747
+ classification_descriptions_clean = r['classification_descriptions_clean']
748
+ classification_descriptions = r['classification_descriptions']
749
+
750
+
751
+ ## Smoothing
752
+
753
+ n_other_classifications_changed = 0
754
+ n_other_images_changed = 0
755
+ n_taxonomic_images_changed = 0
756
+
757
+ n_detections_flipped = 0
758
+ n_images_changed = 0
759
+ n_taxonomic_classification_changes = 0
760
+
761
+ # im = d['images'][0]
762
+ for im in tqdm(d['images']):
763
+
764
+ r = _smooth_single_image(im,
765
+ options,
766
+ other_category_ids,
767
+ classification_descriptions=classification_descriptions,
768
+ classification_descriptions_clean=classification_descriptions_clean)
769
+
770
+ if r is None:
771
+ continue
772
+
773
+ n_detections_flipped_this_image = r['n_detections_flipped_this_image']
774
+ n_other_classifications_changed_this_image = \
775
+ r['n_other_classifications_changed_this_image']
776
+ n_taxonomic_changes_this_image = r['n_taxonomic_changes_this_image']
777
+
778
+ n_detections_flipped += n_detections_flipped_this_image
779
+ n_other_classifications_changed += n_other_classifications_changed_this_image
780
+ n_taxonomic_classification_changes += n_taxonomic_changes_this_image
781
+
782
+ if n_detections_flipped_this_image > 0:
783
+ n_images_changed += 1
784
+ if n_other_classifications_changed_this_image > 0:
785
+ n_other_images_changed += 1
786
+ if n_taxonomic_changes_this_image > 0:
787
+ n_taxonomic_images_changed += 1
788
+
789
+ # ...for each image
790
+
791
+ print('Classification smoothing: changed {} detections on {} images'.format(
792
+ n_detections_flipped,n_images_changed))
793
+
794
+ print('"Other" smoothing: changed {} detections on {} images'.format(
795
+ n_other_classifications_changed,n_other_images_changed))
796
+
797
+ print('Taxonomic smoothing: changed {} detections on {} images'.format(
798
+ n_taxonomic_classification_changes,n_taxonomic_images_changed))
799
+
800
+
801
+ ## Write output
802
+
803
+ if output_file is not None:
804
+ print('Writing results after image-level smoothing to:\n{}'.format(output_file))
805
+ with open(output_file,'w') as f:
806
+ json.dump(d,f,indent=1)
469
807
 
470
- def _get_first_key_from_sorted_dictionary(di):
471
- if len(di) == 0:
472
- return None
473
- return next(iter(di.items()))[0]
474
-
808
+ return d
475
809
 
476
- def _get_first_value_from_sorted_dictionary(di):
477
- if len(di) == 0:
478
- return None
479
- return next(iter(di.items()))[1]
810
+ # ...def smooth_classification_results_image_level(...)
480
811
 
481
812
 
482
- def smooth_classification_results_sequence_level(md_results,
813
+ #%% Sequence-level smoothing
814
+
815
+ def smooth_classification_results_sequence_level(input_file,
483
816
  cct_sequence_information,
484
817
  output_file=None,
485
818
  options=None):
@@ -491,44 +824,33 @@ def smooth_classification_results_sequence_level(md_results,
491
824
  deer/deer/deer/elk/deer/deer/deer/deer is really just a deer.
492
825
 
493
826
  Args:
494
- md_results (str or dict): MegaDetector-formatted classification results file to smooth
827
+ input_file (str or dict): MegaDetector-formatted classification results file to smooth
495
828
  (or already-loaded results). If you supply a dict, it's modified in place by default, but
496
829
  a copy can be forced by setting options.modify_in_place=False.
497
830
  cct_sequence_information (str, dict, or list): COCO Camera Traps file containing sequence IDs for
498
831
  each image (or an already-loaded CCT-formatted dict, or just the 'images' list from a CCT dict).
499
832
  output_file (str, optional): .json file to write smoothed results
500
- options (ClassificationSmoothingOptionsSequenceLevel, optional): see
501
- ClassificationSmoothingOptionsSequenceLevel for details.
833
+ options (ClassificationSmoothingOptions, optional): see
834
+ ClassificationSmoothingOptions for details.
502
835
 
503
836
  Returns:
504
837
  dict: MegaDetector-results-formatted dict, identical to what's written to
505
838
  [output_file] if [output_file] is not None.
506
839
  """
507
840
 
508
- if options is None:
509
- options = ClassificationSmoothingOptionsSequenceLevel()
841
+ ## Input validation
510
842
 
511
- if options.category_names_to_smooth_to is None:
512
- options.category_names_to_smooth_to = []
843
+ if options is None:
844
+ options = ClassificationSmoothingOptions()
513
845
 
514
- if options.other_category_names is None:
515
- options.other_category_names = []
846
+ r = _prepare_results_for_smoothing(input_file, options)
847
+ d = r['d']
848
+ other_category_ids = r['other_category_ids']
849
+ classification_descriptions_clean = r['classification_descriptions_clean']
850
+ classification_descriptions = r['classification_descriptions']
516
851
 
517
- assert None in options.min_dominant_class_ratio_for_secondary_override_table, \
518
- 'Oops, it looks like you removed the default (None) key from ' + \
519
- 'options.min_dominant_class_ratio_for_secondary_override_table'
520
852
 
521
- if isinstance(md_results,str):
522
- print('Loading MD results from {}'.format(md_results))
523
- with open(md_results,'r') as f:
524
- md_results = json.load(f)
525
- else:
526
- assert isinstance(md_results,dict)
527
- if not options.modify_in_place:
528
- print('Copying MD results instead of modifying in place')
529
- md_results = copy.deepcopy(md_results)
530
- else:
531
- print('Smoothing MD results in place')
853
+ ## Make a list of images appearing in each sequence
532
854
 
533
855
  if isinstance(cct_sequence_information,list):
534
856
  image_info = cct_sequence_information
@@ -540,177 +862,119 @@ def smooth_classification_results_sequence_level(md_results,
540
862
  else:
541
863
  assert isinstance(cct_sequence_information,dict)
542
864
  image_info = cct_sequence_information['images']
543
-
544
865
 
545
- ##%% Make a list of images appearing at each location
546
-
547
- sequence_to_images = defaultdict(list)
866
+ sequence_to_image_filenames = defaultdict(list)
548
867
 
549
868
  # im = image_info[0]
550
869
  for im in tqdm(image_info):
551
- sequence_to_images[im['seq_id']].append(im)
552
-
553
- all_sequences = list(sorted(sequence_to_images.keys()))
554
-
555
-
556
- ##%% Load classification results
557
-
558
- # Map each filename to classification results for that file
559
- filename_to_results = {}
560
-
561
- for im in tqdm(md_results['images']):
562
- filename_to_results[im['file'].replace('\\','/')] = im
563
-
564
-
565
- ##%% Smooth classification results over sequences (prep)
566
-
567
- classification_category_id_to_name = md_results['classification_categories']
568
- classification_category_name_to_id = {v: k for k, v in classification_category_id_to_name.items()}
569
-
570
- class_names = list(classification_category_id_to_name.values())
571
-
572
- assert(md_results['detection_categories'][options.animal_detection_category] == 'animal')
573
-
574
- other_category_ids = set([classification_category_name_to_id[s] for s in options.other_category_names])
575
-
576
- category_ids_to_smooth_to = set([classification_category_name_to_id[s] for s in options.category_names_to_smooth_to])
577
- assert all([s in class_names for s in options.category_names_to_smooth_to])
578
-
579
-
580
- ##%% Smooth classifications at the sequence level (main loop)
581
-
582
- n_other_flips = 0
583
- n_classification_flips = 0
584
- n_unclassified_flips = 0
870
+ sequence_to_image_filenames[im['seq_id']].append(im['file_name'])
871
+ del image_info
872
+
873
+ image_fn_to_classification_results = {}
874
+ for im in d['images']:
875
+ fn = im['file']
876
+ assert fn not in image_fn_to_classification_results
877
+ image_fn_to_classification_results[fn] = im
878
+
879
+
880
+ ## Smoothing
585
881
 
586
- # Break if this token is contained in a filename (set to None for normal operation)
587
- debug_fn = None
882
+ n_other_classifications_changed = 0
883
+ n_other_sequences_changed = 0
884
+ n_taxonomic_sequences_changed = 0
885
+ n_within_family_sequences_changed = 0
588
886
 
589
- # i_sequence = 0; seq_id = all_sequences[i_sequence]
590
- for i_sequence,seq_id in tqdm(enumerate(all_sequences),total=len(all_sequences)):
887
+ n_detections_flipped = 0
888
+ n_sequences_changed = 0
889
+ n_taxonomic_classification_changes = 0
890
+ n_within_family_changes = 0
591
891
 
592
- images_this_sequence = sequence_to_images[seq_id]
892
+ # sequence_id = list(sequence_to_image_filenames.keys())[0]
893
+ for sequence_id in sequence_to_image_filenames.keys():
894
+
895
+ image_filenames_this_sequence = sequence_to_image_filenames[sequence_id]
593
896
 
594
- # Count top-1 classifications in this sequence (regardless of confidence)
595
- classifications_this_sequence = _top_classifications_for_sequence(images_this_sequence,
596
- filename_to_results,
597
- options)
897
+ # if 'file' in image_filenames_this_sequence:
898
+ # import pdb; pdb.set_trace()
899
+
900
+ detections_this_sequence = []
901
+ for image_filename in image_filenames_this_sequence:
902
+ im = image_fn_to_classification_results[image_filename]
903
+ if 'detections' not in im or im['detections'] is None:
904
+ continue
905
+ detections_this_sequence.extend(im['detections'])
906
+
907
+ # Temporarily add image filenames to every detection,
908
+ # for debugging
909
+ for det in im['detections']:
910
+ det['image_filename'] = im['file']
598
911
 
599
- # Handy debugging code for looking at the numbers for a particular sequence
600
- for im in images_this_sequence:
601
- if debug_fn is not None and debug_fn in im['file_name']:
602
- raise ValueError('')
603
-
604
- if len(classifications_this_sequence) == 0:
912
+ if len(detections_this_sequence) == 0:
605
913
  continue
606
914
 
607
- # Count above-threshold classifications for each category
608
- sorted_category_to_count = _count_above_threshold_classifications(
609
- classifications_this_sequence,options)
610
-
611
- if len(sorted_category_to_count) == 0:
915
+ r = _smooth_classifications_for_list_of_detections(
916
+ detections=detections_this_sequence,
917
+ options=options,
918
+ other_category_ids=other_category_ids,
919
+ classification_descriptions=classification_descriptions,
920
+ classification_descriptions_clean=classification_descriptions_clean)
921
+
922
+ if r is None:
612
923
  continue
613
924
 
614
- max_count = _get_first_value_from_sorted_dictionary(sorted_category_to_count)
615
- dominant_category_id = _get_first_key_from_sorted_dictionary(sorted_category_to_count)
616
-
617
- # If our dominant category ID isn't something we want to smooth to,
618
- # don't mess around with this sequence
619
- if dominant_category_id not in category_ids_to_smooth_to:
620
- continue
925
+ n_detections_flipped_this_sequence = r['n_detections_flipped_this_image']
926
+ n_other_classifications_changed_this_sequence = \
927
+ r['n_other_classifications_changed_this_image']
928
+ n_taxonomic_changes_this_sequence = r['n_taxonomic_changes_this_image']
929
+ n_within_family_changes_this_sequence = r['n_within_family_smoothing_changes']
930
+
931
+ n_detections_flipped += n_detections_flipped_this_sequence
932
+ n_other_classifications_changed += n_other_classifications_changed_this_sequence
933
+ n_taxonomic_classification_changes += n_taxonomic_changes_this_sequence
934
+ n_within_family_changes += n_within_family_changes_this_sequence
935
+
936
+ if n_detections_flipped_this_sequence > 0:
937
+ n_sequences_changed += 1
938
+ if n_other_classifications_changed_this_sequence > 0:
939
+ n_other_sequences_changed += 1
940
+ if n_taxonomic_changes_this_sequence > 0:
941
+ n_taxonomic_sequences_changed += 1
942
+ if n_within_family_changes_this_sequence > 0:
943
+ n_within_family_sequences_changed += 1
621
944
 
622
-
623
- ## Smooth "other" classifications ##
624
-
625
- if max_count >= options.min_dominant_class_classifications_above_threshold_for_other_smoothing:
626
- for c in classifications_this_sequence:
627
- if c[0] in other_category_ids:
628
- n_other_flips += 1
629
- c[0] = dominant_category_id
630
- c[1] = options.flipped_other_confidence_value
945
+ # ...for each sequence
631
946
 
947
+ print('Classification smoothing: changed {} detections in {} sequences'.format(
948
+ n_detections_flipped,n_sequences_changed))
632
949
 
633
- # By not re-computing "max_count" here, we are making a decision that the count used
634
- # to decide whether a class should overwrite another class does not include any "other"
635
- # classifications we changed to be the dominant class. If we wanted to include those...
636
- #
637
- # sorted_category_to_count = count_above_threshold_classifications(classifications_this_sequence)
638
- # max_count = get_first_value_from_sorted_dictionary(sorted_category_to_count)
639
- # assert dominant_category_id == get_first_key_from_sorted_dictionary(sorted_category_to_count)
640
-
641
-
642
- ## Smooth non-dominant classes ##
643
-
644
- if max_count >= options.min_dominant_class_classifications_above_threshold_for_class_smoothing:
645
-
646
- # Don't flip classes to the dominant class if they have a large number of classifications
647
- category_ids_not_to_flip = set()
648
-
649
- for category_id in sorted_category_to_count.keys():
650
- secondary_class_count = sorted_category_to_count[category_id]
651
- dominant_to_secondary_ratio = max_count / secondary_class_count
652
-
653
- # Don't smooth over this class if there are a bunch of them, and the ratio
654
- # if primary to secondary class count isn't too large
655
-
656
- # Default ratio
657
- ratio_for_override = options.min_dominant_class_ratio_for_secondary_override_table[None]
658
-
659
- # Does this dominant class have a custom ratio?
660
- dominant_category_name = classification_category_id_to_name[dominant_category_id]
661
- if dominant_category_name in options.min_dominant_class_ratio_for_secondary_override_table:
662
- ratio_for_override = \
663
- options.min_dominant_class_ratio_for_secondary_override_table[dominant_category_name]
664
-
665
- if (dominant_to_secondary_ratio < ratio_for_override) and \
666
- (secondary_class_count > \
667
- options.max_secondary_class_classifications_above_threshold_for_class_smoothing):
668
- category_ids_not_to_flip.add(category_id)
669
-
670
- for c in classifications_this_sequence:
671
- if c[0] not in category_ids_not_to_flip and c[0] != dominant_category_id:
672
- c[0] = dominant_category_id
673
- c[1] = options.flipped_class_confidence_value
674
- n_classification_flips += 1
675
-
676
-
677
- ## Smooth unclassified detections ##
678
-
679
- if max_count >= options.min_dominant_class_classifications_above_threshold_for_unclassified_smoothing:
680
-
681
- results_this_sequence = _results_for_sequence(images_this_sequence,filename_to_results)
682
- detections_this_sequence = []
683
- for r in results_this_sequence:
684
- if r['detections'] is not None:
685
- detections_this_sequence.extend(r['detections'])
686
- for det in detections_this_sequence:
687
- if 'classifications' in det and len(det['classifications']) > 0:
688
- continue
689
- if det['category'] != options.animal_detection_category:
690
- continue
691
- if det['conf'] < options.min_detection_confidence_for_unclassified_flipping:
692
- continue
693
- det['classifications'] = [[dominant_category_id,options.flipped_unclassified_confidence_value]]
694
- n_unclassified_flips += 1
695
-
696
- # ...for each sequence
697
-
698
- print('\Finished sequence smoothing\n')
699
- print('Flipped {} "other" classifications'.format(n_other_flips))
700
- print('Flipped {} species classifications'.format(n_classification_flips))
701
- print('Flipped {} unclassified detections'.format(n_unclassified_flips))
702
-
950
+ print('"Other" smoothing: changed {} detections in {} sequences'.format(
951
+ n_other_classifications_changed,n_other_sequences_changed))
703
952
 
704
- ##%% Write smoothed classification results
953
+ print('Taxonomic smoothing: changed {} detections in {} sequences'.format(
954
+ n_taxonomic_classification_changes,n_taxonomic_sequences_changed))
955
+
956
+ print('Within-family smoothing: changed {} detections in {} sequences'.format(
957
+ n_within_family_changes,n_within_family_sequences_changed))
705
958
 
706
- if output_file is not None:
707
-
959
+
960
+ ## Clean up debug information
961
+
962
+ for im in d['images']:
963
+ if 'detections' not in im or im['detections'] is None:
964
+ continue
965
+ for det in im['detections']:
966
+ if 'image_filename' in det:
967
+ del det['image_filename']
968
+
969
+
970
+ ## Write output
971
+
972
+ if output_file is not None:
708
973
  print('Writing sequence-smoothed classification results to {}'.format(
709
- output_file))
710
-
974
+ output_file))
711
975
  with open(output_file,'w') as f:
712
- json.dump(md_results,f,indent=1)
976
+ json.dump(d,f,indent=1)
713
977
 
714
- return md_results
978
+ return d
715
979
 
716
980
  # ...smooth_classification_results_sequence_level(...)