megadetector-5.0.25-py3-none-any.whl → megadetector-5.0.27-py3-none-any.whl
- megadetector/data_management/cct_json_utils.py +15 -2
- megadetector/data_management/coco_to_yolo.py +53 -31
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +7 -3
- megadetector/data_management/databases/integrity_check_json_db.py +2 -2
- megadetector/data_management/lila/generate_lila_per_image_labels.py +2 -2
- megadetector/data_management/lila/test_lila_metadata_urls.py +21 -10
- megadetector/data_management/remap_coco_categories.py +60 -11
- megadetector/data_management/yolo_to_coco.py +45 -15
- megadetector/postprocessing/classification_postprocessing.py +788 -524
- megadetector/postprocessing/create_crop_folder.py +95 -33
- megadetector/postprocessing/load_api_results.py +4 -1
- megadetector/postprocessing/md_to_coco.py +1 -1
- megadetector/postprocessing/postprocess_batch_results.py +156 -42
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -8
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- megadetector/postprocessing/separate_detections_into_folders.py +20 -4
- megadetector/postprocessing/subset_json_detector_output.py +180 -15
- megadetector/postprocessing/validate_batch_results.py +13 -5
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +6 -6
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -58
- megadetector/taxonomy_mapping/species_lookup.py +45 -2
- megadetector/utils/ct_utils.py +4 -2
- megadetector/utils/directory_listing.py +1 -1
- megadetector/utils/md_tests.py +2 -1
- megadetector/utils/path_utils.py +308 -19
- megadetector/utils/wi_utils.py +363 -186
- megadetector/visualization/visualization_utils.py +2 -1
- megadetector/visualization/visualize_db.py +1 -1
- megadetector/visualization/visualize_detector_output.py +1 -4
- {megadetector-5.0.25.dist-info → megadetector-5.0.27.dist-info}/METADATA +4 -3
- {megadetector-5.0.25.dist-info → megadetector-5.0.27.dist-info}/RECORD +34 -34
- {megadetector-5.0.25.dist-info → megadetector-5.0.27.dist-info}/WHEEL +1 -1
- {megadetector-5.0.25.dist-info → megadetector-5.0.27.dist-info/licenses}/LICENSE +0 -0
- {megadetector-5.0.25.dist-info → megadetector-5.0.27.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py CHANGED

@@ -181,7 +181,7 @@ class RepeatDetectionOptions:
         #: Original size is preserved if this is None.
         #:
         #: This does *not* include the tile image grid.
-        self.maxOutputImageWidth =
+        self.maxOutputImageWidth = 2000
 
         #: Line thickness (in pixels) for box rendering
         self.lineThickness = 10
@@ -256,7 +256,7 @@ class RepeatDetectionOptions:
         self.detectionTilesPrimaryImageLocation = 'right'
 
         #: Maximum number of individual detection instances to include in the mosaic
-        self.detectionTilesMaxCrops =
+        self.detectionTilesMaxCrops = 150
 
         #: If bRenderOtherDetections is True, what color should we use to render the
         #: (hopefully pretty subtle) non-target detections?
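
Both changes just give previously-unset rendering options concrete defaults (a 2000-pixel cap on rendered output width, and at most 150 crops per detection mosaic). A minimal usage sketch, assuming find_repeat_detections() lives alongside RepeatDetectionOptions in this module; paths are placeholders, so the call is left commented out:

from megadetector.postprocessing.repeat_detection_elimination.repeat_detections_core import \
    RepeatDetectionOptions, find_repeat_detections

options = RepeatDetectionOptions()

# These now default to 2000 and 150; override them if the rendered
# mosaics come out too small or too large
options.maxOutputImageWidth = 4000
options.detectionTilesMaxCrops = 50

# results = find_repeat_detections('md_results.json', 'md_results_filtered.json', options)
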
megadetector/postprocessing/separate_detections_into_folders.py CHANGED

@@ -86,6 +86,7 @@ from functools import partial
 from tqdm import tqdm
 
 from megadetector.utils.ct_utils import args_to_object, is_float
+from megadetector.utils.path_utils import remove_empty_folders
 from megadetector.detection.run_detector import get_typical_confidence_threshold_from_results
 from megadetector.visualization import visualization_utils as vis_utils
 from megadetector.visualization.visualization_utils import blur_detections
@@ -167,7 +168,7 @@ class SeparateDetectionsIntoFoldersOptions:
         #:
         #: deer=0.75,cow=0.75
         #:
-        #:
+        #: String, converted internally to a dict mapping name:threshold
         self.classification_thresholds = None
 
         ## Debug or internal attributes
@@ -194,6 +195,10 @@ class SeparateDetectionsIntoFoldersOptions:
         #: Can also be a comma-separated list.
         self.category_names_to_blur = None
 
+        #: Remove all empty folders from the target folder at the end of the process,
+        #: whether or not they were created by this script
+        self.remove_empty_folders = False
+
     # ...__init__()
 
 # ...class SeparateDetectionsIntoFoldersOptions
@@ -319,7 +324,7 @@ def _process_detections(im,options):
 
             classification_category_id = classification[0]
             classification_confidence = classification[1]
-
+
             # Do we have a threshold for this category, and if so, is
             # this classification above threshold?
             assert options.classification_category_id_to_name is not None
@@ -521,7 +526,11 @@ def separate_detections_into_folders(options):
     for category_name in category_names:
 
         # Do we have a custom threshold for this category?
-
+        if category_name not in options.category_name_to_threshold:
+            print('Warning: category {} in detection file, but not in threshold mapping'.format(
+                category_name))
+            options.category_name_to_threshold[category_name] = None
+
         if options.category_name_to_threshold[category_name] is None:
             options.category_name_to_threshold[category_name] = default_threshold
 
@@ -584,7 +593,7 @@ def separate_detections_into_folders(options):
 
     # ...for each token
 
-    options.classification_thresholds = classification_thresholds
+    options.classification_thresholds = classification_thresholds
 
     # ...if classification thresholds are still in string format
 
@@ -611,6 +620,10 @@ def separate_detections_into_folders(options):
         pool = ThreadPool(options.n_threads)
         process_detections_with_options = partial(_process_detections, options=options)
         _ = list(tqdm(pool.imap(process_detections_with_options, images), total=len(images)))
+
+    if options.remove_empty_folders:
+        print('Removing empty folders from {}'.format(options.base_output_folder))
+        remove_empty_folders(options.base_output_folder)
 
 # ...def separate_detections_into_folders
 
@@ -715,6 +728,9 @@ def main():
                             default_box_expansion))
     parser.add_argument('--category_names_to_blur', type=str, default=None,
                         help='Comma-separated list of category names to blur (or a single category name, e.g. "person")')
+    parser.add_argument('--remove_empty_folders', action='store_true',
+                        help='Remove all empty folders from the target folder at the end of the process, ' + \
+                             'whether or not they were created by this script')
 
     if len(sys.argv[1:])==0:
         parser.print_help()
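
Together, these changes add an opt-in cleanup pass that runs after all images have been processed. A minimal sketch of driving it from Python: separate_detections_into_folders() and base_output_folder appear in the diff above, while results_file and base_input_folder are assumed names for this module's other path options, and all paths are placeholders:

from megadetector.postprocessing.separate_detections_into_folders import \
    SeparateDetectionsIntoFoldersOptions, separate_detections_into_folders

options = SeparateDetectionsIntoFoldersOptions()
options.results_file = 'md_results.json'        # placeholder (assumed option name)
options.base_input_folder = '/data/images'      # placeholder (assumed option name)
options.base_output_folder = '/data/separated'  # placeholder

# New in this release: delete every empty folder under the output folder
# when finished, whether or not this script created it
options.remove_empty_folders = True

separate_detections_into_folders(options)
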
megadetector/postprocessing/subset_json_detector_output.py CHANGED

@@ -61,9 +61,11 @@ import os
 import re
 
 from tqdm import tqdm
+from collections import defaultdict
 
 from megadetector.utils.ct_utils import args_to_object, get_max_conf, invert_dictionary
 from megadetector.utils.path_utils import top_level_folder
+from megadetector.utils.path_utils import recursive_file_list
 
 
 #%% Helper classes
@@ -136,7 +138,18 @@ class SubsetJsonDetectorOutputOptions:
 
     #: Set to >0 during testing to limit the number of images that get processed.
     self.debug_max_images = -1
+
+    #: Keep only files in this list, which can be a .json results file or a folder.
+    #
+    #: Assumes that the input .json file contains relative paths when comparing to a folder.
+    self.keep_files_in_list = None
+
+    #: Remove classification with <= N instances. Does not re-map categories
+    #: to be contiguous. Set to 1 to remove empty categories only.
+    self.remove_classification_categories_below_count = None
 
+# ...class SubsetJsonDetectorOutputOptions
+
 
 #%% Main function
@@ -156,11 +169,104 @@ def _write_detection_results(data, output_filename, options):
     else:
         os.makedirs(basedir, exist_ok=True)
 
-
-
+    n_images = len(data['images'])
+
+    print('Writing detection output (with {} images) to {}'.format(n_images,output_filename))
+    with open(output_filename, 'w', newline='\n') as f:
         json.dump(data,f,indent=1)
 
-# ..._write_detection_results()
+# ...def _write_detection_results(...)
+
+
+def remove_classification_categories_below_count(data, options):
+    """
+    Removes all classification categories below a threshold count. Does not re-map
+    classification category IDs.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.remove_classification_categories_below_count is None:
+        return data
+    if 'classification_categories' not in data:
+        return data
+
+    classification_category_id_to_count = {}
+
+    for classification_category_id in data['classification_categories']:
+        classification_category_id_to_count[classification_category_id] = 0
+
+    # Count the number of occurrences of each classification category
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            for classification in det['classifications']:
+                classification_category_id_to_count[classification[0]] = \
+                    classification_category_id_to_count[classification[0]] + 1
+
+
+    # Which categories have above-threshold counts?
+    classification_category_ids_to_keep = set()
+
+    for classification_category_id in classification_category_id_to_count:
+        if classification_category_id_to_count[classification_category_id] > \
+           options.remove_classification_categories_below_count:
+            classification_category_ids_to_keep.add(classification_category_id)
+
+    n_categories_removed = \
+        len(classification_category_id_to_count) - \
+        len(classification_category_ids_to_keep)
+
+    print('Removing {} of {} classification categories'.format(
+        n_categories_removed,len(classification_category_id_to_count)))
+
+    if n_categories_removed == 0:
+        return data
+
+
+    # Filter the category list
+    output_classification_categories = {}
+    for category_id in data['classification_categories']:
+        if category_id in classification_category_ids_to_keep:
+            output_classification_categories[category_id] = \
+                data['classification_categories'][category_id]
+    data['classification_categories'] = output_classification_categories
+    assert len(data['classification_categories']) == len(classification_category_ids_to_keep)
+
+
+    # If necessary, filter the category descriptions
+    if 'classification_category_descriptions' in data:
+        output_classification_category_descriptions = {}
+        for category_id in data['classification_category_descriptions']:
+            if category_id in classification_category_ids_to_keep:
+                output_classification_category_descriptions[category_id] = \
+                    data['classification_category_descriptions'][category_id]
+        data['classification_category_descriptions'] = output_classification_category_descriptions
+
+    # Filter images
+    for im in data['images']:
+        if 'detections' not in im or im['detections'] is None:
+            continue
+        for det in im['detections']:
+            if 'classifications' not in det:
+                continue
+            classifications_to_keep = []
+            for classification in det['classifications']:
+                if classification[0] in classification_category_ids_to_keep:
+                    classifications_to_keep.append(classification)
+            det['classifications'] = classifications_to_keep
+
+    return data
+
+# ...def remove_classification_categories_below_count(...)
 
 
 def subset_json_detector_output_by_confidence(data, options):
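
Because the new function is self-contained, its semantics are easy to check on a toy results dict; note that a category survives only if its count is strictly greater than the threshold. A quick sketch with made-up data:

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, remove_classification_categories_below_count

data = {
    'classification_categories': {'0': 'deer', '1': 'cow'},
    'images': [
        {'file': 'a.jpg',
         'detections': [
             {'category': '1', 'conf': 0.9, 'bbox': [0.1, 0.1, 0.5, 0.5],
              'classifications': [['0', 0.8], ['0', 0.7], ['1', 0.6]]}
         ]}
    ]
}

options = SubsetJsonDetectorOutputOptions()
options.remove_classification_categories_below_count = 1

data = remove_classification_categories_below_count(data, options)

# 'deer' (2 instances) passes the "> 1" test; 'cow' (1 instance) does not
assert list(data['classification_categories'].values()) == ['deer']
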
@@ -172,7 +278,7 @@ def subset_json_detector_output_by_confidence(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     if options.confidence_threshold is None:
@@ -234,9 +340,55 @@ def subset_json_detector_output_by_confidence(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_confidence()
+# ...def subset_json_detector_output_by_confidence(...)
+
+
+def subset_json_detector_output_by_list(data, options):
+    """
+    Keeps only files in options.keep_files_in_list, which can be a .json results file or a folder.
+    Assumes that the input .json file contains relative paths when comparing to a folder.
+
+    Args:
+        data (dict): data loaded from a MD results file
+        options (SubsetJsonDetectorOutputOptions): parameters for subsetting
+
+    Returns:
+        dict: Possibly-modified version of [data] (also modifies in place)
+    """
+
+    if options.keep_files_in_list is None:
+        return
+
+    files_to_keep = None
+
+    if os.path.isfile(options.keep_files_in_list):
+        with open(options.keep_files_in_list,'r') as f:
+            d = json.load(f)
+        files_to_keep = [im['file'] for im in d['images']]
+    elif os.path.isdir(options.keep_files_in_list):
+        files_to_keep = \
+            recursive_file_list(options.keep_files_in_list,return_relative_paths=True)
+    else:
+        raise ValueError('Subsetting .json file by list: {} is neither a .json results file nor a folder'.format(
+            options.keep_files_in_list))
+
+    files_to_keep = [fn.replace('\\','/') for fn in files_to_keep]
+    files_to_keep_set = set(files_to_keep)
+
+    images_to_keep = []
+
+    for im in data['images']:
+        fn = im['file'].replace('\\','/')
+        if fn in files_to_keep_set:
+            images_to_keep.append(im)
+
+    data['images'] = images_to_keep
+
+    return data
 
+# ...def subset_json_detector_output_by_list(...)
 
+
 def subset_json_detector_output_by_categories(data, options):
     """
     Removes all detections without detections above a threshold for specific categories.
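
Like the other subsetting helpers, the new by-list filter operates on in-memory data. A small sketch, assuming a second results file whose image list defines which files to keep (paths are placeholders); a folder path works the same way, enumerated via recursive_file_list():

import json

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, subset_json_detector_output_by_list

with open('md_results.json') as f:  # placeholder
    data = json.load(f)

options = SubsetJsonDetectorOutputOptions()
options.keep_files_in_list = 'md_results_subset.json'  # placeholder

data = subset_json_detector_output_by_list(data, options)
print('Kept {} images'.format(len(data['images'])))
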
@@ -246,7 +398,7 @@ def subset_json_detector_output_by_categories(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     # If categories_to_keep is supplied as a list, convert to a dict
@@ -342,7 +494,7 @@ def subset_json_detector_output_by_categories(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_categories()
+# ...def subset_json_detector_output_by_categories(...)
 
 
 def remove_failed_images(data,options):
@@ -354,7 +506,7 @@ def remove_failed_images(data,options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -381,7 +533,7 @@ def remove_failed_images(data,options):
 
     return data
 
-# ...remove_failed_images()
+# ...def remove_failed_images(...)
 
 
 def subset_json_detector_output_by_query(data, options):
@@ -394,7 +546,7 @@ def subset_json_detector_output_by_query(data, options):
         options (SubsetJsonDetectorOutputOptions): parameters for subsetting
 
     Returns:
-        dict: Possibly-modified version of data (also modifies in place)
+        dict: Possibly-modified version of [data] (also modifies in place)
     """
 
     images_in = data['images']
@@ -441,7 +593,7 @@ def subset_json_detector_output_by_query(data, options):
 
     return data
 
-# ...subset_json_detector_output_by_query()
+# ...def subset_json_detector_output_by_query(...)
 
 
 def subset_json_detector_output(input_filename, output_filename, options, data=None):
@@ -481,10 +633,10 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
         raise ValueError('When splitting by folders, output must be a valid directory name, you specified an existing file')
 
     if data is None:
-        print('Reading
+        print('Reading file {}'.format(input_filename))
         with open(input_filename) as f:
             data = json.load(f)
-        print('
+        print('Read {} images'.format(len(data['images'])))
         if options.debug_max_images > 0:
             print('Trimming to {} images'.format(options.debug_max_images))
             data['images'] = data['images'][:options.debug_max_images]
@@ -500,7 +652,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
     if options.remove_failed_images:
 
         data = remove_failed_images(data, options)
-
+
     if options.confidence_threshold is not None:
 
         data = subset_json_detector_output_by_confidence(data, options)
@@ -508,6 +660,14 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
     if (options.categories_to_keep is not None) or (options.category_names_to_keep is not None):
 
         data = subset_json_detector_output_by_categories(data, options)
+
+    if options.remove_classification_categories_below_count is not None:
+
+        data = remove_classification_categories_below_count(data, options)
+
+    if options.keep_files_in_list is not None:
+
+        data = subset_json_detector_output_by_list(data, options)
 
     if not options.split_folders:
 
@@ -615,7 +775,7 @@ def subset_json_detector_output(input_filename, output_filename, options, data=N
 
     # ...if we're splitting folders
 
-# ...subset_json_detector_output()
+# ...def subset_json_detector_output(...)
 
 
 #%% Interactive driver
@@ -676,6 +836,9 @@ def main():
                         help='Replace [query] with this')
     parser.add_argument('--confidence_threshold', type=float, default=None,
                         help='Remove detections below this confidence level')
+    parser.add_argument('--keep_files_in_list', type=str, default=None,
+                        help='Keep only files in this list, which can be a .json results file or a folder.' + \
+                             ' Assumes that the input .json file contains relative paths when comparing to a folder.')
     parser.add_argument('--split_folders', action='store_true',
                         help='Split .json files by leaf-node folder')
     parser.add_argument('--split_folder_param', type=int,
@@ -690,6 +853,8 @@ def main():
                         help='When using split_folders and make_folder_relative, copy jsons to their corresponding folders (relative to output_file)')
     parser.add_argument('--create_folders', action='store_true',
                         help='When using copy_jsons_to_folders, create folders that don''t exist')
+    parser.add_argument('--remove_classification_categories_below_count', type=int, default=None,
+                        help='Remove classification categories with less than this many instances (no removal by default)')
 
     if len(sys.argv[1:]) == 0:
         parser.print_help()
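
The two new command-line flags map one-to-one onto the options added earlier in this file, so an end-to-end run through the public entry point might look like the following sketch (paths and the count threshold are placeholders):

from megadetector.postprocessing.subset_json_detector_output import \
    SubsetJsonDetectorOutputOptions, subset_json_detector_output

# CLI equivalent: append these flags to an ordinary invocation of this script:
#
#   --keep_files_in_list /data/images-of-interest \
#   --remove_classification_categories_below_count 10

options = SubsetJsonDetectorOutputOptions()
options.keep_files_in_list = '/data/images-of-interest'
options.remove_classification_categories_below_count = 10

subset_json_detector_output('md_results.json', 'md_results_subset.json', options)
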
megadetector/postprocessing/validate_batch_results.py CHANGED

@@ -20,11 +20,19 @@ from tqdm import tqdm
 from megadetector.detection.video_utils import is_video_file
 from megadetector.utils.ct_utils import args_to_object, is_list_sorted  # noqa
 
-typical_info_fields = ['detector',
-                       '
-                       '
-
-
+typical_info_fields = ['detector',
+                       'detection_completion_time',
+                       'classifier',
+                       'classification_completion_time',
+                       'detection_metadata',
+                       'classifier_metadata']
+
+required_keys = ['info',
+                 'images',
+                 'detection_categories']
+
+typical_keys = ['classification_categories',
+                'classification_category_descriptions']
 
 
 #%% Classes
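
These module-level lists pin down the schema expectations: required_keys are the keys every MD results file must contain, while typical_keys and typical_info_fields are common but optional. Purely as an illustration of the kind of check they support (this is not the module's actual validation logic):

import json

required_keys = ['info', 'images', 'detection_categories']
typical_keys = ['classification_categories', 'classification_category_descriptions']

with open('md_results.json') as f:  # placeholder
    data = json.load(f)

for k in required_keys:
    assert k in data, 'Missing required key: {}'.format(k)

for k in data.keys():
    if (k not in required_keys) and (k not in typical_keys):
        print('Warning: unexpected key {}'.format(k))
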
megadetector/taxonomy_mapping/map_new_lila_datasets.py CHANGED

@@ -15,10 +15,10 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/
+output_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 datasets_to_map = [
-    '
+    'UNSW Predators'
 ]
 
@@ -125,6 +125,8 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)
 
+# from megadetector.utils.path_utils import open_file; open_file(output_file)
+
 
 #%% Manual lookup
 
@@ -138,10 +140,8 @@ if False:
 
     #%%
 
-
-
-    # q = 'notamacropus'
-    q = 'insects'
+    q = 'dasyurus maculatus'
+
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
megadetector/taxonomy_mapping/preview_lila_taxonomy.py CHANGED

@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.03.24.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -72,65 +72,10 @@ from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup()
 
 
-#%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
-if False:
-
-    #%%
-
-    source_mappings = ['gbif','manual']
-    target_mapping = 'inat'
-    valid_mappings = ['gbif','inat','manual']
-
-    assert target_mapping in valid_mappings
-    for source_mapping in source_mappings:
-        assert source_mapping != target_mapping and \
-            source_mapping in valid_mappings
-
-    n_remappings = 0
-
-    # i_row = 1; row = df.iloc[i_row]; row
-    for i_row,row in df.iterrows():
-
-        if row['source'] not in source_mappings:
-            continue
-
-        scientific_name = row['scientific_name']
-        old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-        m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-        if m is None or m.source != target_mapping:
-            print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-            continue
-
-        assert m.scientific_name == row['scientific_name']
-
-        if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-            pass
-        else:
-            assert m.taxonomic_level == row['taxonomy_level']
-
-        new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-        if row['taxonomy_string'] != m.taxonomy_string:
-            print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-            n_remappings += 1
-            df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-            if row['source'] != 'manual':
-                df.loc[i_row,'source'] = m.source
-
-    # This should be zero for the release .csv
-    print('Made {} remappings'.format(n_remappings))
-
-    #%%
-
-    df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
 #%% Check for mappings that disagree with the taxonomy string
 
+# For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
 df = pd.read_csv(lila_taxonomy_file)
 
 n_taxonomy_changes = 0
megadetector/taxonomy_mapping/species_lookup.py CHANGED

@@ -602,8 +602,17 @@ hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeke
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for
-    preferences that are specific to our scenario.
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    and preferences that are specific to our scenario.
+
+    Args:
+        query (str): The common or scientific name we want to look up
+        taxonomy_preference (str, optional): 'inat' or 'gbif'
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
+            substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
+
+    Returns:
+        TaxonomicMatch: the best taxonomic match, or None
     """
 
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
@@ -616,6 +625,36 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
     return m
 
 
+def validate_and_convert(data):
+    """
+    Recursively validates that all elements in the nested structure are only
+    tuples, lists, ints, or np.int64, and converts np.int64 to int.
+
+    Args:
+        data: The nested structure to validate and convert
+
+    Returns:
+        The validated and converted structure
+
+    Raises:
+        TypeError: If an invalid type is encountered
+    """
+
+    if isinstance(data, np.int64):
+        return int(data)
+    elif isinstance(data, int) or isinstance(data, str):
+        return data
+    elif isinstance(data, (list, tuple)):
+        # Process lists and tuples recursively
+        container_type = type(data)
+        return container_type(validate_and_convert(item) for item in data)
+    else:
+        raise TypeError(f"Invalid type encountered: {type(data).__name__}. "
+                        f"Only int, np.int64, list, and tuple are allowed.")
+
+# ...def validate_and_convert(...)
+
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
 
     query = query.lower().strip().replace('_', ' ')
@@ -760,6 +799,10 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
     # ...if we needed to look in the GBIF taxonomy
 
+    # Convert np.int64's to ints
+    if match is not None:
+        match = validate_and_convert(match)
+
     taxonomy_string = str(match)
 
     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
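
The motivation for the new conversion step: taxonomy matches are presumably assembled from pandas/numpy lookups, so integer fields can come back as np.int64, which the json module refuses to serialize. validate_and_convert() normalizes those values before the match is turned into a taxonomy string. A quick illustration (the match structure here is made up):

import json
import numpy as np

from megadetector.taxonomy_mapping.species_lookup import validate_and_convert

match = [(np.int64(42), 'species', 'dasyurus maculatus')]

# json.dumps(match) would raise "TypeError: Object of type int64 is not JSON serializable"
clean = validate_and_convert(match)
print(json.dumps(clean))  # [[42, "species", "dasyurus maculatus"]]
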
megadetector/utils/ct_utils.py CHANGED
@@ -483,7 +483,9 @@ def sort_dictionary_by_key(d,reverse=False):
 def sort_dictionary_by_value(d,sort_values=None,reverse=False):
     """
     Sorts the dictionary [d] by value. If sort_values is None, uses d.values(),
-    otherwise uses the dictionary sort_values as the sorting criterion.
+    otherwise uses the dictionary sort_values as the sorting criterion. Always
+    returns a new standard dict, so if [d] is, for example, a defaultdict, the
+    returned value is not.
 
     Args:
         d (dict): dictionary to sort
@@ -492,7 +494,7 @@ def sort_dictionary_by_value(d,sort_values=None,reverse=False):
         reverse (bool, optional): whether to sort in reverse (descending) order
 
     Returns:
-        dict: sorted copy of [d
+        dict: sorted copy of [d
     """
 
     if sort_values is None:
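
The expanded docstring documents behavior worth knowing: the function always returns a plain dict, so if you pass in, say, a defaultdict, the result will no longer auto-populate missing keys. A quick illustration:

from collections import defaultdict

from megadetector.utils.ct_utils import sort_dictionary_by_value

counts = defaultdict(int)
counts['deer'] += 3
counts['cow'] += 1

sorted_counts = sort_dictionary_by_value(counts, reverse=True)
print(type(sorted_counts).__name__)  # dict, not defaultdict
print(sorted_counts)                 # {'deer': 3, 'cow': 1}
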
megadetector/utils/directory_listing.py CHANGED

@@ -237,7 +237,7 @@ def main():
     args = parser.parse_args()
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
-    assert re.match('https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
+    assert re.match(r'https?://[^\.]+\.blob\.core\.windows\.net/.+', args.sas_url), "--sas_url does not " + \
         "match the format https://accname.blob.core.windows.net/bname/path/to/folder?..."
 
     traverse_and_create_index(args.directory, overwrite_files=args.enable_overwrite, sas_url=args.sas_url, basepath=args.basepath)
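
The only change here is the r prefix. Without it, escapes like \. in an ordinary string literal are invalid escape sequences, which recent Python versions flag (DeprecationWarning, and SyntaxWarning as of 3.12), even though the regex happened to behave identically. A short illustration of the corrected spelling:

import re

pattern = r'https?://[^\.]+\.blob\.core\.windows\.net/.+'
assert re.match(pattern, 'https://accname.blob.core.windows.net/bname/path/to/folder?sv=...')
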
megadetector/utils/md_tests.py CHANGED
@@ -1173,6 +1173,7 @@ def run_cli_tests(options):
     ## Return early if we're not running torch-related tests
 
     if options.test_mode == 'utils-only':
+        print('utils-only tests finished, returning')
         return
 
 
@@ -1828,7 +1829,7 @@ def main():
     parser.add_argument(
         '--test_mode',
         type=str,
-        default='
+        default='all',
         help='Test mode: "all" or "utils-only"'
     )
 