megadetector 5.0.6-py3-none-any.whl → 5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.
Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,6 @@
  # the same if you are reading this in Jupyter Notebook (using the .ipynb version of the
  # script):
  #
- # * You can specify the MegaDetector location, but you may find it useful to use the same paths
- # I use; on all the machines where I run MD, I keep all versions of MegaDetector handy at these
- # paths:
- #
- # ~/models/camera_traps/megadetector/md_v5.0.0/md_v5a.0.0.pt
- # ~/models/camera_traps/megadetector/md_v5.0.0/md_v5b.0.0.pt
- # ~/models/camera_traps/megadetector/md_v4.1.0/md_v4.1.0.pb
- #
- # On Windows, this translates to, for example:
- #
- # c:\users\dmorr\models\camera_traps\megadetector\md_v5.0.0\md_v5a.0.0.pt
- #
  # * Typically when I have a MegaDetector job to run, I make a copy of this script. Let's
  # say I'm running a job for an organization called "bibblebop"; I have a big folder of
  # job-specific copies of this script, and I might save a new one called "bibblebop-2023-07-26.py"
@@ -78,6 +66,7 @@ import json
  import os
  import stat
  import time
+ import re

  import humanfriendly

@@ -90,12 +79,12 @@ from md_utils.ct_utils import split_list_into_n_chunks

  from detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file
  from detection.run_detector import DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
+ from detection.run_detector import estimate_md_images_per_second

  from api.batch_processing.postprocessing.postprocess_batch_results import (
  PostProcessingOptions, process_batch_results)
  from detection.run_detector import get_detector_version_from_filename
-
- max_task_name_length = 92
+ from md_utils.ct_utils import image_file_to_camera_folder

  # To specify a non-default confidence threshold for including detections in the .json file
  json_threshold = None
@@ -103,61 +92,108 @@ json_threshold = None
  # Turn warnings into errors if more than this many images are missing
  max_tolerable_failed_images = 100

+ # Should we supply the --image_queue_option to run_detector_batch.py? I only set this
+ # when I have a very slow drive and a comparably fast GPU. When this is enabled, checkpointing
+ # is not supported within a job, so I set n_jobs to a large number (typically 100).
  use_image_queue = False

  # Only relevant when we're using a single GPU
  default_gpu_number = 0

+ # Should we supply --quiet to run_detector_batch.py?
  quiet_mode = True

  # Specify a target image size when running MD... strongly recommended to leave this at "None"
+ #
+ # When using augmented inference, if you leave this at "None", run_inference_with_yolov5_val.py
+ # will use its default size, which is 1280 * 1.3, which is almost always what you want.
  image_size = None

  # Only relevant when running on CPU
  ncores = 1

- # OS-specific script line continuation character
+ # OS-specific script line continuation character (modified later if we're running on Windows)
  slcc = '\\'

- # OS-specific script comment character
+ # OS-specific script comment character (modified later if we're running on Windows)
  scc = '#'

+ # # OS-specific script extension (modified later if we're running on Windows)
  script_extension = '.sh'

+ # If False, we'll load chunk files with file lists if they exist
+ force_enumeration = False
+
  # Prefer threads on Windows, processes on Linux
  parallelization_defaults_to_threads = False

  # This is for things like image rendering, not for MegaDetector
  default_workers_for_parallel_tasks = 30

+ overwrite_handling = 'skip' # 'skip', 'error', or 'overwrite'
+
+ # Only relevant to repeat detection elimination; try to identify EK113/RCNX101-style
+ # overflow folders and treat them as the same camera
+ overflow_folder_handling_enabled = True
+
+ # The function used to get camera names from image paths; can also replace this
+ # with a custom function.
+ relative_path_to_location = image_file_to_camera_folder
+
+ # This will be the .json results file after RDE; if this is still None when
+ # we get to classification stuff, that will indicate that we didn't do RDE.
+ filtered_output_filename = None
+
+ if os.name == 'nt':
+
+ slcc = '^'
+ scc = 'REM'
+ script_extension = '.bat'
+
+ # My experience has been that Python multiprocessing is flaky on Windows, so
+ # default to threads on Windows
+ parallelization_defaults_to_threads = True
+ default_workers_for_parallel_tasks = 10
+
+
+ ## Constants related to using YOLOv5's val.py
+
  # Should we use YOLOv5's val.py instead of run_detector_batch.py?
  use_yolo_inference_scripts = False

- # Directory in which to run val.py. Only relevant if use_yolo_inference_scripts is True.
+ # Directory in which to run val.py (relevant for YOLOv5, not for YOLOv8)
  yolo_working_dir = os.path.expanduser('~/git/yolov5')

+ # Only used for loading the mapping from class indices to names
+ yolo_dataset_file = None
+
+ # 'yolov5' or 'yolov8'; assumes YOLOv5 if this is None
+ yolo_model_type = None
+
+ # inference batch size
+ yolo_batch_size = 1
+
  # Should we remove intermediate files used for running YOLOv5's val.py?
  #
  # Only relevant if use_yolo_inference_scripts is True.
- remove_yolo_intermediate_results = False
- remove_yolo_symlink_folder = False
+ remove_yolo_intermediate_results = True
+ remove_yolo_symlink_folder = True
  use_symlinks_for_yolo_inference = True
+ write_yolo_debug_output = False

- overwrite_handling = 'skip' # 'skip', 'error', or 'overwrite'
+ # Should we apply YOLOv5's test-time augmentation?
+ augment = False

- # Set later if EK113/RCNX101-style overflow folders are being handled in this dataset
- overflow_folder_handling_enabled = False

- # Should we apply YOLOv5's augmentation? Only allowed when use_yolo_inference_scripts
- # is True.
- augment = False
+ ## Constants related to tiled inference

- if os.name == 'nt':
- slcc = '^'
- scc = 'REM'
- script_extension = '.bat'
- parallelization_defaults_to_threads = True
- default_workers_for_parallel_tasks = 10
+ use_tiled_inference = True
+
+ # Should we delete tiles after each job? Only set this to False for debugging;
+ # large jobs will take up a lot of space if you keep tiles around after each task.
+ remove_tiles = True
+ tile_size = (1280,1280)
+ tile_overlap = 0.2


  #%% Constants I set per script
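For reference, the "relative_path_to_location = image_file_to_camera_folder" default added above replaces the per-script sample function that previously lived in the RDE section (removed later in this diff). A minimal sketch of that behavior, reusing the overflow-folder patterns from the removed sample function; the real implementation lives in md_utils.ct_utils and may differ in detail:

import os
import re

def sketch_image_file_to_camera_folder(relative_path):
    # Collapse Reconyx/Bushnell/Browning overflow folders (100RECNX, 100EK113, 100_BTCF)
    # so that all images from one physical camera map to the same location string
    patterns = [r'/\d+RECNX/', r'/\d+EK\d+/', r'/\d+_BTCF/']
    relative_path = relative_path.replace('\\', '/')
    for pat in patterns:
        relative_path = re.sub(pat, '/', relative_path)
    return os.path.dirname(relative_path)

assert sketch_image_file_to_camera_folder('site-a/cam01/100RECNX/IMG0001.JPG') == 'site-a/cam01'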
@@ -165,9 +201,11 @@ if os.name == 'nt':
  input_path = '/drive/organization'

  assert not (input_path.endswith('/') or input_path.endswith('\\'))
+ assert os.path.isdir(input_path), 'Could not find input folder {}'.format(input_path)
+ input_path = input_path.replace('\\','/')

  organization_name_short = 'organization'
- job_date = None # '2023-05-08'
+ job_date = None # '2024-01-01'
  assert job_date is not None and organization_name_short != 'organization'

  # Optional descriptor
@@ -178,9 +216,7 @@ if job_tag is None:
  else:
  job_description_string = '-' + job_tag

- model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v5.0.0/md_v5a.0.0.pt')
- # model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v5.0.0/md_v5b.0.0.pt')
- # model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v4.1.0/md_v4.1.0.pb')
+ model_file = 'MDV5A' # 'MDV5A', 'MDV5B', 'MDV4'

  postprocessing_base = os.path.expanduser('~/postprocessing')

@@ -194,16 +230,12 @@ n_gpus = 2
  # checkpointing. Don't worry, this will be assert()'d in the next cell.
  checkpoint_frequency = 10000

- # gpu_images_per_second is only used to print out a time estimate, and it's completely
- # tied to the assumption of running on an RTX 3090. YMMV.
- if ('v5') in model_file:
- gpu_images_per_second = 10
- else:
- gpu_images_per_second = 2.9
+ # Estimate inference speed for the current GPU
+ approx_images_per_second = estimate_md_images_per_second(model_file)

- # Rough estimate for how much slower everything runs when using augmentation
+ # Rough estimate for the inference time cost of augmentation
  if augment:
- gpu_images_per_second = gpu_images_per_second * 0.7
+ approx_images_per_second = approx_images_per_second * 0.7

  base_task_name = organization_name_short + '-' + job_date + job_description_string + '-' + \
  get_detector_version_from_filename(model_file)
@@ -224,6 +256,14 @@ if augment:
  assert use_yolo_inference_scripts,\
  'Augmentation is only supported when running with the YOLO inference scripts'

+ if use_tiled_inference:
+ assert not augment, \
+ 'Augmentation is not supported when using tiled inference'
+ assert not use_yolo_inference_scripts, \
+ 'Using the YOLO inference script is not supported when using tiled inference'
+ assert checkpoint_frequency is None, \
+ 'Checkpointing is not supported when using tiled inference'
+
  filename_base = os.path.join(base_output_folder_name, base_task_name)
  combined_api_output_folder = os.path.join(filename_base, 'combined_api_outputs')
  postprocessing_output_folder = os.path.join(filename_base, 'preview')
@@ -240,24 +280,17 @@ print('Output folder:\n{}'.format(filename_base))

  #%% Enumerate files

- all_images = sorted(path_utils.find_images(input_path,recursive=True))
+ # Have we already listed files for this job?
+ chunk_files = os.listdir(filename_base)
+ pattern = re.compile('chunk\d+.json')
+ chunk_files = [fn for fn in chunk_files if pattern.match(fn)]

- # It's common to run this notebook on an external drive with the main folders in the drive root
- all_images = [fn for fn in all_images if not \
- (fn.startswith('$RECYCLE') or fn.startswith('System Volume Information'))]
+ if (not force_enumeration) and (len(chunk_files) > 0):

- print('Enumerated {} image files in {}'.format(len(all_images),input_path))
-
- if False:
-
- pass
+ print('Found {} chunk files in folder {}, bypassing enumeration'.format(
+ len(chunk_files),
+ filename_base))

- #%% Load files from prior enumeration
-
- import re
- chunk_files = os.listdir(filename_base)
- pattern = re.compile('chunk\d+.json')
- chunk_files = [fn for fn in chunk_files if pattern.match(fn)]
  all_images = []
  for fn in chunk_files:
  with open(os.path.join(filename_base,fn),'r') as f:
@@ -265,8 +298,24 @@ if False:
  assert isinstance(chunk,list)
  all_images.extend(chunk)
  all_images = sorted(all_images)
- print('Loaded {} image files from chunks in {}'.format(len(all_images),filename_base))

+ print('Loaded {} image files from {} chunks in {}'.format(
+ len(all_images),len(chunk_files),filename_base))
+
+ else:
+
+ print('Enumerating image files in {}'.format(input_path))
+
+ all_images = sorted(path_utils.find_images(input_path,recursive=True,convert_slashes=True))
+
+ # It's common to run this notebook on an external drive with the main folders in the drive root
+ all_images = [fn for fn in all_images if not \
+ (fn.startswith('$RECYCLE') or fn.startswith('System Volume Information'))]
+
+ print('')
+
+ print('Enumerated {} image files in {}'.format(len(all_images),input_path))
+

  #%% Divide images into chunks
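Because the diff viewer flattens indentation, here is the enumeration-bypass logic from the cell above as a runnable sketch; it assumes the chunk list files written later by this script are JSON lists of absolute image paths with names like 'chunk000.json':

import json
import os
import re

def sketch_load_chunk_lists(folder):
    # Re-load previously written per-chunk image lists instead of re-enumerating the drive
    pattern = re.compile(r'chunk\d+\.json')
    chunk_files = [fn for fn in os.listdir(folder) if pattern.match(fn)]
    all_images = []
    for fn in chunk_files:
        with open(os.path.join(folder, fn), 'r') as f:
            chunk = json.load(f)
            assert isinstance(chunk, list)
            all_images.extend(chunk)
    return sorted(all_images)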

@@ -275,13 +324,19 @@ folder_chunks = split_list_into_n_chunks(all_images,n_jobs)

  #%% Estimate total time

- n_images = len(all_images)
- execution_seconds = n_images / gpu_images_per_second
- wallclock_seconds = execution_seconds / n_gpus
- print('Expected time: {}'.format(humanfriendly.format_timespan(wallclock_seconds)))
-
- seconds_per_chunk = len(folder_chunks[0]) / gpu_images_per_second
- print('Expected time per chunk: {}'.format(humanfriendly.format_timespan(seconds_per_chunk)))
+ if approx_images_per_second is None:
+
+ print("Can't estimate inference time for the current environment")
+
+ else:
+
+ n_images = len(all_images)
+ execution_seconds = n_images / approx_images_per_second
+ wallclock_seconds = execution_seconds / n_gpus
+ print('Expected time: {}'.format(humanfriendly.format_timespan(wallclock_seconds)))
+
+ seconds_per_chunk = len(folder_chunks[0]) / approx_images_per_second
+ print('Expected time per chunk: {}'.format(humanfriendly.format_timespan(seconds_per_chunk)))
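A worked instance of the arithmetic above; the throughput number is purely illustrative (estimate_md_images_per_second returns a hardware-dependent value, or None when it can't make an estimate):

n_images = 100000
approx_images_per_second = 10      # illustrative only
n_gpus = 2

execution_seconds = n_images / approx_images_per_second   # 10,000 s of total GPU time
wallclock_seconds = execution_seconds / n_gpus             # 5,000 s, i.e. roughly 1.4 hours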


  #%% Write file lists
@@ -298,19 +353,20 @@ for i_chunk,chunk_list in enumerate(folder_chunks):
  #%% Generate commands

  # A list of the scripts tied to each GPU, as absolute paths. We'll write this out at
- # the end so each GPU's list of commands can be run at once. Generally only used when
- # running lots of small batches via YOLOv5's val.py, which doesn't support checkpointing.
+ # the end so each GPU's list of commands can be run at once
  gpu_to_scripts = defaultdict(list)

  # i_task = 0; task = task_info[i_task]
  for i_task,task in enumerate(task_info):

  chunk_file = task['input_file']
+ checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')
+
  output_fn = chunk_file.replace('.json','_results.json')

  task['output_file'] = output_fn

- if n_jobs > 1:
+ if n_gpus > 1:
  gpu_number = i_task % n_gpus
  else:
  gpu_number = default_gpu_number
@@ -326,6 +382,10 @@ for i_task,task in enumerate(task_info):
  augment_string = ''
  if augment:
  augment_string = '--augment_enabled 1'
+ else:
+ augment_string = '--augment_enabled 0'
+
+ batch_string = '--batch_size {}'.format(yolo_batch_size)

  symlink_folder = os.path.join(filename_base,'symlinks','symlinks_{}'.format(
  str(i_task).zfill(3)))
@@ -339,6 +399,10 @@ for i_task,task in enumerate(task_info):
  if not remove_yolo_symlink_folder:
  remove_symlink_folder_string = '--no_remove_symlink_folder'

+ write_yolo_debug_output_string = ''
+ if write_yolo_debug_output:
+ write_yolo_debug_output = '--write_yolo_debug_output'
+
  remove_yolo_results_string = ''
  if not remove_yolo_intermediate_results:
  remove_yolo_results_string = '--no_remove_yolo_results_folder'
@@ -356,15 +420,47 @@ for i_task,task in enumerate(task_info):
  overwrite_handling_string = '--overwrite_handling {}'.format(overwrite_handling)

  cmd += f'python run_inference_with_yolov5_val.py "{model_file}" "{chunk_file}" "{output_fn}" '
- cmd += f'--yolo_working_folder "{yolo_working_dir}" {image_size_string} {augment_string} '
+ cmd += f'{image_size_string} {augment_string} '
  cmd += f'{symlink_folder_string} {yolo_results_folder_string} {remove_yolo_results_string} '
  cmd += f'{remove_symlink_folder_string} {confidence_threshold_string} {device_string} '
- cmd += f'{overwrite_handling_string}'
-
+ cmd += f'{overwrite_handling_string} {batch_string} {write_yolo_debug_output_string}'
+
+ if yolo_working_dir is not None:
+ cmd += f' --yolo_working_folder "{yolo_working_dir}"'
+ if yolo_dataset_file is not None:
+ cmd += ' --yolo_dataset_file "{}"'.format(yolo_dataset_file)
+ if yolo_model_type is not None:
+ cmd += ' --model_type {}'.format(yolo_model_type)
+
  if not use_symlinks_for_yolo_inference:
  cmd += ' --no_use_symlinks'

  cmd += '\n'
+
+ elif use_tiled_inference:
+
+ tiling_folder = os.path.join(filename_base,'tile_cache','tile_cache_{}'.format(
+ str(i_task).zfill(3)))
+
+ if os.name == 'nt':
+ cuda_string = f'set CUDA_VISIBLE_DEVICES={gpu_number} & '
+ else:
+ cuda_string = f'CUDA_VISIBLE_DEVICES={gpu_number} '
+
+ cmd = f'{cuda_string} python run_tiled_inference.py "{model_file}" "{input_path}" "{tiling_folder}" "{output_fn}"'
+
+ cmd += f' --image_list "{chunk_file}"'
+ cmd += f' --overwrite_handling {overwrite_handling}'
+
+ if not remove_tiles:
+ cmd += ' --no_remove_tiles'
+
+ # If we're using non-default tile sizes
+ if tile_size is not None and (tile_size[0] > 0 or tile_size[1] > 0):
+ cmd += ' --tile_size_x {} --tile_size_y {}'.format(tile_size[0],tile_size[1])
+
+ if tile_overlap is not None:
+ cmd += f' --tile_overlap {tile_overlap}'

  else:

@@ -375,7 +471,6 @@ for i_task,task in enumerate(task_info):

  checkpoint_frequency_string = ''
  checkpoint_path_string = ''
- checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')

  if checkpoint_frequency is not None and checkpoint_frequency > 0:
  checkpoint_frequency_string = f'--checkpoint_frequency {checkpoint_frequency}'
@@ -484,12 +579,10 @@ multiple processes, so the tasks will run serially. This only matters if you ha
  GPUs.
  """

- if False:
-
- pass
-
- #%%% Run the tasks (commented out)
+ run_tasks_in_notebook = False

+ if run_tasks_in_notebook:
+
  assert not use_yolo_inference_scripts, \
  'If you want to use the YOLOv5 inference scripts, you can\'t run the model interactively (yet)'
@@ -537,15 +630,32 @@ if False:

  # ...for each chunk

- # ...if False
+ # ...if we're running tasks in this notebook


  #%% Load results, look for failed or missing images in each task

+ # Check that all task output files exist
+
+ missing_output_files = []
+
+ # i_task = 0; task = task_info[i_task]
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):
+ output_file = task['output_file']
+ if not os.path.isfile(output_file):
+ missing_output_files.append(output_file)
+
+ if len(missing_output_files) > 0:
+ print('Missing {} output files:'.format(len(missing_output_files)))
+ for s in missing_output_files:
+ print(s)
+ raise Exception('Missing output files')
+
+
  n_total_failures = 0

  # i_task = 0; task = task_info[i_task]
- for i_task,task in enumerate(task_info):
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):

  chunk_file = task['input_file']
  output_file = task['output_file']
@@ -562,6 +672,13 @@ for i_task,task in enumerate(task_info):

  # im = task_results['images'][0]
  for im in task_results['images']:
+
+ # Most of the time, inference result files use absolute paths, but it's
+ # getting annoying to make sure that's *always* true, so handle both here.
+ # E.g., when using tiled inference, paths will be relative.
+ if not os.path.isabs(im['file']):
+ fn = os.path.join(input_path,im['file']).replace('\\','/')
+ im['file'] = fn
  assert im['file'].startswith(input_path)
  assert im['file'] in task_images_set
  filename_to_results[im['file']] = im
@@ -573,7 +690,8 @@ for i_task,task in enumerate(task_info):
  task['results'] = task_results

  for fn in task_images:
- assert fn in filename_to_results
+ assert fn in filename_to_results, \
+ 'File {} not found in results for task {}'.format(fn,i_task)

  n_total_failures += n_task_failures

@@ -593,7 +711,7 @@ combined_results = {}
  combined_results['images'] = []
  images_processed = set()

- for i_task,task in enumerate(task_info):
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):

  task_results = task['results']

@@ -620,14 +738,14 @@ assert len(combined_results['images']) == len(all_images), \
  result_filenames = [im['file'] for im in combined_results['images']]
  assert len(combined_results['images']) == len(set(result_filenames))

- # Check for valid path names
+ # Convert to relative paths, preserving '/' as the path separator, regardless of OS
  for im in combined_results['images']:
+ assert '\\' not in im['file']
+ assert im['file'].startswith(input_path)
  if input_path.endswith(':'):
- assert im['file'].startswith(input_path)
  im['file'] = im['file'].replace(input_path,'',1)
  else:
- assert im['file'].startswith(input_path + os.path.sep)
- im['file'] = im['file'].replace(input_path + os.path.sep,'',1)
+ im['file'] = im['file'].replace(input_path + '/','',1)
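An illustrative trace of the conversion above, with hypothetical paths:

input_path = '/drive/organization'
file_abs = '/drive/organization/cam01/100RECNX/IMG0001.JPG'
assert '\\' not in file_abs and file_abs.startswith(input_path)
if input_path.endswith(':'):
    file_rel = file_abs.replace(input_path, '', 1)
else:
    file_rel = file_abs.replace(input_path + '/', '', 1)
assert file_rel == 'cam01/100RECNX/IMG0001.JPG'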

  combined_api_output_file = os.path.join(
  combined_api_output_folder,
@@ -675,88 +793,8 @@ options.api_output_file = combined_api_output_file
  options.output_dir = output_base
  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file
- path_utils.open_file(html_output_file)
-
-
- #%% RDE (sample directory collapsing)
-
- #
- # The next few cells are about repeat detection elimination; if you want to skip this,
- # and still do other stuff in this notebook (e.g. running classifiers), that's fine, but
- # the rest of the notebook weakly assumes you've done this. Specifically, it looks for
- # the variable "filtered_api_output_file" (a file produced by the RDE process). If you
- # don't run the RDE cells, just change "filtered_api_output_file" to "combined_api_output_file"
- # (the raw output from MegaDetector). Then it will be like all this RDE stuff doesn't exist.
- #
- # Though FWIW, once you're sufficiently power-user-ish to use this notebook, RDE is almost
- # always worth it.
- #
-
- def relative_path_to_location(relative_path):
- """
- This is a sample function that returns a camera name given an image path. By
- default in the RDE process, leaf-node folders are equivalent to cameras. To map
- something other than leaf-node folders to cameras, fill in this function, and un-comment the
- line below containing "relative_path_to_location".
-
- Sample regular expressions are included here for common patterns, particularly the
- overflow folders created by Reconyx and Bushnell camera traps. So if one of those
- fits your scenario, you don't have to modify this function, just un-comment the line
- below that enables this feature.
-
- Nothing bad happens if you have overflow folders like this and you don't
- enable this mapping, you are just taking a more conservative approach to RDE in that
- scenario.
- """
-
- import re
-
- # 100RECNX is the overflow folder style for Reconyx cameras
- # 100EK113 is (for some reason) the overflow folder style for Bushnell cameras
- # 100_BTCF is the overflow folder style for Browning cameras
- patterns = ['\/\d+RECNX\/','\/\d+EK\d+\/','\/\d+_BTCF\/']
-
- relative_path = relative_path.replace('\\','/')
- for pat in patterns:
- relative_path = re.sub(pat,'/',relative_path)
- location_name = os.path.dirname(relative_path)
-
- return location_name
-
-
- #%% Test cells for relative_path_to_location
-
- if False:
-
- pass
-
- #%% Test the generic cases
-
- relative_path = 'a/b/c/d/100EK113/blah.jpg'
- print(relative_path_to_location(relative_path))
-
- relative_path = 'a/b/c/d/100RECNX/blah.jpg'
- print(relative_path_to_location(relative_path))
-
-
- #%% Test relative_path_to_location on the current dataset
-
- with open(combined_api_output_file,'r') as f:
- d = json.load(f)
- image_filenames = [im['file'] for im in d['images']]
-
- location_names = set()
-
- # relative_path = image_filenames[0]
- for relative_path in tqdm(image_filenames):
- location_name = relative_path_to_location(relative_path)
- location_names.add(location_name)
-
- location_names = list(location_names)
- location_names.sort()
-
- for s in location_names:
- print(s)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True)
+ # import clipboard; clipboard.copy(html_output_file)


  #%% Repeat detection elimination, phase 1
@@ -768,7 +806,7 @@ task_index = 0

  options = repeat_detections_core.RepeatDetectionOptions()

- options.confidenceMin = 0.15
+ options.confidenceMin = 0.1
  options.confidenceMax = 1.01
  options.iouThreshold = 0.85
  options.occurrenceThreshold = 15
@@ -785,13 +823,13 @@ options.otherDetectionsThreshold = options.confidenceMin

  options.bRenderDetectionTiles = True
  options.maxOutputImageWidth = 2000
- options.detectionTilesMaxCrops = 500
+ options.detectionTilesMaxCrops = 300

  # options.lineThickness = 5
  # options.boxExpansion = 8

  # To invoke custom collapsing of folders for a particular manufacturer's naming scheme
- # options.customDirNameFunction = relative_path_to_location; overflow_folder_handling_enabled = True
+ options.customDirNameFunction = relative_path_to_location

  options.bRenderHtml = False
  options.imageBase = input_path
@@ -816,9 +854,9 @@ options.debugMaxRenderInstance = -1

  # Can be None, 'xsort', or 'clustersort'
  options.smartSort = 'xsort'
- suspiciousDetectionResults = repeat_detections_core.find_repeat_detections(combined_api_output_file,
- None,
- options)
+ suspicious_detection_results = repeat_detections_core.find_repeat_detections(combined_api_output_file,
+ outputFilename=None,
+ options=options)


  #%% Manual RDE step
@@ -826,7 +864,8 @@ suspiciousDetectionResults = repeat_detections_core.find_repeat_detections(combi
  ## DELETE THE VALID DETECTIONS ##

  # If you run this line, it will open the folder up in your file browser
- path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))
+ path_utils.open_file(os.path.dirname(suspicious_detection_results.filterFile),
+ attempt_to_open_in_wsl_host=True)

  #
  # If you ran the previous cell, but then you change your mind and you don't want to do
@@ -834,7 +873,7 @@ path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))
  # previous cell. If you do that, you're implicitly telling the notebook that you looked
  # at everything in that folder, and confirmed there were no red boxes on animals.
  #
- # Instead, either change "filtered_api_output_file" below to "combined_api_output_file",
+ # Instead, either change "filtered_output_filename" below to "combined_api_output_file",
  # or delete *all* the images in the filtering folder.
  #

@@ -843,12 +882,13 @@ path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))

  from api.batch_processing.postprocessing.repeat_detection_elimination import remove_repeat_detections

- filtered_output_filename = path_utils.insert_before_extension(combined_api_output_file, 'filtered_{}'.format(rde_string))
+ filtered_output_filename = path_utils.insert_before_extension(combined_api_output_file,
+ 'filtered_{}'.format(rde_string))

  remove_repeat_detections.remove_repeat_detections(
  inputFile=combined_api_output_file,
  outputFile=filtered_output_filename,
- filteringDir=os.path.dirname(suspiciousDetectionResults.filterFile)
+ filteringDir=os.path.dirname(suspicious_detection_results.filterFile)
  )

@@ -890,7 +930,8 @@ options.output_dir = output_base
  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file

- path_utils.open_file(html_output_file)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True)
+ # import clipboard; clipboard.copy(html_output_file)


  #%% Run MegaClassifier (actually, write out a script that runs MegaClassifier)
@@ -899,6 +940,11 @@ path_utils.open_file(html_output_file)
  final_output_path_mc = None
  final_output_path_ic = None

+ # If we didn't do RDE
+ if filtered_output_filename is None:
+ print("Warning: it looks like you didn't do RDE, using the raw output file")
+ filtered_output_filename = combined_api_output_file
+
  classifier_name_short = 'megaclassifier'
  threshold_str = '0.15' # 0.6
  classifier_name = 'megaclassifier_v0.1_efficientnet-b3'
@@ -1086,7 +1132,6 @@ with open(output_file,'w') as f:
  for s in commands:
  f.write('{}'.format(s))

- import stat
  st = os.stat(output_file)
  os.chmod(output_file, st.st_mode | stat.S_IEXEC)

@@ -1256,8 +1301,6 @@ os.chmod(output_file, st.st_mode | stat.S_IEXEC)

  #%% Within-image classification smoothing

- from collections import defaultdict
-
  #
  # Only count detections with a classification confidence threshold above
  # *classification_confidence_threshold*, which in practice means we're only
@@ -1516,7 +1559,7 @@ else:
  import datetime
  from data_management.read_exif import parse_exif_datetime_string

- min_valid_timestamp_year = 2015
+ min_valid_timestamp_year = 2001

  now = datetime.datetime.now()

@@ -1540,6 +1583,7 @@ for exif_result in tqdm(exif_results):

  im['file_name'] = exif_result['file_name']
  im['id'] = im['file_name']
+
  if ('exif_tags' not in exif_result) or (exif_result['exif_tags'] is None) or \
  (exif_datetime_tag not in exif_result['exif_tags']):
  exif_dt = None
@@ -1573,7 +1617,7 @@ for exif_result in tqdm(exif_results):

  # ...for each exif image result

- print('Parsed EXIF datetime information, unable to parse EXIF data from {} of {} images'.format(
+ print('Parsed EXIF datetime information, unable to parse EXIF date from {} of {} images'.format(
  len(images_without_datetime),len(exif_results)))


@@ -1639,7 +1683,7 @@ min_dominant_class_classifications_above_threshold_for_class_smoothing = 5 # 2
  max_secondary_class_classifications_above_threshold_for_class_smoothing = 5

  # If the ratio between a dominant class and a secondary class count is greater than this,
- # regardless of the secondary class count, switch those classificaitons (i.e., ignore
+ # regardless of the secondary class count, switch those classifications (i.e., ignore
  # max_secondary_class_classifications_above_threshold_for_class_smoothing).
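A rough sketch of how these two constants appear to interact, based only on the comments above; the counts, the ratio threshold value, and the helper name are illustrative, not the script's actual implementation:

def sketch_secondary_class_is_overridden(dominant_count, secondary_count,
                                         max_secondary=5, ratio_threshold=3):
    # A secondary class with few above-threshold classifications is always smoothed away;
    # a well-represented secondary class survives unless the dominant class outnumbers it
    # by more than ratio_threshold.
    if secondary_count <= max_secondary:
        return True
    return (dominant_count / secondary_count) > ratio_threshold

# 60 "deer" vs. 7 "cow" classifications: 7 > 5, but 60/7 > 3, so the cows are relabeled
assert sketch_secondary_class_is_overridden(60, 7)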
  #
  # This may be different for different dominant classes, e.g. if we see lots of cows, they really
@@ -1959,8 +2003,8 @@ print('Processing {} to {}'.format(base_task_name, output_base))
  options.api_output_file = sequence_smoothed_classification_file
  options.output_dir = output_base
  ppresults = process_batch_results(options)
- path_utils.open_file(ppresults.output_html_file)
-
+ path_utils.open_file(ppresults.output_html_file,attempt_to_open_in_wsl_host=True)
+ # import clipboard; clipboard.copy(ppresults.output_html_file)

  #% Zip .json files

@@ -2027,7 +2071,7 @@ for i, j in itertools.combinations(list(range(0,len(filenames))),2):
  results = compare_batch_results(options)

  from md_utils.path_utils import open_file
- open_file(results.html_output_file)
+ open_file(results.html_output_file,attempt_to_open_in_wsl_host=True)


  #%% Merge in high-confidence detections from another results file
@@ -2081,7 +2125,7 @@ options.output_dir = output_base_large_boxes

  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file
- path_utils.open_file(html_output_file)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True)


  #%% .json splitting
@@ -2094,12 +2138,6 @@ from api.batch_processing.postprocessing.subset_json_detector_output import (
  input_filename = filtered_output_filename
  output_base = os.path.join(combined_api_output_folder,base_task_name + '_json_subsets')

- if False:
- if data is None:
- with open(input_filename) as f:
- data = json.load(f)
- print('Data set contains {} images'.format(len(data['images'])))
-
  print('Processing file {} to {}'.format(input_filename,output_base))

  options = SubsetJsonDetectorOutputOptions()
@@ -2204,13 +2242,47 @@ video_output_filename = filtered_output_filename.replace('.json','_aggregated.js
  frame_results_to_video_results(filtered_output_filename,video_output_filename)


+ #%% Sample custom path replacement function
+
+ def custom_relative_path_to_location(relative_path):
+
+ relative_path = relative_path.replace('\\','/')
+ tokens = relative_path.split('/')
+ location_name = '/'.join(tokens[0:2])
+ return location_name
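The same sample function with its indentation restored, plus an example of what it returns for a hypothetical path:

def custom_relative_path_to_location(relative_path):
    # Treat the first two folder levels (e.g. site/camera) as the location identifier
    relative_path = relative_path.replace('\\', '/')
    tokens = relative_path.split('/')
    return '/'.join(tokens[0:2])

assert custom_relative_path_to_location('site-a/cam01/100RECNX/IMG0001.JPG') == 'site-a/cam01'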
+
+
+ #%% Test relative_path_to_location on the current dataset
+
+ with open(combined_api_output_file,'r') as f:
+ d = json.load(f)
+ image_filenames = [im['file'] for im in d['images']]
+
+ location_names = set()
+
+ # relative_path = image_filenames[0]
+ for relative_path in tqdm(image_filenames):
+ location_name = relative_path_to_location(relative_path)
+ location_names.add(location_name)
+
+ location_names = list(location_names)
+ location_names.sort()
+
+ for s in location_names:
+ print(s)
+
+
  #%% End notebook: turn this script into a notebook (how meta!)

  import os
  import nbformat as nbf

- input_py_file = os.path.expanduser(
- '~/git/MegaDetector/api/batch_processing/data_preparation/manage_local_batch.py')
+ if os.name == 'nt':
+ git_base = r'c:\git'
+ else:
+ git_base = os.path.expanduer('~/git')
+
+ input_py_file = git_base + '/MegaDetector/api/batch_processing/data_preparation/manage_local_batch.py'
  assert os.path.isfile(input_py_file)
  output_ipynb_file = input_py_file.replace('.py','.ipynb')

@@ -2233,14 +2305,23 @@ i_line = 0


  header_comment = ''

+ # Delete a few lines from the top that don't belong in the NB version, e.g. the name
+ # of the .py file
  lines_to_ignore = 7
+ expected_first_token = '# This script'
+ found_first_token = False

  # Everything before the first cell is the header comment
  while(not lines[i_line].startswith('#%%')):
+
  if i_line < lines_to_ignore:
  i_line += 1
  continue

+ if not found_first_token:
+ assert lines[i_line].startswith(expected_first_token)
+ found_first_token = True
+
  s = lines[i_line].replace('#','').strip()
  if len(s) == 0:
  header_comment += '\n\n'