megadetector-5.0.6-py3-none-any.whl → megadetector-5.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
@@ -14,18 +14,6 @@
  # the same if you are reading this in Jupyter Notebook (using the .ipynb version of the
  # script):
  #
- # * You can specify the MegaDetector location, but you may find it useful to use the same paths
- # I use; on all the machines where I run MD, I keep all versions of MegaDetector handy at these
- # paths:
- #
- # ~/models/camera_traps/megadetector/md_v5.0.0/md_v5a.0.0.pt
- # ~/models/camera_traps/megadetector/md_v5.0.0/md_v5b.0.0.pt
- # ~/models/camera_traps/megadetector/md_v4.1.0/md_v4.1.0.pb
- #
- # On Windows, this translates to, for example:
- #
- # c:\users\dmorr\models\camera_traps\megadetector\md_v5.0.0\md_v5a.0.0.pt
- #
  # * Typically when I have a MegaDetector job to run, I make a copy of this script. Let's
  # say I'm running a job for an organization called "bibblebop"; I have a big folder of
  # job-specific copies of this script, and I might save a new one called "bibblebop-2023-07-26.py"
@@ -78,6 +66,7 @@ import json
  import os
  import stat
  import time
+ import re

  import humanfriendly

@@ -90,12 +79,14 @@ from md_utils.ct_utils import split_list_into_n_chunks

  from detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file
  from detection.run_detector import DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
+ from detection.run_detector import estimate_md_images_per_second

  from api.batch_processing.postprocessing.postprocess_batch_results import (
  PostProcessingOptions, process_batch_results)
  from detection.run_detector import get_detector_version_from_filename
+ from md_utils.ct_utils import image_file_to_camera_folder

- max_task_name_length = 92
+ ## Inference options

  # To specify a non-default confidence threshold for including detections in the .json file
  json_threshold = None
@@ -103,61 +94,113 @@ json_threshold = None
  # Turn warnings into errors if more than this many images are missing
  max_tolerable_failed_images = 100

+ # Should we supply the --image_queue_option to run_detector_batch.py? I only set this
+ # when I have a very slow drive and a comparably fast GPU. When this is enabled, checkpointing
+ # is not supported within a job, so I set n_jobs to a large number (typically 100).
  use_image_queue = False

  # Only relevant when we're using a single GPU
  default_gpu_number = 0

+ # Should we supply --quiet to run_detector_batch.py?
  quiet_mode = True

  # Specify a target image size when running MD... strongly recommended to leave this at "None"
+ #
+ # When using augmented inference, if you leave this at "None", run_inference_with_yolov5_val.py
+ # will use its default size, which is 1280 * 1.3, which is almost always what you want.
  image_size = None

+ # Should we include image size, timestamp, and/or EXIF data in MD output?
+ include_image_size = False
+ include_image_timestamp = False
+ include_exif_data = False
+
  # Only relevant when running on CPU
  ncores = 1

- # OS-specific script line continuation character
+ # OS-specific script line continuation character (modified later if we're running on Windows)
  slcc = '\\'

- # OS-specific script comment character
+ # OS-specific script comment character (modified later if we're running on Windows)
  scc = '#'

+ # # OS-specific script extension (modified later if we're running on Windows)
  script_extension = '.sh'

+ # If False, we'll load chunk files with file lists if they exist
+ force_enumeration = False
+
  # Prefer threads on Windows, processes on Linux
  parallelization_defaults_to_threads = False

  # This is for things like image rendering, not for MegaDetector
  default_workers_for_parallel_tasks = 30

+ overwrite_handling = 'skip' # 'skip', 'error', or 'overwrite'
+
+ # Only relevant to repeat detection elimination; try to identify EK113/RCNX101-style
+ # overflow folders and treat them as the same camera
+ overflow_folder_handling_enabled = True
+
+ # The function used to get camera names from image paths; can also replace this
+ # with a custom function.
+ relative_path_to_location = image_file_to_camera_folder
+
+ # This will be the .json results file after RDE; if this is still None when
+ # we get to classification stuff, that will indicate that we didn't do RDE.
+ filtered_output_filename = None
+
+ if os.name == 'nt':
+
+ slcc = '^'
+ scc = 'REM'
+ script_extension = '.bat'
+
+ # My experience has been that Python multiprocessing is flaky on Windows, so
+ # default to threads on Windows
+ parallelization_defaults_to_threads = True
+ default_workers_for_parallel_tasks = 10
+
+
+ ## Constants related to using YOLOv5's val.py
+
  # Should we use YOLOv5's val.py instead of run_detector_batch.py?
  use_yolo_inference_scripts = False

- # Directory in which to run val.py. Only relevant if use_yolo_inference_scripts is True.
+ # Directory in which to run val.py (relevant for YOLOv5, not for YOLOv8)
  yolo_working_dir = os.path.expanduser('~/git/yolov5')

+ # Only used for loading the mapping from class indices to names
+ yolo_dataset_file = None
+
+ # 'yolov5' or 'yolov8'; assumes YOLOv5 if this is None
+ yolo_model_type = None
+
+ # inference batch size
+ yolo_batch_size = 1
+
  # Should we remove intermediate files used for running YOLOv5's val.py?
  #
  # Only relevant if use_yolo_inference_scripts is True.
- remove_yolo_intermediate_results = False
- remove_yolo_symlink_folder = False
+ remove_yolo_intermediate_results = True
+ remove_yolo_symlink_folder = True
  use_symlinks_for_yolo_inference = True
+ write_yolo_debug_output = False

- overwrite_handling = 'skip' # 'skip', 'error', or 'overwrite'
+ # Should we apply YOLOv5's test-time augmentation?
+ augment = False

- # Set later if EK113/RCNX101-style overflow folders are being handled in this dataset
- overflow_folder_handling_enabled = False

- # Should we apply YOLOv5's augmentation? Only allowed when use_yolo_inference_scripts
- # is True.
- augment = False
+ ## Constants related to tiled inference

- if os.name == 'nt':
- slcc = '^'
- scc = 'REM'
- script_extension = '.bat'
- parallelization_defaults_to_threads = True
- default_workers_for_parallel_tasks = 10
+ use_tiled_inference = False
+
+ # Should we delete tiles after each job? Only set this to False for debugging;
+ # large jobs will take up a lot of space if you keep tiles around after each task.
+ remove_tiles = True
+ tile_size = (1280,1280)
+ tile_overlap = 0.2


  #%% Constants I set per script
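For orientation, the new tiled-inference constants introduced above (tile_size, tile_overlap) determine how many tiles each image is split into. A rough, illustrative calculation, assuming a standard sliding-window layout with stride = tile_size * (1 - overlap); the exact layout used by run_tiled_inference.py may differ:

import math

# Illustrative only: tile_size=(1280,1280) and tile_overlap=0.2 applied to a
# hypothetical 4000x3000 image
tile_w, tile_h = 1280, 1280
tile_overlap = 0.2
stride_x = int(tile_w * (1 - tile_overlap))  # 1024
stride_y = int(tile_h * (1 - tile_overlap))  # 1024
n_x = math.ceil((4000 - tile_w) / stride_x) + 1  # 4 tiles across
n_y = math.ceil((3000 - tile_h) / stride_y) + 1  # 3 tiles down
print(n_x * n_y)  # 12 tiles per image
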
@@ -165,9 +208,11 @@ if os.name == 'nt':
  input_path = '/drive/organization'

  assert not (input_path.endswith('/') or input_path.endswith('\\'))
+ assert os.path.isdir(input_path), 'Could not find input folder {}'.format(input_path)
+ input_path = input_path.replace('\\','/')

  organization_name_short = 'organization'
- job_date = None # '2023-05-08'
+ job_date = None # '2024-01-01'
  assert job_date is not None and organization_name_short != 'organization'

  # Optional descriptor
@@ -178,9 +223,7 @@ if job_tag is None:
  else:
  job_description_string = '-' + job_tag

- model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v5.0.0/md_v5a.0.0.pt')
- # model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v5.0.0/md_v5b.0.0.pt')
- # model_file = os.path.expanduser('~/models/camera_traps/megadetector/md_v4.1.0/md_v4.1.0.pb')
+ model_file = 'MDV5A' # 'MDV5A', 'MDV5B', 'MDV4'

  postprocessing_base = os.path.expanduser('~/postprocessing')

@@ -194,16 +237,12 @@ n_gpus = 2
  # checkpointing. Don't worry, this will be assert()'d in the next cell.
  checkpoint_frequency = 10000

- # gpu_images_per_second is only used to print out a time estimate, and it's completely
- # tied to the assumption of running on an RTX 3090. YMMV.
- if ('v5') in model_file:
- gpu_images_per_second = 10
- else:
- gpu_images_per_second = 2.9
+ # Estimate inference speed for the current GPU
+ approx_images_per_second = estimate_md_images_per_second(model_file)

- # Rough estimate for how much slower everything runs when using augmentation
- if augment:
- gpu_images_per_second = gpu_images_per_second * 0.7
+ # Rough estimate for the inference time cost of augmentation
+ if augment and (approx_images_per_second is not None):
+ approx_images_per_second = approx_images_per_second * 0.7

  base_task_name = organization_name_short + '-' + job_date + job_description_string + '-' + \
  get_detector_version_from_filename(model_file)
@@ -224,10 +263,22 @@ if augment:
  assert use_yolo_inference_scripts,\
  'Augmentation is only supported when running with the YOLO inference scripts'

+ if use_tiled_inference:
+ assert not augment, \
+ 'Augmentation is not supported when using tiled inference'
+ assert not use_yolo_inference_scripts, \
+ 'Using the YOLO inference script is not supported when using tiled inference'
+ assert checkpoint_frequency is None, \
+ 'Checkpointing is not supported when using tiled inference'
+
  filename_base = os.path.join(base_output_folder_name, base_task_name)
  combined_api_output_folder = os.path.join(filename_base, 'combined_api_outputs')
  postprocessing_output_folder = os.path.join(filename_base, 'preview')

+ combined_api_output_file = os.path.join(
+ combined_api_output_folder,
+ '{}_detections.json'.format(base_task_name))
+
  os.makedirs(filename_base, exist_ok=True)
  os.makedirs(combined_api_output_folder, exist_ok=True)
  os.makedirs(postprocessing_output_folder, exist_ok=True)
@@ -240,24 +291,17 @@ print('Output folder:\n{}'.format(filename_base))

  #%% Enumerate files

- all_images = sorted(path_utils.find_images(input_path,recursive=True))
-
- # It's common to run this notebook on an external drive with the main folders in the drive root
- all_images = [fn for fn in all_images if not \
- (fn.startswith('$RECYCLE') or fn.startswith('System Volume Information'))]
-
- print('Enumerated {} image files in {}'.format(len(all_images),input_path))
-
- if False:
+ # Have we already listed files for this job?
+ chunk_files = os.listdir(filename_base)
+ pattern = re.compile('chunk\d+.json')
+ chunk_files = [fn for fn in chunk_files if pattern.match(fn)]

- pass
+ if (not force_enumeration) and (len(chunk_files) > 0):

- #%% Load files from prior enumeration
+ print('Found {} chunk files in folder {}, bypassing enumeration'.format(
+ len(chunk_files),
+ filename_base))

- import re
- chunk_files = os.listdir(filename_base)
- pattern = re.compile('chunk\d+.json')
- chunk_files = [fn for fn in chunk_files if pattern.match(fn)]
  all_images = []
  for fn in chunk_files:
  with open(os.path.join(filename_base,fn),'r') as f:
@@ -265,8 +309,24 @@ if False:
  assert isinstance(chunk,list)
  all_images.extend(chunk)
  all_images = sorted(all_images)
- print('Loaded {} image files from chunks in {}'.format(len(all_images),filename_base))

+ print('Loaded {} image files from {} chunks in {}'.format(
+ len(all_images),len(chunk_files),filename_base))
+
+ else:
+
+ print('Enumerating image files in {}'.format(input_path))
+
+ all_images = sorted(path_utils.find_images(input_path,recursive=True,convert_slashes=True))
+
+ # It's common to run this notebook on an external drive with the main folders in the drive root
+ all_images = [fn for fn in all_images if not \
+ (fn.startswith('$RECYCLE') or fn.startswith('System Volume Information'))]
+
+ print('')
+
+ print('Enumerated {} image files in {}'.format(len(all_images),input_path))
+

  #%% Divide images into chunks

@@ -275,13 +335,19 @@ folder_chunks = split_list_into_n_chunks(all_images,n_jobs)

  #%% Estimate total time

- n_images = len(all_images)
- execution_seconds = n_images / gpu_images_per_second
- wallclock_seconds = execution_seconds / n_gpus
- print('Expected time: {}'.format(humanfriendly.format_timespan(wallclock_seconds)))
-
- seconds_per_chunk = len(folder_chunks[0]) / gpu_images_per_second
- print('Expected time per chunk: {}'.format(humanfriendly.format_timespan(seconds_per_chunk)))
+ if approx_images_per_second is None:
+
+ print("Can't estimate inference time for the current environment")
+
+ else:
+
+ n_images = len(all_images)
+ execution_seconds = n_images / approx_images_per_second
+ wallclock_seconds = execution_seconds / n_gpus
+ print('Expected time: {}'.format(humanfriendly.format_timespan(wallclock_seconds)))
+
+ seconds_per_chunk = len(folder_chunks[0]) / approx_images_per_second
+ print('Expected time per chunk: {}'.format(humanfriendly.format_timespan(seconds_per_chunk)))


  #%% Write file lists
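The estimate_md_images_per_second() call introduced in this release replaces the old hard-coded throughput guess (10 images/second for MDv5 on an RTX 3090). As a worked example of the arithmetic in the "Estimate total time" cell above, using that same assumed throughput and hypothetical job sizes:

# Assumed values for illustration; the real numbers come from
# estimate_md_images_per_second() and the enumerated image list.
n_images = 1_000_000
approx_images_per_second = 10   # roughly the old RTX 3090 estimate for MDv5
n_gpus = 2

wallclock_seconds = (n_images / approx_images_per_second) / n_gpus
print(wallclock_seconds / 3600)  # ~13.9 hours
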
@@ -298,19 +364,20 @@ for i_chunk,chunk_list in enumerate(folder_chunks):
  #%% Generate commands

  # A list of the scripts tied to each GPU, as absolute paths. We'll write this out at
- # the end so each GPU's list of commands can be run at once. Generally only used when
- # running lots of small batches via YOLOv5's val.py, which doesn't support checkpointing.
+ # the end so each GPU's list of commands can be run at once
  gpu_to_scripts = defaultdict(list)

  # i_task = 0; task = task_info[i_task]
  for i_task,task in enumerate(task_info):

  chunk_file = task['input_file']
+ checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')
+
  output_fn = chunk_file.replace('.json','_results.json')

  task['output_file'] = output_fn

- if n_jobs > 1:
+ if n_gpus > 1:
  gpu_number = i_task % n_gpus
  else:
  gpu_number = default_gpu_number
@@ -326,6 +393,10 @@ for i_task,task in enumerate(task_info):
  augment_string = ''
  if augment:
  augment_string = '--augment_enabled 1'
+ else:
+ augment_string = '--augment_enabled 0'
+
+ batch_string = '--batch_size {}'.format(yolo_batch_size)

  symlink_folder = os.path.join(filename_base,'symlinks','symlinks_{}'.format(
  str(i_task).zfill(3)))
@@ -339,6 +410,10 @@ for i_task,task in enumerate(task_info):
  if not remove_yolo_symlink_folder:
  remove_symlink_folder_string = '--no_remove_symlink_folder'

+ write_yolo_debug_output_string = ''
+ if write_yolo_debug_output:
+ write_yolo_debug_output = '--write_yolo_debug_output'
+
  remove_yolo_results_string = ''
  if not remove_yolo_intermediate_results:
  remove_yolo_results_string = '--no_remove_yolo_results_folder'
@@ -356,15 +431,47 @@ for i_task,task in enumerate(task_info):
  overwrite_handling_string = '--overwrite_handling {}'.format(overwrite_handling)

  cmd += f'python run_inference_with_yolov5_val.py "{model_file}" "{chunk_file}" "{output_fn}" '
- cmd += f'--yolo_working_folder "{yolo_working_dir}" {image_size_string} {augment_string} '
+ cmd += f'{image_size_string} {augment_string} '
  cmd += f'{symlink_folder_string} {yolo_results_folder_string} {remove_yolo_results_string} '
  cmd += f'{remove_symlink_folder_string} {confidence_threshold_string} {device_string} '
- cmd += f'{overwrite_handling_string}'
-
+ cmd += f'{overwrite_handling_string} {batch_string} {write_yolo_debug_output_string}'
+
+ if yolo_working_dir is not None:
+ cmd += f' --yolo_working_folder "{yolo_working_dir}"'
+ if yolo_dataset_file is not None:
+ cmd += ' --yolo_dataset_file "{}"'.format(yolo_dataset_file)
+ if yolo_model_type is not None:
+ cmd += ' --model_type {}'.format(yolo_model_type)
+
  if not use_symlinks_for_yolo_inference:
  cmd += ' --no_use_symlinks'

  cmd += '\n'
+
+ elif use_tiled_inference:
+
+ tiling_folder = os.path.join(filename_base,'tile_cache','tile_cache_{}'.format(
+ str(i_task).zfill(3)))
+
+ if os.name == 'nt':
+ cuda_string = f'set CUDA_VISIBLE_DEVICES={gpu_number} & '
+ else:
+ cuda_string = f'CUDA_VISIBLE_DEVICES={gpu_number} '
+
+ cmd = f'{cuda_string} python run_tiled_inference.py "{model_file}" "{input_path}" "{tiling_folder}" "{output_fn}"'
+
+ cmd += f' --image_list "{chunk_file}"'
+ cmd += f' --overwrite_handling {overwrite_handling}'
+
+ if not remove_tiles:
+ cmd += ' --no_remove_tiles'
+
+ # If we're using non-default tile sizes
+ if tile_size is not None and (tile_size[0] > 0 or tile_size[1] > 0):
+ cmd += ' --tile_size_x {} --tile_size_y {}'.format(tile_size[0],tile_size[1])
+
+ if tile_overlap is not None:
+ cmd += f' --tile_overlap {tile_overlap}'

  else:

@@ -375,7 +482,6 @@ for i_task,task in enumerate(task_info):

  checkpoint_frequency_string = ''
  checkpoint_path_string = ''
- checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')

  if checkpoint_frequency is not None and checkpoint_frequency > 0:
  checkpoint_frequency_string = f'--checkpoint_frequency {checkpoint_frequency}'
@@ -399,7 +505,14 @@ for i_task,task in enumerate(task_info):

  overwrite_handling_string = '--overwrite_handling {}'.format(overwrite_handling)
  cmd = f'{cuda_string} python run_detector_batch.py "{model_file}" "{chunk_file}" "{output_fn}" {checkpoint_frequency_string} {checkpoint_path_string} {use_image_queue_string} {ncores_string} {quiet_string} {image_size_string} {confidence_threshold_string} {overwrite_handling_string}'
-
+
+ if include_image_size:
+ cmd += ' --include_image_size'
+ if include_image_timestamp:
+ cmd += ' --include_image_timestamp'
+ if include_exif_data:
+ cmd += ' --include_exif_data'
+
  cmd_file = os.path.join(filename_base,'run_chunk_{}_gpu_{}{}'.format(str(i_task).zfill(3),
  str(gpu_number).zfill(2),script_extension))

@@ -484,12 +597,10 @@ multiple processes, so the tasks will run serially. This only matters if you ha
  GPUs.
  """

- if False:
-
- pass
-
- #%%% Run the tasks (commented out)
+ run_tasks_in_notebook = False

+ if run_tasks_in_notebook:
+
  assert not use_yolo_inference_scripts, \
  'If you want to use the YOLOv5 inference scripts, you can\'t run the model interactively (yet)'

@@ -537,15 +648,32 @@ if False:

  # ...for each chunk

- # ...if False
+ # ...if we're running tasks in this notebook


  #%% Load results, look for failed or missing images in each task

+ # Check that all task output files exist
+
+ missing_output_files = []
+
+ # i_task = 0; task = task_info[i_task]
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):
+ output_file = task['output_file']
+ if not os.path.isfile(output_file):
+ missing_output_files.append(output_file)
+
+ if len(missing_output_files) > 0:
+ print('Missing {} output files:'.format(len(missing_output_files)))
+ for s in missing_output_files:
+ print(s)
+ raise Exception('Missing output files')
+
+
  n_total_failures = 0

  # i_task = 0; task = task_info[i_task]
- for i_task,task in enumerate(task_info):
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):

  chunk_file = task['input_file']
  output_file = task['output_file']
@@ -562,6 +690,13 @@ for i_task,task in enumerate(task_info):

  # im = task_results['images'][0]
  for im in task_results['images']:
+
+ # Most of the time, inference result files use absolute paths, but it's
+ # getting annoying to make sure that's *always* true, so handle both here.
+ # E.g., when using tiled inference, paths will be relative.
+ if not os.path.isabs(im['file']):
+ fn = os.path.join(input_path,im['file']).replace('\\','/')
+ im['file'] = fn
  assert im['file'].startswith(input_path)
  assert im['file'] in task_images_set
  filename_to_results[im['file']] = im
@@ -573,7 +708,8 @@ for i_task,task in enumerate(task_info):
  task['results'] = task_results

  for fn in task_images:
- assert fn in filename_to_results
+ assert fn in filename_to_results, \
+ 'File {} not found in results for task {}'.format(fn,i_task)

  n_total_failures += n_task_failures

@@ -593,7 +729,7 @@ combined_results = {}
  combined_results['images'] = []
  images_processed = set()

- for i_task,task in enumerate(task_info):
+ for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):

  task_results = task['results']

@@ -620,19 +756,15 @@ assert len(combined_results['images']) == len(all_images), \
  result_filenames = [im['file'] for im in combined_results['images']]
  assert len(combined_results['images']) == len(set(result_filenames))

- # Check for valid path names
+ # Convert to relative paths, preserving '/' as the path separator, regardless of OS
  for im in combined_results['images']:
+ assert '\\' not in im['file']
+ assert im['file'].startswith(input_path)
  if input_path.endswith(':'):
- assert im['file'].startswith(input_path)
  im['file'] = im['file'].replace(input_path,'',1)
  else:
- assert im['file'].startswith(input_path + os.path.sep)
- im['file'] = im['file'].replace(input_path + os.path.sep,'',1)
+ im['file'] = im['file'].replace(input_path + '/','',1)

- combined_api_output_file = os.path.join(
- combined_api_output_folder,
- '{}_detections.json'.format(base_task_name))
-
  with open(combined_api_output_file,'w') as f:
  json.dump(combined_results,f,indent=1)

@@ -675,88 +807,8 @@ options.api_output_file = combined_api_output_file
  options.output_dir = output_base
  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file
- path_utils.open_file(html_output_file)
-
-
- #%% RDE (sample directory collapsing)
-
- #
- # The next few cells are about repeat detection elimination; if you want to skip this,
- # and still do other stuff in this notebook (e.g. running classifiers), that's fine, but
- # the rest of the notebook weakly assumes you've done this. Specifically, it looks for
- # the variable "filtered_api_output_file" (a file produced by the RDE process). If you
- # don't run the RDE cells, just change "filtered_api_output_file" to "combined_api_output_file"
- # (the raw output from MegaDetector). Then it will be like all this RDE stuff doesn't exist.
- #
- # Though FWIW, once you're sufficiently power-user-ish to use this notebook, RDE is almost
- # always worth it.
- #
-
- def relative_path_to_location(relative_path):
- """
- This is a sample function that returns a camera name given an image path. By
- default in the RDE process, leaf-node folders are equivalent to cameras. To map
- something other than leaf-node folders to cameras, fill in this function, and un-comment the
- line below containing "relative_path_to_location".
-
- Sample regular expressions are included here for common patterns, particularly the
- overflow folders created by Reconyx and Bushnell camera traps. So if one of those
- fits your scenario, you don't have to modify this function, just un-comment the line
- below that enables this feature.
-
- Nothing bad happens if you have overflow folders like this and you don't
- enable this mapping, you are just taking a more conservative approach to RDE in that
- scenario.
- """
-
- import re
-
- # 100RECNX is the overflow folder style for Reconyx cameras
- # 100EK113 is (for some reason) the overflow folder style for Bushnell cameras
- # 100_BTCF is the overflow folder style for Browning cameras
- patterns = ['\/\d+RECNX\/','\/\d+EK\d+\/','\/\d+_BTCF\/']
-
- relative_path = relative_path.replace('\\','/')
- for pat in patterns:
- relative_path = re.sub(pat,'/',relative_path)
- location_name = os.path.dirname(relative_path)
-
- return location_name
-
-
- #%% Test cells for relative_path_to_location
-
- if False:
-
- pass
-
- #%% Test the generic cases
-
- relative_path = 'a/b/c/d/100EK113/blah.jpg'
- print(relative_path_to_location(relative_path))
-
- relative_path = 'a/b/c/d/100RECNX/blah.jpg'
- print(relative_path_to_location(relative_path))
-
-
- #%% Test relative_path_to_location on the current dataset
-
- with open(combined_api_output_file,'r') as f:
- d = json.load(f)
- image_filenames = [im['file'] for im in d['images']]
-
- location_names = set()
-
- # relative_path = image_filenames[0]
- for relative_path in tqdm(image_filenames):
- location_name = relative_path_to_location(relative_path)
- location_names.add(location_name)
-
- location_names = list(location_names)
- location_names.sort()
-
- for s in location_names:
- print(s)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
+ # import clipboard; clipboard.copy(html_output_file)


  #%% Repeat detection elimination, phase 1
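In this release, the in-notebook relative_path_to_location sample removed above is replaced by image_file_to_camera_folder, imported from md_utils.ct_utils and assigned near the top of the script. A minimal sketch of the equivalent behavior, based on the removed helper; the function name below is illustrative, and image_file_to_camera_folder itself may differ in detail:

import os
import re

def camera_folder_from_relative_path(relative_path):
    # Collapse Reconyx/Bushnell/Browning-style overflow folders (100RECNX,
    # 100EK113, 100_BTCF), then treat the parent folder as the camera/location
    patterns = [r'/\d+RECNX/', r'/\d+EK\d+/', r'/\d+_BTCF/']
    relative_path = relative_path.replace('\\', '/')
    for pat in patterns:
        relative_path = re.sub(pat, '/', relative_path)
    return os.path.dirname(relative_path)

print(camera_folder_from_relative_path('site-a/cam-01/100RECNX/IMG0001.JPG'))
# 'site-a/cam-01'
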
@@ -768,7 +820,7 @@ task_index = 0

  options = repeat_detections_core.RepeatDetectionOptions()

- options.confidenceMin = 0.15
+ options.confidenceMin = 0.1
  options.confidenceMax = 1.01
  options.iouThreshold = 0.85
  options.occurrenceThreshold = 15
@@ -785,13 +837,13 @@ options.otherDetectionsThreshold = options.confidenceMin

  options.bRenderDetectionTiles = True
  options.maxOutputImageWidth = 2000
- options.detectionTilesMaxCrops = 500
+ options.detectionTilesMaxCrops = 250

  # options.lineThickness = 5
  # options.boxExpansion = 8

  # To invoke custom collapsing of folders for a particular manufacturer's naming scheme
- # options.customDirNameFunction = relative_path_to_location; overflow_folder_handling_enabled = True
+ options.customDirNameFunction = relative_path_to_location

  options.bRenderHtml = False
  options.imageBase = input_path
@@ -816,9 +868,9 @@ options.debugMaxRenderInstance = -1
  # Can be None, 'xsort', or 'clustersort'
  options.smartSort = 'xsort'

- suspiciousDetectionResults = repeat_detections_core.find_repeat_detections(combined_api_output_file,
- None,
- options)
+ suspicious_detection_results = repeat_detections_core.find_repeat_detections(combined_api_output_file,
+ outputFilename=None,
+ options=options)


  #%% Manual RDE step
@@ -826,7 +878,8 @@ suspiciousDetectionResults = repeat_detections_core.find_repeat_detections(combi
  ## DELETE THE VALID DETECTIONS ##

  # If you run this line, it will open the folder up in your file browser
- path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))
+ path_utils.open_file(os.path.dirname(suspicious_detection_results.filterFile),
+ attempt_to_open_in_wsl_host=True)

  #
  # If you ran the previous cell, but then you change your mind and you don't want to do
@@ -834,7 +887,7 @@ path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))
  # previous cell. If you do that, you're implicitly telling the notebook that you looked
  # at everything in that folder, and confirmed there were no red boxes on animals.
  #
- # Instead, either change "filtered_api_output_file" below to "combined_api_output_file",
+ # Instead, either change "filtered_output_filename" below to "combined_api_output_file",
  # or delete *all* the images in the filtering folder.
  #

@@ -843,12 +896,13 @@ path_utils.open_file(os.path.dirname(suspiciousDetectionResults.filterFile))

  from api.batch_processing.postprocessing.repeat_detection_elimination import remove_repeat_detections

- filtered_output_filename = path_utils.insert_before_extension(combined_api_output_file, 'filtered_{}'.format(rde_string))
+ filtered_output_filename = path_utils.insert_before_extension(combined_api_output_file,
+ 'filtered_{}'.format(rde_string))

  remove_repeat_detections.remove_repeat_detections(
  inputFile=combined_api_output_file,
  outputFile=filtered_output_filename,
- filteringDir=os.path.dirname(suspiciousDetectionResults.filterFile)
+ filteringDir=os.path.dirname(suspicious_detection_results.filterFile)
  )


@@ -890,7 +944,8 @@ options.output_dir = output_base
  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file

- path_utils.open_file(html_output_file)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
+ # import clipboard; clipboard.copy(html_output_file)


  #%% Run MegaClassifier (actually, write out a script that runs MegaClassifier)
@@ -899,6 +954,11 @@ path_utils.open_file(html_output_file)
  final_output_path_mc = None
  final_output_path_ic = None

+ # If we didn't do RDE
+ if filtered_output_filename is None:
+ print("Warning: it looks like you didn't do RDE, using the raw output file")
+ filtered_output_filename = combined_api_output_file
+
  classifier_name_short = 'megaclassifier'
  threshold_str = '0.15' # 0.6
  classifier_name = 'megaclassifier_v0.1_efficientnet-b3'
@@ -1086,7 +1146,6 @@ with open(output_file,'w') as f:
  for s in commands:
  f.write('{}'.format(s))

- import stat
  st = os.stat(output_file)
  os.chmod(output_file, st.st_mode | stat.S_IEXEC)

@@ -1256,8 +1315,6 @@ os.chmod(output_file, st.st_mode | stat.S_IEXEC)

  #%% Within-image classification smoothing

- from collections import defaultdict
-
  #
  # Only count detections with a classification confidence threshold above
  # *classification_confidence_threshold*, which in practice means we're only
@@ -1516,7 +1573,7 @@ else:
  import datetime
  from data_management.read_exif import parse_exif_datetime_string

- min_valid_timestamp_year = 2015
+ min_valid_timestamp_year = 2001

  now = datetime.datetime.now()

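min_valid_timestamp_year (now 2001 rather than 2015) is a sanity bound on parsed EXIF timestamps. A sketch of the kind of check it supports; this is an assumed illustration, not the script's exact validation code:

import datetime

def exif_datetime_looks_valid(exif_dt, min_valid_timestamp_year=2001):
    # Reject missing timestamps, implausibly old years, and dates in the future
    if exif_dt is None:
        return False
    now = datetime.datetime.now()
    return (exif_dt.year >= min_valid_timestamp_year) and (exif_dt <= now)
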
@@ -1540,6 +1597,7 @@ for exif_result in tqdm(exif_results):

  im['file_name'] = exif_result['file_name']
  im['id'] = im['file_name']
+
  if ('exif_tags' not in exif_result) or (exif_result['exif_tags'] is None) or \
  (exif_datetime_tag not in exif_result['exif_tags']):
  exif_dt = None
@@ -1573,7 +1631,7 @@ for exif_result in tqdm(exif_results):

  # ...for each exif image result

- print('Parsed EXIF datetime information, unable to parse EXIF data from {} of {} images'.format(
+ print('Parsed EXIF datetime information, unable to parse EXIF date from {} of {} images'.format(
  len(images_without_datetime),len(exif_results)))


@@ -1639,7 +1697,7 @@ min_dominant_class_classifications_above_threshold_for_class_smoothing = 5 # 2
  max_secondary_class_classifications_above_threshold_for_class_smoothing = 5

  # If the ratio between a dominant class and a secondary class count is greater than this,
- # regardless of the secondary class count, switch those classificaitons (i.e., ignore
+ # regardless of the secondary class count, switch those classifications (i.e., ignore
  # max_secondary_class_classifications_above_threshold_for_class_smoothing).
  #
  # This may be different for different dominant classes, e.g. if we see lots of cows, they really
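These thresholds drive the within-image classification smoothing described earlier in the script: if one class clearly dominates the above-threshold classifications in an image, minority classifications get relabeled. A simplified sketch of that rule; the ratio threshold and the helper below are illustrative assumptions, and the real implementation (not shown in this hunk) handles more cases:

def smoothing_decision(class_counts,
                       min_dominant_count=5,
                       max_secondary_count=5,
                       min_dominant_ratio=3.0):  # ratio threshold is assumed
    # class_counts: {class_name: above-threshold classification count} for one image
    dominant = max(class_counts, key=class_counts.get)
    relabel = []
    for other, count in class_counts.items():
        if other == dominant:
            continue
        ratio = class_counts[dominant] / max(count, 1)
        if class_counts[dominant] >= min_dominant_count and \
                (count <= max_secondary_count or ratio > min_dominant_ratio):
            relabel.append(other)
    return dominant, relabel

print(smoothing_decision({'deer': 8, 'cow': 1}))  # ('deer', ['cow'])
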
@@ -1959,8 +2017,8 @@ print('Processing {} to {}'.format(base_task_name, output_base))
  options.api_output_file = sequence_smoothed_classification_file
  options.output_dir = output_base
  ppresults = process_batch_results(options)
- path_utils.open_file(ppresults.output_html_file)
-
+ path_utils.open_file(ppresults.output_html_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
+ # import clipboard; clipboard.copy(ppresults.output_html_file)

  #% Zip .json files

@@ -2027,7 +2085,7 @@ for i, j in itertools.combinations(list(range(0,len(filenames))),2):
  results = compare_batch_results(options)

  from md_utils.path_utils import open_file
- open_file(results.html_output_file)
+ open_file(results.html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')


  #%% Merge in high-confidence detections from another results file
@@ -2081,7 +2139,7 @@ options.output_dir = output_base_large_boxes

  ppresults = process_batch_results(options)
  html_output_file = ppresults.output_html_file
- path_utils.open_file(html_output_file)
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')


  #%% .json splitting
@@ -2094,12 +2152,6 @@ from api.batch_processing.postprocessing.subset_json_detector_output import (
  input_filename = filtered_output_filename
  output_base = os.path.join(combined_api_output_folder,base_task_name + '_json_subsets')

- if False:
- if data is None:
- with open(input_filename) as f:
- data = json.load(f)
- print('Data set contains {} images'.format(len(data['images'])))
-
  print('Processing file {} to {}'.format(input_filename,output_base))

  options = SubsetJsonDetectorOutputOptions()
@@ -2204,13 +2256,47 @@ video_output_filename = filtered_output_filename.replace('.json','_aggregated.js
  frame_results_to_video_results(filtered_output_filename,video_output_filename)


+ #%% Sample custom path replacement function
+
+ def custom_relative_path_to_location(relative_path):
+
+ relative_path = relative_path.replace('\\','/')
+ tokens = relative_path.split('/')
+ location_name = '/'.join(tokens[0:2])
+ return location_name
+
+
+ #%% Test relative_path_to_location on the current dataset
+
+ with open(combined_api_output_file,'r') as f:
+ d = json.load(f)
+ image_filenames = [im['file'] for im in d['images']]
+
+ location_names = set()
+
+ # relative_path = image_filenames[0]
+ for relative_path in tqdm(image_filenames):
+ location_name = relative_path_to_location(relative_path)
+ location_names.add(location_name)
+
+ location_names = list(location_names)
+ location_names.sort()
+
+ for s in location_names:
+ print(s)
+
+
  #%% End notebook: turn this script into a notebook (how meta!)

  import os
  import nbformat as nbf

- input_py_file = os.path.expanduser(
- '~/git/MegaDetector/api/batch_processing/data_preparation/manage_local_batch.py')
+ if os.name == 'nt':
+ git_base = r'c:\git'
+ else:
+ git_base = os.path.expanduser('~/git')
+
+ input_py_file = git_base + '/MegaDetector/api/batch_processing/data_preparation/manage_local_batch.py'
  assert os.path.isfile(input_py_file)
  output_ipynb_file = input_py_file.replace('.py','.ipynb')

@@ -2233,14 +2319,23 @@ i_line = 0

  header_comment = ''

+ # Delete a few lines from the top that don't belong in the NB version, e.g. the name
+ # of the .py file
  lines_to_ignore = 7
+ expected_first_token = '# This script'
+ found_first_token = False

  # Everything before the first cell is the header comment
  while(not lines[i_line].startswith('#%%')):
+
  if i_line < lines_to_ignore:
  i_line += 1
  continue

+ if not found_first_token:
+ assert lines[i_line].startswith(expected_first_token)
+ found_first_token = True
+
  s = lines[i_line].replace('#','').strip()
  if len(s) == 0:
  header_comment += '\n\n'
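
This final section converts the .py script into a notebook by splitting on '#%%' cell markers and writing the result with nbformat. A minimal, self-contained sketch of that conversion pattern (cell contents and the output path here are placeholders, not the script's real cells):

import nbformat as nbf

nb = nbf.v4.new_notebook()
nb['cells'].append(nbf.v4.new_markdown_cell('Header comment extracted from the script'))
nb['cells'].append(nbf.v4.new_code_cell("print('one #%% cell becomes one code cell')"))

with open('example_output.ipynb', 'w') as f:
    nbf.write(nb, f)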