megadetector-5.0.7-py3-none-any.whl → megadetector-5.0.8-py3-none-any.whl
- api/batch_processing/data_preparation/manage_local_batch.py +28 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
- api/batch_processing/postprocessing/compare_batch_results.py +1 -1
- api/batch_processing/postprocessing/convert_output_format.py +24 -6
- api/batch_processing/postprocessing/load_api_results.py +1 -3
- api/batch_processing/postprocessing/md_to_labelme.py +118 -51
- api/batch_processing/postprocessing/merge_detections.py +30 -5
- api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
- api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- data_management/cct_json_utils.py +7 -2
- data_management/coco_to_labelme.py +263 -0
- data_management/coco_to_yolo.py +7 -4
- data_management/databases/integrity_check_json_db.py +68 -59
- data_management/databases/subset_json_db.py +1 -1
- data_management/get_image_sizes.py +44 -26
- data_management/importers/animl_results_to_md_results.py +1 -3
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/labelme_to_coco.py +252 -143
- data_management/labelme_to_yolo.py +95 -52
- data_management/lila/create_lila_blank_set.py +106 -23
- data_management/lila/download_lila_subset.py +133 -65
- data_management/lila/generate_lila_per_image_labels.py +1 -1
- data_management/lila/lila_common.py +8 -38
- data_management/read_exif.py +65 -16
- data_management/remap_coco_categories.py +84 -0
- data_management/resize_coco_dataset.py +3 -22
- data_management/wi_download_csv_to_coco.py +239 -0
- data_management/yolo_to_coco.py +283 -83
- detection/run_detector_batch.py +12 -3
- detection/run_inference_with_yolov5_val.py +10 -3
- detection/run_tiled_inference.py +2 -2
- detection/tf_detector.py +2 -1
- detection/video_utils.py +1 -1
- md_utils/ct_utils.py +22 -3
- md_utils/md_tests.py +11 -2
- md_utils/path_utils.py +206 -32
- md_utils/url_utils.py +66 -1
- md_utils/write_html_image_list.py +12 -3
- md_visualization/visualization_utils.py +363 -72
- md_visualization/visualize_db.py +33 -10
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
- md_visualization/visualize_megadb.py +0 -183
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
data_management/labelme_to_yolo.py

@@ -11,6 +11,9 @@
 import os
 import json
 
+from multiprocessing.pool import Pool, ThreadPool
+from functools import partial
+
 from md_utils.path_utils import recursive_file_list
 from tqdm import tqdm
 
@@ -21,22 +24,21 @@ def labelme_file_to_yolo_file(labelme_file,
                               category_name_to_category_id,
                               yolo_file=None,
                               required_token=None,
-                              right_edge_quantization_threshold=None,
                               overwrite_behavior='overwrite'):
     """
     Convert the single .json file labelme_file to yolo format, writing the results to the text
     file yolo_file (defaults to s/json/txt).
 
-    If required_token is not None and the labelme_file does not contain the key [required_token],
-    no-ops.
+    If required_token is not None and the dict in labelme_file does not contain the key [required_token],
+    this function no-ops (i.e., does not generate a YOLO file).
 
-
-    boxes that really should be running off the right side of the image only extend like 99%
-    of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-    within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-    0.02) of the right edge of the image, it will be extended to the far right edge.
+    overwrite_behavior should be 'skip' or 'overwrite' (default).
     """
 
+    result = {}
+    result['labelme_file'] = labelme_file
+    result['status'] = 'unknown'
+
     assert os.path.isfile(labelme_file), 'Could not find labelme .json file {}'.format(labelme_file)
     assert labelme_file.endswith('.json'), 'Illegal labelme .json file {}'.format(labelme_file)
 
@@ -45,7 +47,8 @@ def labelme_file_to_yolo_file(labelme_file,
 
     if os.path.isfile(yolo_file):
         if overwrite_behavior == 'skip':
-            return
+            result['status'] = 'skip-exists'
+            return result
         else:
             assert overwrite_behavior == 'overwrite', \
                 'Unrecognized overwrite behavior {}'.format(overwrite_behavior)
@@ -54,7 +57,8 @@ def labelme_file_to_yolo_file(labelme_file,
         labelme_data = json.load(f)
 
     if required_token is not None and required_token not in labelme_data:
-        return
+        result['status'] = 'skip-no-required-token'
+        return result
 
     im_height = labelme_data['imageHeight']
     im_width = labelme_data['imageWidth']
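The new result dict gives callers a machine-readable outcome for each file. A minimal sketch of consuming it, based only on the signature and statuses visible in the hunks above (the file path and category mapping here are hypothetical):

```python
from data_management.labelme_to_yolo import labelme_file_to_yolo_file

# Hypothetical example file; any labelme .json file works here
labelme_file = '/data/labels/example.json'

result = labelme_file_to_yolo_file(labelme_file,
                                   category_name_to_category_id={'animal': 0},
                                   yolo_file=None,
                                   required_token='saved_by_labelme',
                                   overwrite_behavior='skip')

# Statuses set in the diffed code: 'skip-exists' (the .txt file already
# exists and overwrite_behavior is 'skip'), 'skip-no-required-token'
# (the .json file lacks the required key), or 'converted'.
if result['status'].startswith('skip'):
    print('Skipped {}: {}'.format(result['labelme_file'], result['status']))
else:
    assert result['status'] == 'converted'
```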
@@ -83,10 +87,12 @@ def labelme_file_to_yolo_file(labelme_file,
 
         if (minx_abs >= (im_width-1)) or (maxx_abs <= 0) or \
            (miny_abs >= (im_height-1)) or (maxy_abs <= 0):
-            print('Skipping invalid shape in {}'.format(labelme_file))
+            print('Skipping invalid shape in {}'.format(labelme_file))
             continue
 
-        # Clip to [0,1]
+        # Clip to [0,1]... it's not obvious that the YOLO format doesn't allow bounding
+        # boxes to extend outside the image, but YOLOv5 and YOLOv8 get sad about boxes
+        # that extend outside the image.
         maxx_abs = min(maxx_abs,im_width-1)
         maxy_abs = min(maxy_abs,im_height-1)
         minx_abs = max(minx_abs,0.0)
@@ -97,11 +103,6 @@ def labelme_file_to_yolo_file(labelme_file,
         miny_rel = miny_abs / (im_height-1)
         maxy_rel = maxy_abs / (im_height-1)
 
-        if (right_edge_quantization_threshold is not None):
-            right_edge_distance = 1.0 - maxx_rel
-            if right_edge_distance < right_edge_quantization_threshold:
-                maxx_rel = 1.0
-
         assert maxx_rel >= minx_rel
         assert maxy_rel >= miny_rel
 
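For reference, the clipping and normalization in the two hunks above amount to the following arithmetic. This is a standalone sketch, not the library code; it assumes a labelme 'rectangle' shape given as two absolute-pixel corners:

```python
def labelme_rect_to_yolo_line(points, im_width, im_height, category_id):
    """
    Convert a labelme rectangle (two [x,y] corners, in absolute pixels)
    to a YOLO-format line.  Sketch only; mirrors the clipping and
    normalization shown in the hunks above.
    """
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    minx_abs, maxx_abs = min(xs), max(xs)
    miny_abs, maxy_abs = min(ys), max(ys)

    # Clip to the image; YOLOv5/YOLOv8 reject boxes that extend outside it
    maxx_abs = min(maxx_abs, im_width - 1)
    maxy_abs = min(maxy_abs, im_height - 1)
    minx_abs = max(minx_abs, 0.0)
    miny_abs = max(miny_abs, 0.0)

    # Normalize to [0,1], dividing by (dimension - 1) as the library does
    minx_rel = minx_abs / (im_width - 1)
    maxx_rel = maxx_abs / (im_width - 1)
    miny_rel = miny_abs / (im_height - 1)
    maxy_rel = maxy_abs / (im_height - 1)

    # YOLO wants a normalized box center and size
    x_center = (minx_rel + maxx_rel) / 2.0
    y_center = (miny_rel + maxy_rel) / 2.0
    w = maxx_rel - minx_rel
    h = maxy_rel - miny_rel

    return '{} {} {} {} {}'.format(category_id, x_center, y_center, w, h)

print(labelme_rect_to_yolo_line([[10, 20], [110, 220]], 640, 480, 0))
```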
@@ -119,32 +120,45 @@ def labelme_file_to_yolo_file(labelme_file,
     with open(yolo_file,'w') as f:
         for s in yolo_lines:
             f.write(s + '\n')
-
+
+    result['status'] = 'converted'
+    return result
+
 
 def labelme_folder_to_yolo(labelme_folder,
                            category_name_to_category_id=None,
                            required_token=None,
-                           right_edge_quantization_threshold=None,
-                           overwrite_behavior='overwrite'):
+                           overwrite_behavior='overwrite',
+                           relative_filenames_to_convert=None,
+                           n_workers=1,
+                           use_threads=True):
     """
     Given a folder with images and labelme .json files, convert the .json files
     to YOLO .txt format. If category_name_to_category_id is None, first reads
     all the labels in the folder to build a zero-indexed name --> ID mapping.
 
     If required_token is not None and a labelme_file does not contain the key [required_token],
-    it won't be converted.
+    it won't be converted. Typically used to specify a field that indicates which files have
+    been reviewed.
 
-
-
-    of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-    within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-    0.02) of the right edge of the image, it will be extended to the far right edge.
+    If relative_filenames_to_convert is not None, this should be a list of .json (not image)
+    files that should get converted, relative to the base folder.
 
-
+    overwrite_behavior should be 'skip' or 'overwrite' (default).
+
+    returns a dict with:
+        'category_name_to_category_id', whether it was passed in or constructed
+        'image_results': a list of results for each image (converted, skipped, error)
+
     """
 
-    labelme_files_relative = recursive_file_list(labelme_folder,return_relative_paths=True)
-    labelme_files_relative = [fn for fn in labelme_files_relative if fn.endswith('.json')]
+    if relative_filenames_to_convert is not None:
+        labelme_files_relative = relative_filenames_to_convert
+        assert all([fn.endswith('.json') for fn in labelme_files_relative]), \
+            'relative_filenames_to_convert contains non-json files'
+    else:
+        labelme_files_relative = recursive_file_list(labelme_folder,return_relative_paths=True)
+        labelme_files_relative = [fn for fn in labelme_files_relative if fn.endswith('.json')]
 
     if required_token is None:
         valid_labelme_files_relative = labelme_files_relative
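Because labelme_folder_to_yolo now returns a results dict rather than None, a caller can tally per-file outcomes. A sketch assuming the statuses set in labelme_file_to_yolo_file above (the folder path is hypothetical):

```python
from collections import Counter

from data_management.labelme_to_yolo import labelme_folder_to_yolo

# Hypothetical folder of images plus labelme .json files
results = labelme_folder_to_yolo('/data/labels',
                                 required_token='saved_by_labelme',
                                 n_workers=4,
                                 use_threads=True)

# The mapping is returned even when it was built from the folder contents
print(results['category_name_to_category_id'])

# Tally per-file outcomes ('converted', 'skip-exists', etc.)
status_counts = Counter(r['status'] for r in results['image_results'])
print(status_counts)
```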
@@ -163,9 +177,9 @@ def labelme_folder_to_yolo(labelme_folder,
 
             valid_labelme_files_relative.append(fn_relative)
 
-
-
-
+    print('{} of {} files are valid'.format(len(valid_labelme_files_relative),
+                                            len(labelme_files_relative)))
+
     del labelme_files_relative
 
     if category_name_to_category_id is None:
@@ -184,26 +198,54 @@ def labelme_folder_to_yolo(labelme_folder,
     # ...for each file
 
     # ...if we need to build a category mapping
-
-    for fn_relative in tqdm(valid_labelme_files_relative):
-
-        fn_abs = os.path.join(labelme_folder,fn_relative)
-        labelme_file_to_yolo_file(fn_abs,
-                                  category_name_to_category_id,
-                                  yolo_file=None,
-                                  required_token=required_token,
-                                  right_edge_quantization_threshold=\
-                                      right_edge_quantization_threshold,
-                                  overwrite_behavior=overwrite_behavior)
 
-
+    image_results = []
+
+    n_workers = min(n_workers,len(valid_labelme_files_relative))
+
+    if n_workers <= 1:
+        for fn_relative in tqdm(valid_labelme_files_relative):
+
+            fn_abs = os.path.join(labelme_folder,fn_relative)
+            image_result = labelme_file_to_yolo_file(fn_abs,
+                                                     category_name_to_category_id,
+                                                     yolo_file=None,
+                                                     required_token=required_token,
+                                                     overwrite_behavior=overwrite_behavior)
+            image_results.append(image_result)
+        # ...for each file
+    else:
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        valid_labelme_files_abs = [os.path.join(labelme_folder,fn_relative) for \
+                                   fn_relative in valid_labelme_files_relative]
+
+        image_results = list(tqdm(pool.imap(
+            partial(labelme_file_to_yolo_file,
+                    category_name_to_category_id=category_name_to_category_id,
+                    yolo_file=None,
+                    required_token=required_token,
+                    overwrite_behavior=overwrite_behavior),
+            valid_labelme_files_abs),
+            total=len(valid_labelme_files_abs)))
+
+    assert len(valid_labelme_files_relative) == len(image_results)
 
     print('Converted {} labelme .json files to YOLO'.format(
         len(valid_labelme_files_relative)))
 
-
+    labelme_to_yolo_results = {}
+    labelme_to_yolo_results['category_name_to_category_id'] = category_name_to_category_id
+    labelme_to_yolo_results['image_results'] = image_results
 
-
+    return labelme_to_yolo_results
+
+# ...def labelme_folder_to_yolo(...)
+
+
 #%% Interactive driver
 
 if False:
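The parallel branch above uses a standard pattern: functools.partial freezes the keyword arguments, and pool.imap streams results lazily so tqdm can show progress. A self-contained sketch of the same pattern with a toy worker function (all names here are illustrative, not from the package):

```python
from functools import partial
from multiprocessing.pool import Pool, ThreadPool

from tqdm import tqdm

def process_file(fn, suffix=None):
    # Stand-in for labelme_file_to_yolo_file
    return {'file': fn, 'status': 'converted', 'suffix': suffix}

def process_all(filenames, n_workers=4, use_threads=True):
    n_workers = min(n_workers, len(filenames))
    if n_workers <= 1:
        return [process_file(fn, suffix='.txt') for fn in tqdm(filenames)]
    pool = ThreadPool(n_workers) if use_threads else Pool(n_workers)
    try:
        # imap yields results one at a time, so tqdm can track progress;
        # partial pins every argument except the filename
        return list(tqdm(pool.imap(partial(process_file, suffix='.txt'),
                                   filenames),
                         total=len(filenames)))
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    print(process_all(['a.json', 'b.json']))
```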
@@ -212,18 +254,19 @@ if False:
 
     #%%
 
-    import os
     labelme_file = os.path.expanduser('~/tmp/labels/x.json')
-    yolo_file = None
     required_token = 'saved_by_labelme'
-    right_edge_quantization_threshold = 0.015
     category_name_to_category_id = {'animal':0}
+    labelme_folder = os.path.expanduser('~/tmp/labels')
 
     #%%
 
-
-
-
+    category_name_to_category_id = \
+        labelme_folder_to_yolo(labelme_folder,
+                               category_name_to_category_id=category_name_to_category_id,
+                               required_token=required_token,
+                               overwrite_behavior='overwrite')
+
 #%% Command-line driver
 
 # TODO
data_management/lila/create_lila_blank_set.py

@@ -4,7 +4,11 @@
 #
 # Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
 # locations will be oversampled relative to more common locations. We'll also run MegaDetector
-#
+# (with manual review) to remove some incorrectly-labeled, not-actually-empty images from our
+# blank set.
+#
+# We'll store location information for each image in a .json file, so we can split locations
+# into train/val in downstream tasks.
 #
 ########
 
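The expanded header comment describes the sampling strategy. As a rough illustration of how capping each location at a fixed budget oversamples rare locations, here is a sketch with hypothetical data (the real script builds location_to_blank_image_urls from the LILA index):

```python
import random

# Hypothetical mapping from location IDs to blank-image URLs
location_to_blank_image_urls = {
    'loc-a': ['u1', 'u2', 'u3', 'u4', 'u5', 'u6'],
    'loc-b': ['u7', 'u8'],
}

max_blanks_per_location = 3

sampled_urls = []
for location, urls in location_to_blank_image_urls.items():
    # Capping every location at the same budget means rare locations
    # contribute a larger fraction of their images than common ones
    n_samples = min(len(urls), max_blanks_per_location)
    sampled_urls.extend(random.sample(urls, n_samples))

print(sampled_urls)
```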
@@ -14,7 +18,6 @@ import os
 import random
 import math
 import json
-import shutil
 
 import numpy as np
 from tqdm import tqdm
@@ -22,8 +25,7 @@ from multiprocessing.pool import ThreadPool
 from urllib.parse import urlparse
 from collections import defaultdict
 
-from data_management.lila.lila_common import \
-    read_lila_all_images_file, azure_url_to_gcp_http_url
+from data_management.lila.lila_common import read_lila_all_images_file
 from md_utils.url_utils import download_url
 from md_visualization import visualization_utils as vis_utils
 from md_utils.path_utils import recursive_file_list
@@ -45,6 +47,14 @@ os.makedirs(confirmed_blanks_base,exist_ok=True)
 md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
 os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
 
+location_to_blank_image_urls_cache_file = os.path.join(project_base,
+                                                       'location_to_blank_image_urls.json')
+
+md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+all_fn_relative_to_location_file = os.path.join(project_base,'all_fn_relative_to_location.json')
+confirmed_fn_relative_to_location_file = os.path.join(project_base,'confirmed_fn_relative_to_location.json')
+
 preferred_image_download_source = 'gcp'
 
 # Number of concurrent download threads
@@ -171,9 +181,6 @@ for s in original_labels_with_nan_common_names:
 
 #%% Map locations to blank images
 
-location_to_blank_image_urls_cache_file = os.path.join(project_base,
-                                                       'location_to_blank_image_urls.json')
-
 force_map_locations = False
 
 # Load from .json if available
@@ -275,7 +282,7 @@ print('Max samples per location: {}'.format(max_blanks_per_location))
 
 #%% Download those image files (prep)
 
-container_to_url_base = {
+container_to_url_base = {
     'lilablobssc.blob.core.windows.net':'/',
     'storage.googleapis.com':'/public-datasets-lila/'
 }
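container_to_url_base maps a URL's host to the path prefix that gets stripped when deriving an output-relative filename. A sketch of how such a mapping might be applied; the helper name here is hypothetical, and the real download_relative_filename signature is only partially visible in this diff:

```python
from urllib.parse import urlparse

container_to_url_base = {
    'lilablobssc.blob.core.windows.net': '/',
    'storage.googleapis.com': '/public-datasets-lila/'
}

def url_to_relative_path(url):
    # Hypothetical helper: strip the per-host prefix from the URL path
    p = urlparse(url)
    url_base = container_to_url_base[p.netloc]
    assert p.path.startswith(url_base)
    return p.path[len(url_base):]

print(url_to_relative_path(
    'https://storage.googleapis.com/public-datasets-lila/some-dataset/image.jpg'))
# -> 'some-dataset/image.jpg'
```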
@@ -318,6 +325,21 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None, o
     result['status'] = 'success'
     return result
 
+def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
+    """
+    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
+    This function converts an Azure URL to the corresponding GCP http:// url.
+    """
+
+    lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
+    gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
+    error_if_not_azure_url = False
+
+    if error_if_not_azure_url:
+        assert url.startswith(lila_azure_storage_account)
+    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
+    return gcp_url
+
 # Convert Azure URLs to GCP URLs if necessary
 if preferred_image_download_source != 'azure':
     assert preferred_image_download_source == 'gcp'
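The inlined azure_url_to_gcp_http_url is a prefix substitution (note that, as committed, it sets error_if_not_azure_url to False internally, which disables the assert). Its effect on the example URL that appears later in this script:

```python
azure_url = ('https://lilablobssc.blob.core.windows.net/'
             'caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg')

lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'

# Replace only the first occurrence of the Azure prefix
gcp_url = azure_url.replace(lila_azure_storage_account, gcp_bucket_api_url, 1)
print(gcp_url)
# https://storage.googleapis.com/public-datasets-lila/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
```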
@@ -358,8 +380,6 @@ print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
 
 #%% Run MegaDetector on the folder
 
-md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
-
 cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
     candidate_blanks_base,md_results_file)
 cmd += ' --recursive --output_relative_filenames'
@@ -419,6 +439,7 @@ for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
                                       confidence_threshold=min_threshold,
                                       target_size=(1280,-1))
 
+# This is a temporary file I just used during debugging
 with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
     json.dump(output_file_to_source_file,f,indent=1)
 
@@ -442,33 +463,95 @@ for output_file in tqdm(output_file_to_source_file.keys()):
         source_file_relative = output_file_to_source_file[output_file]
         removed_blank_images_relative.append(source_file_relative)
 
+removed_blank_images_relative_set = set(removed_blank_images_relative)
 assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
 
 
-#%% Copy
+#%% Copy only the confirmed blanks to the confirmed folder
+
+from md_utils.path_utils import is_image_file
 
 all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
 print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
 
+skipped_images_relative = []
+skipped_non_images = []
+
 for source_fn_relative in tqdm(all_candidate_blanks):
+
+    # Skip anything we removed from the "candidate non-blanks" folder; these weren't really
+    # blank.
+    if source_fn_relative in removed_blank_images_relative_set:
+        skipped_images_relative.append(source_fn_relative)
+        continue
+
+    if not is_image_file(source_fn_relative):
+        # Not a typo; "skipped images" really means "skipped files"
+        skipped_images_relative.append(source_fn_relative)
+        skipped_non_images.append(source_fn_relative)
+
+
     source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
     assert os.path.isfile(source_fn_abs)
     target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
     os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
-    shutil.copyfile(source_fn_abs,target_fn_abs)
+    # shutil.copyfile(source_fn_abs,target_fn_abs)
 
+print('Skipped {} files ({} non-image files)'.format(len(skipped_images_relative),
+                                                     len(skipped_non_images)))
 
-#%% Record location information for each file
 
-
-
-
-
-
-
-
-all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+#%% Validate the folder of confirmed blanks
+
+from md_utils.path_utils import find_images
+# all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+all_confirmed_blanks = find_images(confirmed_blanks_base,return_relative_paths=True,recursive=True)
+assert len(all_confirmed_blanks) < len(all_candidate_blanks)
 print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
 
-
-
+
+#%% Manually review a few of the images we skipped
+
+# ...to make sure they're non-blank
+i_image = random.randint(0, len(skipped_images_relative))
+fn_relative = skipped_images_relative[i_image]
+fn_abs = os.path.join(candidate_blanks_base,fn_relative)
+assert os.path.isfile(fn_abs)
+import clipboard
+clipboard.copy('feh --scale-down "{}"'.format(fn_abs))
+
+
+#%% Record location information for each confirmed file
+
+# Map every URL's path to the corresponding location
+#
+# This is *all empty URLs*, not just the ones we downloaded
+all_fn_relative_to_location = {}
+
+# location = next(iter(location_to_blank_image_urls.keys()))
+for location in tqdm(location_to_blank_image_urls):
+    urls_this_location = location_to_blank_image_urls[location]
+
+    # url = urls_this_location[0]
+    for url in urls_this_location:
+        # Turn:
+        #
+        # https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
+        #
+        # ...into:
+        #
+        # caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
+        p = urlparse(url)
+        fn_relative = str(p.path)[1:]
+        all_fn_relative_to_location[fn_relative] = location
+
+# Build a much smaller mapping of just the confirmed blanks
+confirmed_fn_relative_to_location = {}
+for i_fn,fn_relative in tqdm(enumerate(all_confirmed_blanks),total=len(all_confirmed_blanks)):
+    confirmed_fn_relative_to_location[fn_relative] = all_fn_relative_to_location[fn_relative]
+
+with open(all_fn_relative_to_location_file,'w') as f:
+    json.dump(all_fn_relative_to_location,f,indent=1)
+
+with open(confirmed_fn_relative_to_location_file,'w') as f:
+    json.dump(confirmed_fn_relative_to_location,f,indent=1)