megadetector 5.0.7__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (48)
  1. api/batch_processing/data_preparation/manage_local_batch.py +28 -14
  2. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  3. api/batch_processing/postprocessing/compare_batch_results.py +1 -1
  4. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  5. api/batch_processing/postprocessing/load_api_results.py +1 -3
  6. api/batch_processing/postprocessing/md_to_labelme.py +118 -51
  7. api/batch_processing/postprocessing/merge_detections.py +30 -5
  8. api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
  9. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  12. data_management/cct_json_utils.py +7 -2
  13. data_management/coco_to_labelme.py +263 -0
  14. data_management/coco_to_yolo.py +7 -4
  15. data_management/databases/integrity_check_json_db.py +68 -59
  16. data_management/databases/subset_json_db.py +1 -1
  17. data_management/get_image_sizes.py +44 -26
  18. data_management/importers/animl_results_to_md_results.py +1 -3
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/labelme_to_coco.py +252 -143
  21. data_management/labelme_to_yolo.py +95 -52
  22. data_management/lila/create_lila_blank_set.py +106 -23
  23. data_management/lila/download_lila_subset.py +133 -65
  24. data_management/lila/generate_lila_per_image_labels.py +1 -1
  25. data_management/lila/lila_common.py +8 -38
  26. data_management/read_exif.py +65 -16
  27. data_management/remap_coco_categories.py +84 -0
  28. data_management/resize_coco_dataset.py +3 -22
  29. data_management/wi_download_csv_to_coco.py +239 -0
  30. data_management/yolo_to_coco.py +283 -83
  31. detection/run_detector_batch.py +12 -3
  32. detection/run_inference_with_yolov5_val.py +10 -3
  33. detection/run_tiled_inference.py +2 -2
  34. detection/tf_detector.py +2 -1
  35. detection/video_utils.py +1 -1
  36. md_utils/ct_utils.py +22 -3
  37. md_utils/md_tests.py +11 -2
  38. md_utils/path_utils.py +206 -32
  39. md_utils/url_utils.py +66 -1
  40. md_utils/write_html_image_list.py +12 -3
  41. md_visualization/visualization_utils.py +363 -72
  42. md_visualization/visualize_db.py +33 -10
  43. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
  44. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
  45. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  46. md_visualization/visualize_megadb.py +0 -183
  47. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  48. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
data_management/lila/download_lila_subset.py

@@ -9,7 +9,7 @@
  # what you want to query for, etc., is very application-specific; this is just meant as a
  # demo.
  #
- # Can download from either Azure or GCP.
+ # Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
  #
  ########

@@ -20,15 +20,16 @@ import random

  from tqdm import tqdm
  from multiprocessing.pool import ThreadPool
- from urllib.parse import urlparse
  from collections import defaultdict

- from data_management.lila.lila_common import \
-     read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
+ from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
  from md_utils.url_utils import download_url

+ for s in lila_base_urls.values():
+     assert s.endswith('/')
+
  # If any of these strings appear in the common name of a species, we'll download that image
- species_of_interest = ['grey fox','red fox','leopard cat','kiwi']
+ species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']

  # We'll write images, metadata downloads, and temporary files here
  lila_local_base = os.path.expanduser('~/lila')
@@ -42,17 +43,61 @@ os.makedirs(output_dir,exist_ok=True)
  # Number of concurrent download threads
  n_download_threads = 20

+ verbose = False
+
  max_images_per_dataset = 10 # None

  # This impacts the data download, but not the metadata download
  #
- # "Azure" really means "Azure if available"; recent datasets are only available
- # on GCP.
- image_download_source = 'azure' # 'azure' or 'gcp'
+ # Setting this to "Azure" really means "Azure if available"; some datasets are
+ # not available on Azure.
+ preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'

  random.seed(0)


+ #%% Support functions
+
+ def download_relative_url(relative_url, output_base, provider='gcp',
+                           verbose=False, overwrite=False):
+     """
+     Download a URL to output_base, preserving the path relative to the common LILA root.
+     """
+
+     assert not relative_url.startswith('/')
+
+     # Not all datasets are available on Azure, fall back in these cases. The decision
+     # to fall back to GCP rather than AWS is arbitrary.
+     if provider == 'azure':
+         nominal_provider = relative_url_to_nominal_provider[relative_url]
+         if nominal_provider != 'azure':
+             if verbose:
+                 print('URL {} not available on Azure, falling back to GCP'.format(
+                     relative_url))
+             provider = 'gcp'
+
+     url = lila_base_urls[provider] + relative_url
+
+     result = {'status':'unknown','url':url,'destination_filename':None}
+
+     destination_filename = os.path.join(output_base,relative_url)
+     result['destination_filename'] = destination_filename
+
+     if ((os.path.isfile(destination_filename)) and (not overwrite)):
+         result['status'] = 'skipped'
+         return result
+     try:
+         download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
+     except Exception as e:
+         print('Warning: error downloading URL {}: {}'.format(
+             url,str(e)))
+         result['status'] = 'error: {}'.format(str(e))
+         return result
+
+     result['status'] = 'success'
+     return result
+
+
  #%% Download and open the giant table of image URLs and labels

  # ~60 seconds to download, unzip, and open
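
For orientation, here is a minimal sketch (not part of the diff) of calling the new download_relative_url helper directly; the relative path below is hypothetical, and output_dir, lila_base_urls, and download_url are the names defined earlier in this script:

    # Hypothetical relative path; any path from the LILA index would work the same way
    example_relative_url = 'some-dataset/images/example_image.jpg'

    r = download_relative_url(example_relative_url,
                              output_base=output_dir,
                              provider='gcp',
                              verbose=True)

    # 'status' is 'success', 'skipped', or an 'error: ...' string
    print(r['status'], r['destination_filename'])
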
@@ -63,6 +108,8 @@ df = read_lila_all_images_file(metadata_dir)

  # ~2 minutes

+ common_name_to_count = defaultdict(int)
+
  ds_name_to_urls = defaultdict(list)

  def find_items(row):
@@ -75,6 +122,7 @@ def find_items(row):
      for species_name in species_of_interest:
          if species_name in row['common_name']:
              match = True
+             common_name_to_count[species_name] += 1
              break

      if match:
@@ -83,15 +131,19 @@
  tqdm.pandas()
  _ = df.progress_apply(find_items,axis=1)

+ # We have a list of URLs for each dataset, flatten them all into a list of URLs
  all_urls = list(ds_name_to_urls.values())
  all_urls = [item for sublist in all_urls for item in sublist]
  print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))

+ for common_name in common_name_to_count:
+     print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
  from copy import deepcopy
  ds_name_to_urls_raw = deepcopy(ds_name_to_urls)


- #%% Trim to a fixed number of URLs per dataset
+ #%% Optionally trim to a fixed number of URLs per dataset

  if max_images_per_dataset is None:
      pass
@@ -102,74 +154,90 @@ else:
          ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)


- #%% Download those image files
+ #%% Convert URLs to be relative to the common LILA base

- container_to_url_base = {
-     'lilablobssc.blob.core.windows.net':'/',
-     'storage.googleapis.com':'/public-datasets-lila/'
-     }
+ all_urls = list(ds_name_to_urls.values())
+ all_urls = [item for sublist in all_urls for item in sublist]

- def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
-     """
-     Download a URL to output_base, preserving relative path
-     """
-
-     result = {'status':'unknown','url':url,'destination_filename':None}
-
-     if url_base is None:
-         assert url.startswith('https://')
-         container = url.split('/')[2]
-         assert container in container_to_url_base
-         url_base = container_to_url_base[container]
-
-     assert url_base.startswith('/') and url_base.endswith('/')
-
-     p = urlparse(url)
-     relative_filename = str(p.path)
-     # remove the leading '/'
-     assert relative_filename.startswith(url_base)
-     relative_filename = relative_filename.replace(url_base,'',1)
-
-     destination_filename = os.path.join(output_base,relative_filename)
-     result['destination_filename'] = destination_filename
-
-     if ((os.path.isfile(destination_filename)) and (not overwrite)):
-         result['status'] = 'skipped'
-         return result
-     try:
-         download_url(url, destination_filename, verbose=verbose)
-     except Exception as e:
-         print('Warning: error downloading URL {}: {}'.format(
-             url,str(e)))
-         result['status'] = 'error: {}'.format(str(e))
-         return result
-
-     result['status'] = 'success'
-     return result
+ all_urls_relative = []

+ # Each file has a nominal URL in the .csv file. For now, the only thing this tells us
+ # is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
+ # GCP and AWS.
+ #
+ # Keep track of the nominal provider for each URL.
+ relative_url_to_nominal_provider = {}
+
+ for url in all_urls:
+     found_base = False
+     for provider in lila_base_urls.keys():
+         base = lila_base_urls[provider]
+         if url.startswith(base):
+             relative_url = url.replace(base,'')
+             all_urls_relative.append(relative_url)
+             relative_url_to_nominal_provider[relative_url] = provider
+             found_base = True
+             break
+     assert found_base
+
+ assert len(all_urls) == len(all_urls_relative)

- # ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
- all_urls = list(ds_name_to_urls.values())
- all_urls = [item for sublist in all_urls for item in sublist]

- # Convert Azure URLs to GCP URLs if necessary
- if image_download_source != 'azure':
-     assert image_download_source == 'gcp'
-     all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
+ #%% Download image files

- print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
+ print('Downloading {} images on {} workers, preferred provider is {}'.format(
+     len(all_urls),n_download_threads,preferred_provider))

  if n_download_threads <= 1:

      results = []

-     # url = all_urls[0]
-     for url in tqdm(all_urls):
-         results.append(download_relative_filename(url,output_dir,url_base=None))
+     # url_relative = all_urls_relative[0]
+     for url_relative in tqdm(all_urls_relative):
+         result = download_relative_url(url_relative,
+                                        output_base=output_dir,
+                                        provider=preferred_provider,
+                                        verbose=verbose)
+         results.append(result)

  else:

      pool = ThreadPool(n_download_threads)
-     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
-         s,output_dir,url_base=None),
-         all_urls), total=len(all_urls)))
+     results = list(tqdm(pool.imap(lambda s: download_relative_url(
+         s,output_base=output_dir,provider=preferred_provider,verbose=verbose),
+         all_urls_relative), total=len(all_urls_relative)))
+
+
+ #%% Scrap
+
+ if False:
+
+     pass
+
+     #%% Find all the reptiles on LILA
+
+     reptile_rows = df.loc[df['class'] == 'reptilia']
+
+     # i_row = 0; row = reptile_rows.iloc[i_row]
+
+     common_name_to_count = defaultdict(int)
+     dataset_to_count = defaultdict(int)
+     for i_row,row in reptile_rows.iterrows():
+         common_name_to_count[row['common_name']] += 1
+         dataset_to_count[row['dataset_name']] += 1
+
+     from md_utils.ct_utils import sort_dictionary_by_value
+
+     print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+     common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+     dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+     print('Common names by count:\n')
+     for k in common_name_to_count:
+         print('{} ({})'.format(k,common_name_to_count[k]))
+
+     print('\nDatasets by count:\n')
+     for k in dataset_to_count:
+         print('{} ({})'.format(k,dataset_to_count[k]))
+
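
As a small follow-on sketch (not part of the diff), the results list built above could be summarized by status; only the result dicts returned by download_relative_url are assumed:

    from collections import defaultdict

    status_to_count = defaultdict(int)
    for r in results:
        # Collapse 'error: ...' strings into a single bucket
        s = 'error' if r['status'].startswith('error') else r['status']
        status_to_count[s] += 1

    for s in status_to_count:
        print('{}: {}'.format(s, status_to_count[s]))
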
data_management/lila/generate_lila_per_image_labels.py

@@ -338,7 +338,7 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:

  # ...with open()

- print('Processed {} datsets'.format(len(metadata_table)))
+ print('Processed {} datasets'.format(len(metadata_table)))


  #%% Read the .csv back
data_management/lila/lila_common.py

@@ -31,9 +31,13 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
  wildlife_insights_taxonomy_local_csv_filename = \
      wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

- lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
- gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
- gcp_bucket_gs_url = 'gs://public-datasets-lila'
+ # Filenames are consistent across clouds relative to these URLs
+ lila_base_urls = {
+     'azure':'https://lilablobssc.blob.core.windows.net/',
+     'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+     'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+ }
+


  #%% Common functions
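
To illustrate the convention behind the new lila_base_urls dict: the same container-relative filename resolves to a provider-specific URL by simple concatenation. A minimal sketch; the relative path here is made up:

    from data_management.lila.lila_common import lila_base_urls

    relative_path = 'some-dataset/images/example_image.jpg'  # hypothetical
    for provider, base in lila_base_urls.items():
        print('{}: {}'.format(provider, base + relative_path))
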
@@ -198,28 +202,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json
      return json_filename


- def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-     """
-     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-     This function converts an Azure URL to the corresponding GCP http:// url.
-     """
-
-     if error_if_not_azure_url:
-         assert url.startswith(lila_azure_storage_account)
-     gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-     return gcp_url
-
-
- def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
-     """
-     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-     This function converts an Azure URL to the corresponding GCP gs:// url.
-     """
-
-     return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
-         replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
-
-
  #%% Interactive test driver

  if False:
@@ -252,16 +234,4 @@ if False:
          urls_to_test.append(ds_info['bbox_url'])

      status_codes = url_utils.test_urls(urls_to_test)
-
-
-     #%% Verify that the GCP versions of all metadata files exist
-
-     gcp_urls = []
-
-     # url = urls_to_test[0]
-     for url in urls_to_test:
-         assert url.startswith(lila_azure_storage_account)
-         gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-         gcp_urls.append(gcp_url)
-
-     status_codes = url_utils.test_urls(gcp_urls)
+
data_management/read_exif.py

@@ -48,9 +48,18 @@ class ReadExifOptions:
      #
      # Not relevant if n_workers is 1.
      use_threads = True
-
+
+     # "File" and "ExifTool" are tag types used by ExifTool to report data that
+     # doesn't come from EXIF, rather from the file (e.g. file size).
      tag_types_to_ignore = set(['File','ExifTool'])

+     # Include/exclude specific tags (mutually incompatible)
+     tags_to_include = None
+     tags_to_exclude = None
+
+     # A useful set of tags one might want to limit queries for
+     # options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+
      exiftool_command_name = 'exiftool'

      # How should we handle byte-formatted EXIF tags?
@@ -62,16 +71,17 @@ class ReadExifOptions:

      # Should we use exiftool or pil?
      processing_library = 'pil' # 'exiftool','pil'
-
+
+

  #%% Functions

- def enumerate_files(input_folder):
+ def enumerate_files(input_folder,recursive=True):
      """
      Enumerates all image files in input_folder, returning relative paths
      """

-     image_files = find_images(input_folder,recursive=True)
+     image_files = find_images(input_folder,recursive=recursive)
      image_files = [os.path.relpath(s,input_folder) for s in image_files]
      image_files = [s.replace('\\','/') for s in image_files]
      print('Enumerated {} files'.format(len(image_files)))
@@ -99,7 +109,7 @@ def get_exif_ifd(exif):
  def read_pil_exif(im,options=None):
      """
      Read all the EXIF data we know how to read from [im] (path or PIL Image), whether it's
-     in the PIL default EXIF data or not.
+     in the PIL default EXIF data or not. Returns a dict.
      """

      if options is None:
@@ -192,6 +202,32 @@ def parse_exif_datetime_string(s,verbose=False):
      return dt


+ def _filter_tags(tags,options):
+     """
+     Internal function used to include/exclude specific tags from the exif_tags
+     dict.
+     """
+
+     if options is None:
+         return tags
+     if options.tags_to_include is None and options.tags_to_exclude is None:
+         return tags
+     if options.tags_to_include is not None:
+         assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+         tags_to_return = {}
+         for tag_name in tags.keys():
+             if tag_name in options.tags_to_include:
+                 tags_to_return[tag_name] = tags[tag_name]
+         return tags_to_return
+     if options.tags_to_exclude is not None:
+         assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+         tags_to_return = {}
+         for tag_name in tags.keys():
+             if tag_name not in options.tags_to_exclude:
+                 tags_to_return[tag_name] = tags[tag_name]
+         return tags_to_return
+
+
  def read_exif_tags_for_image(file_path,options=None):
      """
      Get relevant fields from EXIF data for an image
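
A toy illustration (not part of the diff) of what the new _filter_tags helper and the tags_to_include/tags_to_exclude options do; the tag dict below is invented:

    options = ReadExifOptions()
    options.tags_to_include = ['Make','Model']

    tags = {'Make':'ACME','Model':'TrailCam 3000','MakerNote':b'\x00\x01'}  # made-up EXIF dict
    print(_filter_tags(tags,options))   # keeps only 'Make' and 'Model'

    options = ReadExifOptions()
    options.tags_to_exclude = ['MakerNote']
    print(_filter_tags(tags,options))   # drops only 'MakerNote'
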
@@ -227,8 +263,8 @@ def read_exif_tags_for_image(file_path,options=None):
              result['status'] = 'empty_read'
          else:
              result['status'] = 'success'
-             result['tags'] = exif_tags
-
+             result['tags'] = _filter_tags(exif_tags,options)
+
          return result

      elif options.processing_library == 'exiftool':
@@ -283,9 +319,12 @@ def read_exif_tags_for_image(file_path,options=None):
              print('Ignoring tag with type {}'.format(field_type))
              continue

-         field_tag = field_name_type_tokens[1].strip()
-
-         tag = [field_type,field_tag,field_value]
+         field_name = field_name_type_tokens[1].strip()
+         if options.tags_to_exclude is not None and field_name in options.tags_to_exclude:
+             continue
+         if options.tags_to_include is not None and field_name not in options.tags_to_include:
+             continue
+         tag = [field_type,field_name,field_value]

          exif_tags.append(tag)

@@ -350,20 +389,22 @@ def populate_exif_data(im, image_base, options=None):
  # ...populate_exif_data()


- def create_image_objects(image_files):
+ def create_image_objects(image_files,recursive=True):
      """
      Create empty image objects for every image in [image_files], which can be a
      list of relative paths (which will get stored without processing, so the base
      path doesn't matter here), or a folder name.

      Returns a list of dicts with field 'file_name' (a relative path).
+
+     "recursive" is ignored if "image_files" is a list.
      """

      # Enumerate *relative* paths
      if isinstance(image_files,str):
          print('Enumerating image files in {}'.format(image_files))
          assert os.path.isdir(image_files), 'Invalid image folder {}'.format(image_files)
-         image_files = enumerate_files(image_files)
+         image_files = enumerate_files(image_files,recursive=recursive)

      images = []
      for fn in image_files:
@@ -499,7 +540,7 @@ def is_executable(name):
      return which(name) is not None


- def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None):
+ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None,recursive=True):
      """
      Read EXIF data for all images in input_folder.

@@ -516,6 +557,12 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
      if options is None:
          options = ReadExifOptions()

+     # Validate options
+     if options.tags_to_include is not None:
+         assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+     if options.tags_to_exclude is not None:
+         assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+
      if input_folder is None:
          input_folder = ''
      if len(input_folder) > 0:
@@ -542,7 +589,7 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
      assert is_executable(options.exiftool_command_name), 'exiftool not available'

      if filenames is None:
-         images = create_image_objects(input_folder)
+         images = create_image_objects(input_folder,recursive=recursive)
      else:
          assert isinstance(filenames,list)
          images = create_image_objects(filenames)
@@ -567,14 +614,16 @@ if False:

      #%%

-     input_folder = os.path.expanduser('~/data/KRU-test')
-     output_file = os.path.expanduser('~/data/test-exif.json')
+     input_folder = r'C:\temp\md-name-testing'
+     output_file = None # r'C:\temp\md-name-testing\exif.json'
      options = ReadExifOptions()
      options.verbose = False
      options.n_workers = 10
      options.use_threads = False
      options.processing_library = 'pil'
      # options.processing_library = 'exiftool'
+     options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+     # options.tags_to_exclude = ['MakerNote']

      results = read_exif_from_folder(input_folder,output_file,options)

data_management/remap_coco_categories.py (new file)

@@ -0,0 +1,84 @@
+ ########
+ #
+ # remap_coco_categories.py
+ #
+ # Given a COCO-formatted dataset, remap the categories to a new mapping.
+ #
+ ########
+
+ #%% Imports and constants
+
+ import os
+ import json
+
+ from copy import deepcopy
+
+
+ #%% Main function
+
+ def remap_coco_categories(input_data,
+                           output_category_name_to_id,
+                           input_category_name_to_output_category_name,
+                           output_file=None):
+     """
+     Given a COCO-formatted dataset, remap the categories to a new categories mapping, optionally
+     writing the results to a new file.
+
+     output_category_name_to_id is a dict mapping strings to ints.
+
+     input_category_name_to_output_category_name is a dict mapping strings to strings.
+
+     [input_data] can be a COCO-formatted dict or a filename. If it's a dict, it will be copied,
+     not modified in place.
+     """
+
+     if isinstance(input_data,str):
+         assert os.path.isfile(input_data), "Can't find file {}".format(input_data)
+         with open(input_data,'r') as f:
+             input_data = json.load(f)
+         assert isinstance(input_data,dict), 'Illegal COCO input data'
+     else:
+         assert isinstance(input_data,dict), 'Illegal COCO input data'
+         input_data = deepcopy(input_data)
+
+     # It's safe to modify in-place now
+     output_data = input_data
+
+     # Read input name --> ID mapping
+     input_category_name_to_input_category_id = {}
+     for c in input_data['categories']:
+         input_category_name_to_input_category_id[c['name']] = c['id']
+
+     # Map input IDs --> output IDs
+     input_category_id_to_output_category_id = {}
+     for input_name in input_category_name_to_output_category_name.keys():
+         output_name = input_category_name_to_output_category_name[input_name]
+         assert output_name in output_category_name_to_id, \
+             'No output ID for {} --> {}'.format(input_name,output_name)
+         input_id = input_category_name_to_input_category_id[input_name]
+         output_id = output_category_name_to_id[output_name]
+         input_category_id_to_output_category_id[input_id] = output_id
+
+     # Map annotations
+     for ann in output_data['annotations']:
+         assert ann['category_id'] in input_category_id_to_output_category_id, \
+             'Unrecognized category ID {}'.format(ann['category_id'])
+         ann['category_id'] = input_category_id_to_output_category_id[ann['category_id']]
+
+     # Update the category list
+     output_categories = []
+     for output_name in output_category_name_to_id:
+         category = {'name':output_name,'id':output_category_name_to_id[output_name]}
+         output_categories.append(category)
+     output_data['categories'] = output_categories
+
+     if output_file is not None:
+         with open(output_file,'w') as f:
+             json.dump(output_data,f,indent=1)
+
+     return input_data
+
+
+ #%% Command-line driver
+
+ # TODO
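
A minimal usage sketch for the new remap_coco_categories module; the toy COCO dict and category names below are invented for illustration:

    from data_management.remap_coco_categories import remap_coco_categories

    coco_data = {
        'images':[{'id':0,'file_name':'img_0000.jpg'}],
        'annotations':[{'id':0,'image_id':0,'category_id':1,'bbox':[10,10,50,50]}],
        'categories':[{'id':1,'name':'red fox'},{'id':2,'name':'grey fox'}]
    }

    # Collapse both fox categories into a single 'fox' category
    remapped = remap_coco_categories(coco_data,
                                     output_category_name_to_id={'fox':1},
                                     input_category_name_to_output_category_name={'red fox':'fox','grey fox':'fox'},
                                     output_file=None)

    print(remapped['categories'])   # [{'name': 'fox', 'id': 1}]
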
data_management/resize_coco_dataset.py

@@ -26,8 +26,7 @@ from md_visualization.visualization_utils import \
  def resize_coco_dataset(input_folder,input_filename,
                          output_folder,output_filename,
                          target_size=(-1,-1),
-                         correct_size_image_handling='copy',
-                         right_edge_quantization_threshold=None):
+                         correct_size_image_handling='copy'):
      """
      Given a COCO-formatted dataset (images in input_folder, data in input_filename), resize
      all the images to a target size (in output_folder) and scale bounding boxes accordingly
@@ -36,7 +35,7 @@ def resize_coco_dataset(input_folder,input_filename,
      target_size should be a tuple/list of ints, length 2. If either dimension is -1, aspect ratio
      will be preserved. If both dimensions are -1, this means "keep the original size". If
      both dimensions are -1 and correct_size_image_handling is copy, this function is basically
-     a no-op, although you might still use it for right_edge_quantization_threshold.
+     a no-op.

      correct_size_image_handling can be 'copy' (in which case the original image is just copied
      to the output folder) or 'rewrite' (in which case the image is opened via PIL and re-written,
@@ -44,12 +43,6 @@ def resize_coco_dataset(input_folder,input_filename,
      you're superstitious about biases coming from images in a training set being written
      by different image encoders.

-     right_edge_quantization_threshold is an off-by-default hack to adjust large datasets where
-     boxes that really should be running off the right side of the image only extend like 99%
-     of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-     within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-     0.02) of the right edge of the image, it will be extended to the far right edge.
-
      Returns the COCO database with resized images.
      """

@@ -126,15 +119,6 @@
              bbox[2] * width_scale,
              bbox[3] * height_scale]

-         # Do we need to quantize this box?
-         if right_edge_quantization_threshold is not None and \
-            right_edge_quantization_threshold > 0:
-             bbox_right_edge_abs = bbox[0] + bbox[2]
-             bbox_right_edge_norm = bbox_right_edge_abs / output_w
-             bbox_right_edge_distance = (1.0 - bbox_right_edge_norm)
-             if bbox_right_edge_distance < right_edge_quantization_threshold:
-                 bbox[2] = output_w - bbox[0]
-
          ann['bbox'] = bbox

      # ...if this annotation has a box
@@ -169,13 +153,10 @@ if False:

      correct_size_image_handling = 'rewrite'

-     right_edge_quantization_threshold = 0.015
-
      resize_coco_dataset(input_folder,input_filename,
                          output_folder,output_filename,
                          target_size=target_size,
-                         correct_size_image_handling=correct_size_image_handling,
-                         right_edge_quantization_threshold=right_edge_quantization_threshold)
+                         correct_size_image_handling=correct_size_image_handling)


  #%% Preview