megadetector-5.0.6-py3-none-any.whl → megadetector-5.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.
Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
taxonomy_mapping/species_lookup.py

@@ -36,14 +36,23 @@ taxonomy_urls = {
  }

  files_to_unzip = {
- 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+ # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
+ # 12.2023, this is no longer the case.
+ # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+ 'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
  'iNaturalist': ['taxa.csv']
  }

  # As of 2020.05.12:
  #
  # GBIF: ~777MB zipped, ~1.6GB taxonomy
- # iNat: ~2.2GB zipped, ~51MB taxonomy
+ # iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
+
+ # As of 2023.12.29:
+ #
+ # GBIF: ~948MB zipped, ~2.2GB taxonomy
+ # iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
+

  os.makedirs(taxonomy_download_dir, exist_ok=True)
  for taxonomy_name in taxonomy_urls:
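Note: the GBIF entry now expects 'Taxon.tsv' and 'VernacularName.tsv' at the root of the backbone zipfile rather than under a 'backbone/' folder. A minimal sketch of how one could check which layout a downloaded archive actually uses before unzipping; the filename 'backbone.zip' is hypothetical and this is not code from the package:

import zipfile

expected_members = ['Taxon.tsv', 'VernacularName.tsv']

with zipfile.ZipFile('backbone.zip') as z:
    names = set(z.namelist())
    for member in expected_members:
        # Pre-12.2023 GBIF releases nested these files under 'backbone/'
        if member in names:
            print(f'{member}: found at archive root')
        elif 'backbone/' + member in names:
            print(f'{member}: found under backbone/ (older layout)')
        else:
            print(f'{member}: missing from archive')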
@@ -99,15 +108,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id

+
  ## Load serialized taxonomy info if we've already saved it

  if (not force_init) and (inat_taxonomy is not None):
  print('Skipping taxonomy re-init')
  return

- if os.path.isfile(serialized_structures_file):
+ if (not force_init) and (os.path.isfile(serialized_structures_file)):

- print(f'Reading taxonomy data from {serialized_structures_file}')
+ print(f'De-serializing taxonomy data from {serialized_structures_file}')

  with open(serialized_structures_file, 'rb') as f:
  structures_to_serialize = pickle.load(f)
@@ -125,6 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_vernacular_to_taxon_id,\
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id = structures_to_serialize
+
  return


@@ -135,6 +146,9 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  for taxonomy_name, zip_url in taxonomy_urls.items():

  need_to_download = False
+
+ if force_init:
+ need_to_download = True

  # Don't download the zipfile if we've already unzipped what we need
  for fn in files_to_unzip[taxonomy_name]:
@@ -150,11 +164,11 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  zipfile_path = os.path.join(
  taxonomy_download_dir, zip_url.split('/')[-1])

- # Bypasses download if the file exists already
+ # Bypasses download if the file exists already (unless force_init is set)
  url_utils.download_url(
  zip_url, os.path.join(zipfile_path),
  progress_updater=url_utils.DownloadProgressBar(),
- verbose=True)
+ verbose=True,force_download=force_init)

  # Unzip the files we need
  files_we_need = files_to_unzip[taxonomy_name]
@@ -166,7 +180,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  target_file = os.path.join(
  taxonomy_download_dir, taxonomy_name, os.path.basename(fn))

- if os.path.isfile(target_file):
+ if (not force_init) and (os.path.isfile(target_file)):
  print(f'Bypassing unzip of {target_file}, file exists')
  else:
  os.makedirs(os.path.basename(target_file),exist_ok=True)
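Note: the three hunks above thread force_init through the existence checks, so an already-downloaded zipfile or an already-extracted file no longer short-circuits a forced refresh. The general "skip if present, unless forced" pattern is sketched below with urllib as a stand-in; download_if_needed is a hypothetical helper, not the url_utils.download_url implementation:

import os
import urllib.request

def download_if_needed(url, target_file, force=False):
    # Skip the download if the file is already on disk, unless the caller
    # explicitly asks for a re-download (mirroring force_init above).
    if os.path.isfile(target_file) and not force:
        print('Bypassing download of {}, file exists'.format(target_file))
        return target_file
    os.makedirs(os.path.dirname(target_file) or '.', exist_ok=True)
    urllib.request.urlretrieve(url, target_file)
    return target_file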
@@ -185,13 +199,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  # name file

  # Load iNat taxonomy
- inat_taxonomy = pd.read_csv(os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv'))
+ inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
+ print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
+ inat_taxonomy = pd.read_csv(inat_taxonomy_file)
  inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
  inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()

  # Load GBIF taxonomy
- gbif_taxonomy = pd.read_csv(os.path.join(
- taxonomy_download_dir, 'GBIF', 'Taxon.tsv'), sep='\t')
+ gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
+ print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
+ gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
  gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
  gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()

@@ -249,7 +266,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:

  # Build iNat dictionaries

- # row = inat_taxonomy.iloc[0]
+ print('Building lookup dictionaries for iNat taxonomy')
+
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):

  taxon_id = row['taxonID']
@@ -267,6 +285,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:

  # Build GBIF dictionaries

+ print('Building lookup dictionaries for GBIF taxonomy')
+
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):

  taxon_id = row['taxonID']
@@ -320,13 +340,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_scientific_to_taxon_id
  ]

- print('Serializing...', end='')
+ print('Serializing to {}...'.format(serialized_structures_file), end='')
  if not os.path.isfile(serialized_structures_file):
  with open(serialized_structures_file, 'wb') as p:
  pickle.dump(structures_to_serialize, p)
  print(' done')

- # ...def initialize_taxonomy_lookup()
+ # ...def initialize_taxonomy_lookup(...)


  def get_scientific_name_from_row(r):
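Note: the serialization change just makes the cache location visible in the log; the underlying pattern is a pickle cache that is read when present and written otherwise. A compact sketch of that load-or-build pattern (load_or_build and build_structures are hypothetical names, not functions in species_lookup.py):

import os
import pickle

def load_or_build(cache_file, build_structures, force=False):
    # Re-use the serialized structures if they exist and we're not forcing
    # a rebuild, mirroring the force_init logic above.
    if (not force) and os.path.isfile(cache_file):
        print('De-serializing taxonomy data from {}'.format(cache_file))
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    structures = build_structures()
    print('Serializing to {}...'.format(cache_file), end='')
    with open(cache_file, 'wb') as f:
        pickle.dump(structures, f)
    print(' done')
    return structures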
taxonomy_mapping/taxonomy_csv_checker.py

@@ -45,7 +45,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_taxon_level_errors = 0
  num_scientific_name_errors = 0

- for i, row in taxonomy_df.iterrows():
+ for i_row, row in taxonomy_df.iterrows():

  ds = row['dataset_name']
  ds_label = row['query']
@@ -81,14 +81,14 @@ def check_taxonomy_csv(csv_path: str) -> None:
  node.add_id(id_source, int(taxon_id)) # np.int64 -> int
  if j == 0:
  if level != taxon_level:
- print(f'row: {i}, {ds}, {ds_label}')
+ print(f'row: {i_row}, {ds}, {ds_label}')
  print(f'- taxonomy_level column: {level}, '
  f'level from taxonomy_string: {taxon_level}')
  print()
  num_taxon_level_errors += 1

  if scientific_name != taxon_name:
- print(f'row: {i}, {ds}, {ds_label}')
+ print(f'row: {i_row}, {ds}, {ds_label}')
  print(f'- scientific_name column: {scientific_name}, '
  f'name from taxonomy_string: {taxon_name}')
  print()
@@ -97,7 +97,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  taxon_child = node

  # ...for each row in the taxonomy file
-
+
  assert nx.is_directed_acyclic_graph(graph)

  for node in graph.nodes:
@@ -123,6 +123,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
  except AssertionError as e:
  print(f'At least one node has unresolved ambiguous parents: {e}')

+ print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
+
  print('num taxon level errors:', num_taxon_level_errors)
  print('num scientific name errors:', num_scientific_name_errors)

@@ -154,4 +156,4 @@ if False:
  import os
  csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
  check_taxonomy_csv(csv_path)
-
+
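Note: the checker changes rename the loop variable and add a row-count summary; the core logic remains a single pass over the CSV with iterrows, counting mismatches between the explicit columns and the values recovered from the taxonomy string. A reduced sketch under those assumptions; parse_taxonomy_string and the exact column names are stand-ins inferred from the prints above:

import pandas as pd

def check_rows(csv_path, parse_taxonomy_string):
    taxonomy_df = pd.read_csv(csv_path)
    num_taxon_level_errors = 0
    num_scientific_name_errors = 0
    for i_row, row in taxonomy_df.iterrows():
        # parse_taxonomy_string is assumed to return (level, scientific name)
        taxon_level, taxon_name = parse_taxonomy_string(row['taxonomy_string'])
        if row['taxonomy_level'] != taxon_level:
            print(f'row: {i_row}: taxonomy level mismatch')
            num_taxon_level_errors += 1
        if row['scientific_name'] != taxon_name:
            print(f'row: {i_row}: scientific name mismatch')
            num_scientific_name_errors += 1
    print('Processed {} rows from {}'.format(len(taxonomy_df), csv_path))
    print('num taxon level errors:', num_taxon_level_errors)
    print('num scientific name errors:', num_scientific_name_errors)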
md_visualization/visualize_megadb.py (deleted)

@@ -1,183 +0,0 @@
- ########
- #
- # visualize_megadb.py
- #
- # Create visual previews of images/sequences in MegaDB.
- #
- ########
-
- #%% Imports
-
- import argparse
- import json
- import os
- import sys
- from random import shuffle
- from multiprocessing.pool import ThreadPool
- from functools import partial
- import io
-
- from tqdm import tqdm
-
- from data_management.megadb.megadb_utils import MegadbUtils
- from md_utils.write_html_image_list import write_html_image_list
- from md_visualization import visualization_utils as vis_utils
-
-
- #%% Support functions
-
- def render_image_info(rendering, args):
-
-     storage_client = rendering['storage_client']
-     image_obj = io.BytesIO()
-
-     try:
-         storage_client.download_blob(rendering['blob_path']).readinto(image_obj)
-     except Exception as e:
-         print(f'Image not found in blob storage: {rendering["blob_path"]}')
-         print(e)
-         return
-
-     # resize is for displaying them more quickly
-     image = vis_utils.resize_image(
-         vis_utils.open_image(image_obj), args.output_image_width)
-     vis_utils.render_megadb_bounding_boxes(rendering['bbox'], image)
-
-     annotated_img_name = rendering['annotated_img_name']
-     annotated_img_path = os.path.join(
-         args.output_dir, 'rendered_images', annotated_img_name)
-     image.save(annotated_img_path)
-
-
- def visualize_sequences(datasets_table, sequences, args):
-
-     num_images = 0
-
-     images_html = []
-     rendering_info = []
-
-     for seq in sequences:
-         if 'images' not in seq:
-             continue
-
-         # dataset and seq_id are required fields
-         dataset_name = seq['dataset']
-         seq_id = seq['seq_id']
-
-         # sort the images in the sequence
-
-         images_in_seq = sorted(seq['images'], key=lambda x: x['frame_num']) if len(seq['images']) > 1 else seq['images']
-
-         for im in images_in_seq:
-             if args.trim_to_images_bboxes_labeled and 'bbox' not in im:
-                 continue
-
-             num_images += 1
-
-             blob_path = MegadbUtils.get_full_path(
-                 datasets_table, dataset_name, im['file'])
-             frame_num = im.get('frame_num', -1)
-
-             # if no class label on the image, show class label on the sequence
-             im_class = im.get('class', None)
-             if im_class is None:
-                 im_class = seq.get('class', [])
-
-             rendering = {}
-             rendering['storage_client'] = MegadbUtils.get_storage_client(
-                 datasets_table, dataset_name)
-             rendering['blob_path'] = blob_path
-             rendering['bbox'] = im.get('bbox', [])
-
-             annotated_img_name = 'anno_' + blob_path.replace('/', args.pathsep_replacement).replace('\\', args.pathsep_replacement)
-             rendering['annotated_img_name'] = annotated_img_name
-
-             rendering_info.append(rendering)
-
-             images_html.append({
-                 'filename': 'rendered_images/{}'.format(annotated_img_name),
-                 'title': 'Seq ID: {}. Frame number: {}<br/> Image file: {}<br/> number of boxes: {}, image class labels: {}'.format(seq_id, frame_num, blob_path, len(rendering['bbox']), im_class),
-                 'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
-             })
-
-         if num_images >= args.num_to_visualize:
-             print('num_images visualized is {}'.format(num_images))
-             break
-
-     # pool = ThreadPool()
-     render_image_info_partial = partial(render_image_info, args=args)
-     # print('len of rendering_info', len(rendering_info))
-     # tqdm(pool.imap_unordered(render_image_info_partial, rendering_info), total=len(rendering_info))
-
-     for rendering in tqdm(rendering_info):
-         render_image_info_partial(rendering)
-
-     print('Making HTML...')
-
-     html_path = os.path.join(args.output_dir, 'index.html')
-     # options = write_html_image_list()
-     # options['headerHtml']
-     write_html_image_list(
-         filename=html_path,
-         images=images_html
-     )
-
-
- #%% Command-line driver
-
- def main():
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument(
-         'megadb_entries', type=str,
-         help='Path to a json list of MegaDB entries')
-     parser.add_argument(
-         'output_dir', action='store', type=str,
-         help='Output directory for html and rendered images')
-     parser.add_argument(
-         '--trim_to_images_bboxes_labeled', action='store_true',
-         help='Only include images that have been sent for bbox labeling (but '
-              'may be actually empty). Turn this on if QAing annotations.')
-     parser.add_argument(
-         '--num_to_visualize', action='store', type=int, default=200,
-         help='Number of images to visualize (all comformant images in a '
-              'sequence are shown, so may be a few more than specified). '
-              'Sequences are shuffled. Default: 200. Use -1 to visualize all.')
-     parser.add_argument(
-         '--pathsep_replacement', action='store', type=str, default='~',
-         help='Replace path separators in relative filenames with another '
-              'character (default ~)')
-     parser.add_argument(
-         '-w', '--output_image_width', type=int, default=700,
-         help='an integer indicating the desired width in pixels of the output '
-              'annotated images. Use -1 to not resize.')
-
-     if len(sys.argv[1:]) == 0:
-         parser.print_help()
-         parser.exit()
-
-     args = parser.parse_args()
-
-     assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ
-
-     os.makedirs(args.output_dir, exist_ok=True)
-     os.makedirs(os.path.join(args.output_dir, 'rendered_images'))
-
-     print('Connecting to MegaDB to get the datasets table...')
-     megadb_utils = MegadbUtils()
-     datasets_table = megadb_utils.get_datasets_table()
-
-     print('Loading the MegaDB entries...')
-     with open(args.megadb_entries) as f:
-         sequences = json.load(f)
-     print('Total number of sequences: {}'.format(len(sequences)))
-
-     # print('Checking that the MegaDB entries conform to the schema...')
-     # sequences_schema_check.sequences_schema_check(sequences)
-
-     shuffle(sequences)
-     visualize_sequences(datasets_table, sequences, args)
-
-
- if __name__ == '__main__':
-     main()