megadetector 5.0.6__py3-none-any.whl → 5.0.7__py3-none-any.whl
This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of megadetector might be problematic.
- api/batch_processing/data_preparation/manage_local_batch.py +278 -197
- api/batch_processing/data_preparation/manage_video_batch.py +7 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/compare_batch_results.py +110 -60
- api/batch_processing/postprocessing/load_api_results.py +55 -69
- api/batch_processing/postprocessing/md_to_labelme.py +1 -0
- api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- classification/prepare_classification_script.py +191 -191
- data_management/coco_to_yolo.py +65 -44
- data_management/databases/integrity_check_json_db.py +7 -5
- data_management/generate_crops_from_cct.py +1 -1
- data_management/importers/animl_results_to_md_results.py +2 -2
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/importers/zamba_results_to_md_results.py +2 -2
- data_management/labelme_to_coco.py +34 -6
- data_management/labelme_to_yolo.py +1 -1
- data_management/lila/create_lila_blank_set.py +474 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +1 -1
- data_management/lila/download_lila_subset.py +46 -21
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +16 -10
- data_management/lila/lila_common.py +14 -11
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/resize_coco_dataset.py +12 -10
- data_management/yolo_output_to_md_output.py +40 -13
- data_management/yolo_to_coco.py +34 -21
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +1 -1
- detection/run_detector.py +73 -18
- detection/run_detector_batch.py +104 -24
- detection/run_inference_with_yolov5_val.py +127 -26
- detection/run_tiled_inference.py +153 -43
- detection/video_utils.py +3 -1
- md_utils/ct_utils.py +79 -3
- md_utils/md_tests.py +253 -15
- md_utils/path_utils.py +129 -24
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +0 -2
- md_utils/write_html_image_list.py +1 -0
- md_visualization/visualization_utils.py +17 -2
- md_visualization/visualize_db.py +8 -0
- md_visualization/visualize_detector_output.py +185 -104
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
taxonomy_mapping/map_new_lila_datasets.py

```diff
@@ -15,15 +15,25 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/
+output_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
 
 datasets_to_map = [
-
-    # 'Channel Islands Camera Traps'
-    'ENA24'
+    'Trail Camera Images of New Zealand Animals'
     ]
 
 
+#%% Initialize taxonomic lookup
+
+from taxonomy_mapping.species_lookup import (
+    initialize_taxonomy_lookup,
+    get_preferred_taxonomic_match)
+
+# from taxonomy_mapping.species_lookup import (
+#     get_taxonomic_info, print_taxonomy_matche)
+
+initialize_taxonomy_lookup(force_init=False)
+
+
 #%% Read the list of datasets
 
 with open(input_lila_category_list_file,'r') as f:
@@ -57,46 +67,14 @@ for dataset_name in datasets_to_map:
 print('Need to create {} mappings'.format(len(category_mappings)))
 
 
-#%% Initialize taxonomic lookup
-
-from taxonomy_mapping.species_lookup import (
-    initialize_taxonomy_lookup,
-    get_preferred_taxonomic_match)
-
-# from taxonomy_mapping.species_lookup import (
-#     get_taxonomic_info, print_taxonomy_matche)
-
-initialize_taxonomy_lookup()
-
-
-#%% Manual lookup
-
-if False:
-
-    #%%
-
-    # q = 'white-throated monkey'
-    q = 'cingulata'
-    taxonomy_preference = 'inat'
-    m = get_preferred_taxonomic_match(q,taxonomy_preference)
-
-    if m is None:
-        print('No match')
-    else:
-        if m.source != taxonomy_preference:
-            print('\n*** non-preferred match ***\n')
-            # raise ValueError('')
-        print(m.source)
-        print(m.taxonomy_string)
-        import clipboard; clipboard.copy(m.taxonomy_string)
-
-
 #%% Match every query against our taxonomies
 
 output_rows = []
 
 taxonomy_preference = 'inat'
 
+allow_non_preferred_matches = True
+
 # mapping_string = category_mappings[1]; print(mapping_string)
 for mapping_string in category_mappings:
 
@@ -108,7 +86,7 @@ for mapping_string in category_mappings:
 
     taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
 
-    if taxonomic_match.source == taxonomy_preference:
+    if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:
 
         output_row = {
             'dataset_name': dataset_name,
@@ -148,3 +126,29 @@ output_df = pd.DataFrame(data=output_rows, columns=[
     'dataset_name', 'query', 'source', 'taxonomy_level',
     'scientific_name', 'common_name', 'taxonomy_string'])
 output_df.to_csv(output_file, index=None, header=True)
+
+
+#%% Manual lookup
+
+if False:
+
+    #%%
+
+    # q = 'white-throated monkey'
+    # q = 'cingulata'
+    # q = 'notamacropus'
+    q = 'porzana'
+    taxonomy_preference = 'inat'
+    m = get_preferred_taxonomic_match(q,taxonomy_preference)
+    # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+
+    if m is None:
+        print('No match')
+    else:
+        if m.source != taxonomy_preference:
+            print('\n*** non-preferred match ***\n')
+            # raise ValueError('')
+        print(m.source)
+        print(m.taxonomy_string)
+        # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+        import clipboard; clipboard.copy(m.taxonomy_string)
```
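The net effect of these map_new_lila_datasets.py changes is a looser match-acceptance policy: a query's match is now kept either when it comes from the preferred taxonomy or, when allow_non_preferred_matches is set, regardless of source. Below is a minimal sketch of that policy using only names that appear in the hunks above; the accept_match helper itself is illustrative and not part of the package.

```python
from taxonomy_mapping.species_lookup import get_preferred_taxonomic_match

taxonomy_preference = 'inat'
allow_non_preferred_matches = True

def accept_match(query):
    """Illustrative helper: return an acceptable taxonomic match for query, or None."""
    m = get_preferred_taxonomic_match(query, taxonomy_preference=taxonomy_preference)
    if m is None:
        return None
    # New policy in 5.0.7: keep the match if it comes from the preferred
    # taxonomy, or unconditionally when non-preferred matches are allowed.
    if (m.source == taxonomy_preference) or allow_non_preferred_matches:
        return m
    return None
```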
taxonomy_mapping/prepare_lila_taxonomy_release.py

```diff
@@ -13,8 +13,9 @@ import os
 import json
 import pandas as pd
 
-lila_taxonomy_file =
-release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.
+lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
+release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
+# import clipboard; clipboard.copy(release_taxonomy_file)
 
 # Created by get_lila_category_list.py... contains counts for each category
 lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
@@ -129,3 +130,5 @@ for i_row,row in df.iterrows():
 
 df = df.drop('source',axis=1)
 df.to_csv(release_taxonomy_file,header=True,index=False)
+
+print('Wrote final output to {}'.format(release_taxonomy_file))
```
taxonomy_mapping/preview_lila_taxonomy.py

```diff
@@ -15,11 +15,10 @@ from tqdm import tqdm
 import os
 import pandas as pd
 
-# lila_taxonomy_file = r"
-lila_taxonomy_file =
-# lila_taxonomy_file = r"G:\temp\lila\lila_additions_2022.06.29.csv"
+# lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
 
-preview_base =
+preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
 html_output_file = os.path.join(preview_base,'index.html')
 
@@ -172,15 +171,14 @@ for i_row,row in tqdm(df.iterrows(),total=len(df)):
 
 print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
 
+# Optionally re-write
 if False:
     df.to_csv(lila_taxonomy_file,header=True,index=False)
 
 
 #%% List null mappings
 
-#
-# These should all be things like "unidentified" and "fire"
-#
+# These should all be things like "empty", "unidentified", "fire", "car", etc.
 
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
@@ -393,20 +391,20 @@ remapped_queries = {'papio':'papio+baboon',
 
 import os
 from taxonomy_mapping import retrieve_sample_image
+
 scientific_name_to_paths = {}
 image_base = os.path.join(preview_base,'images')
 images_per_query = 15
 min_valid_images_per_query = 3
 min_valid_image_size = 3000
 
+# TODO: trivially prallelizable
+#
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
 
     s = row['scientific_name']
 
-    # if s != 'mirafra':
-    #    continue
-
     if (not isinstance(s,str)) or (len(s)==0):
         continue
 
@@ -416,17 +414,17 @@ for i_row,row in df.iterrows():
         query = remapped_queries[query]
 
     query_folder = os.path.join(image_base,query)
+    os.makedirs(query_folder,exist_ok=True)
 
     # Check whether we already have enough images for this query
-
-
-
-
-
-
-
-
-
+    image_files = os.listdir(query_folder)
+    image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+    sizes = [os.path.getsize(p) for p in image_fullpaths]
+    sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
+    if len(sizes_above_threshold) > min_valid_images_per_query:
+        print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
+        continue
+
     # Check whether we've already run this query for a previous row
     if query in scientific_name_to_paths:
         continue
@@ -448,14 +446,16 @@ from md_utils import path_utils
 all_images = path_utils.recursive_file_list(image_base,False)
 
 for fn in tqdm(all_images):
-    if fn.endswith('.jpeg'):
+    if fn.lower().endswith('.jpeg'):
         new_fn = fn[0:-5] + '.jpg'
-        # print('Renaming {} to {}'.format(fn,new_fn))
         os.rename(fn, new_fn)
 
 
 #%% Choose representative images for each scientific name
 
+# Specifically, sort by size, and take the largest unique sizes. Very small files tend
+# to be bogus thumbnails, etc.
+
 max_images_per_query = 4
 scientific_name_to_preferred_images = {}
 
@@ -506,7 +506,7 @@ for images in scientific_name_to_preferred_images.values():
 print('Using a total of {} images'.format(len(used_images)))
 used_images_set = set(used_images)
 
-import path_utils
+from md_utils import path_utils
 all_images = path_utils.recursive_file_list(image_base,False)
 
 unused_images = []
@@ -523,7 +523,7 @@ for fn in tqdm(unused_images):
 
 #%% Produce HTML preview
 
-with open(html_output_file, 'w') as f:
+with open(html_output_file, 'w', encoding='utf-8') as f:
 
     f.write('<html><head></head><body>\n')
 
@@ -555,10 +555,11 @@ with open(html_output_file, 'w') as f:
     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
 
     if isinstance(row.scientific_name,str):
-
+        output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
             row.dataset_name, row.query,
             row.taxonomy_level, row.scientific_name, common_name_string,
-            row.common_name)
+            row.common_name)
+        f.write(output_string)
     else:
         f.write('{}: <b><u>{}</u></b> unmapped'.format(row.dataset_name,row.query))
 
@@ -586,6 +587,5 @@ with open(html_output_file, 'w') as f:
 
 #%% Open HTML preview
 
-from md_utils.path_utils import open_file
+from md_utils.path_utils import open_file
 open_file(html_output_file)
-
```
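The most substantive change in preview_lila_taxonomy.py is the cache check added before each image query: files smaller than min_valid_image_size are treated as bogus thumbnails and don't count toward the per-query quota. A standalone sketch of that check follows; the helper name and constants are illustrative, mirroring the values in the hunk above.

```python
import os

# Illustrative thresholds, mirroring the values in the diff above
MIN_VALID_IMAGE_SIZE = 3000      # bytes; smaller files are likely bogus thumbnails
MIN_VALID_IMAGES_PER_QUERY = 3   # how many "real" images we want before skipping

def have_enough_images(query_folder):
    """Return True if query_folder already holds enough sufficiently large images."""
    if not os.path.isdir(query_folder):
        return False
    sizes = [os.path.getsize(os.path.join(query_folder, fn))
             for fn in os.listdir(query_folder)]
    valid = [s for s in sizes if s > MIN_VALID_IMAGE_SIZE]
    return len(valid) > MIN_VALID_IMAGES_PER_QUERY
```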
taxonomy_mapping/species_lookup.py

```diff
@@ -36,14 +36,23 @@ taxonomy_urls = {
 }
 
 files_to_unzip = {
-
+    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
+    # 12.2023, this is no longer the case.
+    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+    'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
     'iNaturalist': ['taxa.csv']
 }
 
 # As of 2020.05.12:
 #
 # GBIF: ~777MB zipped, ~1.6GB taxonomy
-# iNat: ~2.2GB zipped, ~51MB taxonomy
+# iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
+
+# As of 2023.12.29:
+#
+# GBIF: ~948MB zipped, ~2.2GB taxonomy
+# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
+
 
 os.makedirs(taxonomy_download_dir, exist_ok=True)
 for taxonomy_name in taxonomy_urls:
@@ -99,15 +108,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_scientific,\
         gbif_scientific_to_taxon_id
 
+
     ## Load serialized taxonomy info if we've already saved it
 
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
         return
 
-    if os.path.isfile(serialized_structures_file):
+    if (not force_init) and (os.path.isfile(serialized_structures_file)):
 
-        print(f'
+        print(f'De-serializing taxonomy data from {serialized_structures_file}')
 
         with open(serialized_structures_file, 'rb') as f:
             structures_to_serialize = pickle.load(f)
@@ -125,6 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
             gbif_vernacular_to_taxon_id,\
             gbif_taxon_id_to_scientific,\
             gbif_scientific_to_taxon_id = structures_to_serialize
+
         return
 
 
@@ -135,6 +146,9 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     for taxonomy_name, zip_url in taxonomy_urls.items():
 
         need_to_download = False
+
+        if force_init:
+            need_to_download = True
 
         # Don't download the zipfile if we've already unzipped what we need
         for fn in files_to_unzip[taxonomy_name]:
@@ -150,11 +164,11 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
            zipfile_path = os.path.join(
                taxonomy_download_dir, zip_url.split('/')[-1])
 
-           # Bypasses download if the file exists already
+           # Bypasses download if the file exists already (unless force_init is set)
            url_utils.download_url(
                zip_url, os.path.join(zipfile_path),
                progress_updater=url_utils.DownloadProgressBar(),
-               verbose=True)
+               verbose=True,force_download=force_init)
 
         # Unzip the files we need
         files_we_need = files_to_unzip[taxonomy_name]
@@ -166,7 +180,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
            target_file = os.path.join(
                taxonomy_download_dir, taxonomy_name, os.path.basename(fn))
 
-           if os.path.isfile(target_file):
+           if (not force_init) and (os.path.isfile(target_file)):
                print(f'Bypassing unzip of {target_file}, file exists')
            else:
                os.makedirs(os.path.basename(target_file),exist_ok=True)
@@ -185,13 +199,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
    # name file
 
    # Load iNat taxonomy
-
+    inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
+    print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
+    inat_taxonomy = pd.read_csv(inat_taxonomy_file)
    inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
    inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
 
    # Load GBIF taxonomy
-
-
+    gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
+    print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
+    gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
    gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
    gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
 
@@ -249,7 +266,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
    # Build iNat dictionaries
 
-
+    print('Building lookup dictionaries for iNat taxonomy')
+
    for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
        taxon_id = row['taxonID']
@@ -267,6 +285,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
    # Build GBIF dictionaries
 
+    print('Building lookup dictionaries for GBIF taxonomy')
+
    for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
        taxon_id = row['taxonID']
@@ -320,13 +340,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
        gbif_scientific_to_taxon_id
    ]
 
-   print('Serializing...', end='')
+    print('Serializing to {}...'.format(serialized_structures_file), end='')
    if not os.path.isfile(serialized_structures_file):
        with open(serialized_structures_file, 'wb') as p:
            pickle.dump(structures_to_serialize, p)
    print(' done')
 
-# ...def initialize_taxonomy_lookup()
+# ...def initialize_taxonomy_lookup(...)
 
 
 def get_scientific_name_from_row(r):
```
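Taken together, the species_lookup.py changes thread the new force_init argument through every caching layer: the pickled lookup dictionaries, the downloaded GBIF/iNat archives, and the unzipped Taxon.tsv / VernacularName.tsv / taxa.csv files. A minimal usage sketch, limited to the functions and attributes visible in this diff:

```python
from taxonomy_mapping.species_lookup import (
    initialize_taxonomy_lookup, get_preferred_taxonomic_match)

# Default behavior: reuse the serialized lookup tables and any previously
# downloaded/unzipped taxonomy files if they already exist on disk.
initialize_taxonomy_lookup(force_init=False)

# After a GBIF/iNat taxonomy release update, force a re-download, re-unzip,
# and rebuild of the lookup dictionaries instead:
# initialize_taxonomy_lookup(force_init=True)

m = get_preferred_taxonomic_match('porzana', taxonomy_preference='inat')
if m is None:
    print('No match')
else:
    print(m.source, m.scientific_name, m.taxonomy_string)
```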
taxonomy_mapping/taxonomy_csv_checker.py

```diff
@@ -45,7 +45,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
     num_taxon_level_errors = 0
     num_scientific_name_errors = 0
 
-    for
+    for i_row, row in taxonomy_df.iterrows():
 
         ds = row['dataset_name']
         ds_label = row['query']
@@ -81,14 +81,14 @@ def check_taxonomy_csv(csv_path: str) -> None:
             node.add_id(id_source, int(taxon_id))  # np.int64 -> int
             if j == 0:
                 if level != taxon_level:
-                    print(f'row: {
+                    print(f'row: {i_row}, {ds}, {ds_label}')
                     print(f'- taxonomy_level column: {level}, '
                           f'level from taxonomy_string: {taxon_level}')
                     print()
                     num_taxon_level_errors += 1
 
                 if scientific_name != taxon_name:
-                    print(f'row: {
+                    print(f'row: {i_row}, {ds}, {ds_label}')
                     print(f'- scientific_name column: {scientific_name}, '
                           f'name from taxonomy_string: {taxon_name}')
                     print()
@@ -97,7 +97,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
             taxon_child = node
 
     # ...for each row in the taxonomy file
-
+
     assert nx.is_directed_acyclic_graph(graph)
 
     for node in graph.nodes:
@@ -123,6 +123,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
     except AssertionError as e:
         print(f'At least one node has unresolved ambiguous parents: {e}')
 
+    print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
+
     print('num taxon level errors:', num_taxon_level_errors)
     print('num scientific name errors:', num_scientific_name_errors)
 
@@ -154,4 +156,4 @@ if False:
     import os
     csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
     check_taxonomy_csv(csv_path)
-
+
```
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE: file without changes
- {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt: file without changes