PyPI - megadetector - Versions diffs - 5.0.11__py3-none-any.whl → 5.0.13__py3-none-any.whl - Mend - Supply Chain Defender

megadetector 5.0.11 (py3-none-any.whl) → 5.0.13 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic. Click here for more details.

Files changed (203) hide show

megadetector/data_management/lila/create_links_to_md_results_files.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""
+create_links_to_md_results_files.py
+One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+"""
+#%% Imports and constants
+import os
+import pandas as pd
+# Dataset table to read; the MD results columns are (re)initialized to empty strings after loading
+input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+# Where the table is written once the MD results URLs have been filled in
+output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+# Local folder whose .zip file names are matched against each dataset's short name
+md_results_local_folder = r'g:\temp\lila-md-results'
+# Prefix that is prepended to each matching file name to form its URL
+md_base_url = 'https://lila.science/public/lila-md-results/'
+# File names are appended directly to the base URL, so it has to end with a slash
+assert md_base_url.endswith('/')
+# No RDE files for datasets with no location information
+datasets_without_location_info = ('ena24','missouri-camera-traps')
+# Columns that are blanked out after loading and then populated with URLs
+md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+# Set to True to check every generated URL with test_urls() before writing the output file
+validate_urls = False
+#%% Read input data
+df = pd.read_csv(input_csv_file)
+# Every MD results column starts out as an empty string, so cells that never
+# get a URL stay blank
+for results_column in md_results_column_names:
+    df[results_column] = ''
+#%% Find matching files locally, and create URLs
+# Only the .zip files in the local results folder are candidates for matching
+local_files = [fn for fn in os.listdir(md_results_local_folder) if fn.endswith('.zip')]
+# i_row = 0; row = df.iloc[i_row]
+for i_row,row in df.iterrows():
+    # Rows whose name is not a string are skipped
+    if not isinstance(row['name'],str):
+        continue
+    dataset_shortname = row['short_name']
+    # A results file is a candidate for this dataset if its name contains the short name
+    matching_files = [zip_fn for zip_fn in local_files if dataset_shortname in zip_fn]
+    if dataset_shortname in datasets_without_location_info:
+        # No RDE files for datasets with no location information, so we expect
+        # exactly one MDv5a file and one MDv5b file
+        assert len(matching_files) == 2
+        mdv5a_files = [zip_fn for zip_fn in matching_files if 'mdv5a' in zip_fn]
+        mdv5b_files = [zip_fn for zip_fn in matching_files if 'mdv5b' in zip_fn]
+        assert len(mdv5a_files) == 1
+        assert len(mdv5b_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+    elif dataset_shortname == 'snapshot-serengeti':
+        # Exclude single-season files for snapshot-serengeti; what remains should be
+        # one raw MDv4 file and one MDv4 RDE file
+        matching_files = [zip_fn for zip_fn in matching_files if '_S' not in zip_fn]
+        assert len(matching_files) == 2
+        assert all('mdv4' in zip_fn for zip_fn in matching_files)
+        rde_files = [zip_fn for zip_fn in matching_files if 'rde' in zip_fn]
+        raw_files = [zip_fn for zip_fn in matching_files if 'rde' not in zip_fn]
+        assert len(rde_files) == 1
+        assert len(raw_files) == 1
+        df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+        df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+    else:
+        # Every other dataset should have one raw MDv5a file, one raw MDv5b file,
+        # and one RDE file
+        assert len(matching_files) == 3
+        mdv5a_files = [zip_fn for zip_fn in matching_files if 'mdv5a' in zip_fn and 'rde' not in zip_fn]
+        mdv5b_files = [zip_fn for zip_fn in matching_files if 'mdv5b' in zip_fn and 'rde' not in zip_fn]
+        rde_files = [zip_fn for zip_fn in matching_files if 'rde' in zip_fn]
+        assert len(mdv5a_files) == 1
+        assert len(mdv5b_files) == 1
+        assert len(rde_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+        df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+# ...for each row
+#%% Validate URLs
+if validate_urls:
+    from megadetector.utils.url_utils import test_urls
+    urls = set()
+    for i_row,row in df.iterrows():
+        for column_name in md_results_column_names:
+            url = row[column_name]
+            # Cells that were never filled in are still empty strings
+            if len(url) == 0:
+                continue
+            # Each URL should be referenced by exactly one cell in the table
+            assert url not in urls
+            urls.add(url)
+    test_urls(urls,error_on_failure=True)
+    print('Validated {} URLs'.format(len(urls)))
+#%% Write new .csv file
+# Column names are written as a header row; the DataFrame index is left out
+df.to_csv(output_csv_file,index=False,header=True)

megadetector/data_management/lila/download_lila_subset.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""
+download_lila_subset.py
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+"""
+#%% Constants and imports
+import os
+import random
+from tqdm import tqdm
+from collections import defaultdict
+from megadetector.data_management.lila.lila_common import \
+    read_lila_all_images_file, is_empty, lila_base_urls
+# Every provider's base URL has to end with a slash; the base URL for the preferred
+# provider is later stripped from image URLs to form relative output paths
+for s in lila_base_urls.values():
+    assert s.endswith('/')
+# If any of these strings appear in the common name of a species, we'll download that image
+species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+# Folder handed to read_lila_all_images_file()
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+# Downloaded images are written beneath this folder, at the path that follows the base URL
+output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
+os.makedirs(output_dir,exist_ok=True)
+# Number of concurrent download threads
+n_download_threads = 20
+# Maximum number of URLs kept per dataset; set to None to keep every matching URL
+max_images_per_dataset = 10 # None
+# Selects both the 'url_<provider>' column to read and the entry in lila_base_urls to strip
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
+# Fixed seed, so the random.sample() subsampling of URLs is repeatable from run to run
+random.seed(0)
+#%% Download and open the giant table of image URLs and labels
+# Takes ~60 seconds to download, unzip, and open
+# The rows of this table are scanned below; the scan reads the 'common_name',
+# 'dataset_name', and 'url_<provider>' columns
+df = read_lila_all_images_file(metadata_dir)
+#%% Find all the images we want to download
+# Takes ~2 minutes
+# Maps each string in species_of_interest to the number of rows it matched
+common_name_to_count = defaultdict(int)
+# Maps each dataset name to the list of URLs for its matching rows
+ds_name_to_urls = defaultdict(list)
+def find_items(row):
+    """
+    Record [row] in the module-level tallies if it matches a species of interest.
+    Rows with an empty common name, or whose common name contains none of the
+    strings in species_of_interest, are ignored.  For a matching row, the count
+    for the first matching string in common_name_to_count is incremented, and the
+    row's URL for the preferred provider is appended to ds_name_to_urls under the
+    row's dataset name.  Returns None.
+    """
+    common_name = row['common_name']
+    if is_empty(common_name):
+        return
+    # This is the only bit of this file that's specific to a particular query.  In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
+    matched_species = next((s for s in species_of_interest if s in common_name),None)
+    if matched_species is None:
+        return
+    common_name_to_count[matched_species] += 1
+    ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
+# Enables the progress_apply() call used on the next line
+tqdm.pandas()
+_ = df.progress_apply(find_items,axis=1)
+# We have a list of URLs for each dataset, concatenate them into one flat list of URLs
+all_urls = [url for dataset_urls in ds_name_to_urls.values() for url in dataset_urls]
+print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))
+for common_name,n_matches in common_name_to_count.items():
+    print('{}: {}'.format(common_name,n_matches))
+# Keep an untouched copy of the per-dataset URL lists
+from copy import deepcopy
+ds_name_to_urls_raw = deepcopy(ds_name_to_urls)
+#%% Optionally trim to a fixed number of URLs per dataset
+# A limit of None means "keep every URL"
+if max_images_per_dataset is not None:
+    # ds_name = next(iter(ds_name_to_urls.keys()))
+    for ds_name in ds_name_to_urls:
+        dataset_urls = ds_name_to_urls[ds_name]
+        # Datasets at or under the limit are left untouched
+        if len(dataset_urls) > max_images_per_dataset:
+            ds_name_to_urls[ds_name] = random.sample(dataset_urls,max_images_per_dataset)
+#%% Choose target files for each URL
+from megadetector.data_management.lila.lila_common import lila_base_urls
+# We have a list of URLs per dataset, flatten that into a single sorted list of unique URLs
+urls_to_download = sorted({url for ds_urls in ds_name_to_urls.values() for url in ds_urls})
+# A URL might look like this:
+#
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
+# Maps each URL to the absolute local path it will be downloaded to
+url_to_target_file = {}
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    # Strip only the leading base URL; str.replace() would also remove any later
+    # occurrence of the same text inside the path
+    target_fn_relative = url[len(base_url):]
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
+#%% Download image files
+from megadetector.utils.url_utils import parallel_download_urls
+# Download every URL in url_to_target_file to its mapped local path, using
+# n_download_threads workers from a thread pool.
+# NOTE(review): overwrite=False presumably skips targets that already exist on disk, and
+# the shape of download_results is not visible here; confirm both against
+# parallel_download_urls.
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
+#%% Scrap
+# Never runs as part of the script; the cell below is for interactive use only
+if False:
+    pass
+    #%% Find all the reptiles on LILA
+    from megadetector.utils.ct_utils import sort_dictionary_by_value
+    is_reptile = (df['class'] == 'reptilia')
+    reptile_rows = df.loc[is_reptile]
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+    # Tally reptile rows by common name and by dataset
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+    print('Common names by count:\n')
+    for name in common_name_to_count:
+        print('{} ({})'.format(name,common_name_to_count[name]))
+    print('\nDatasets by count:\n')
+    for name in dataset_to_count:
+        print('{} ({})'.format(name,dataset_to_count[name]))