megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.10-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +65 -65
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
- api/batch_processing/postprocessing/compare_batch_results.py +113 -43
- api/batch_processing/postprocessing/convert_output_format.py +41 -16
- api/batch_processing/postprocessing/load_api_results.py +16 -17
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +52 -22
- api/batch_processing/postprocessing/merge_detections.py +14 -14
- api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
- api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +102 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -263
- data_management/coco_to_yolo.py +79 -58
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +62 -24
- data_management/databases/subset_json_db.py +24 -15
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -162
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -158
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +7 -7
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +65 -24
- data_management/labelme_to_yolo.py +8 -8
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +13 -13
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +44 -110
- data_management/lila/generate_lila_per_image_labels.py +55 -42
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +96 -33
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +110 -97
- data_management/remap_coco_categories.py +83 -83
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +30 -23
- data_management/wi_download_csv_to_coco.py +246 -239
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +300 -60
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +179 -113
- detection/run_inference_with_yolov5_val.py +108 -48
- detection/run_tiled_inference.py +111 -40
- detection/tf_detector.py +51 -29
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +228 -68
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -871
- md_utils/path_utils.py +460 -134
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +176 -60
- md_utils/write_html_image_list.py +40 -33
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +597 -291
- md_visualization/visualize_db.py +76 -48
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/METADATA +13 -7
- megadetector-5.0.10.dist-info/RECORD +224 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- megadetector-5.0.8.dist-info/RECORD +0 -205
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/LICENSE +0 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/WHEEL +0 -0
data_management/lila/create_links_to_md_results_files.py

```diff
@@ -1,106 +1,106 @@
-
-
-
-
-
-
-
-
-#%% Imports and constants
-
-import os
-
-import pandas as pd
-
-input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
-output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
-
-md_results_local_folder = r'g:\temp\lila-md-results'
-md_base_url = 'https://lila.science/public/lila-md-results/'
-assert md_base_url.endswith('/')
-
-# No RDE files for datasets with no location information
-datasets_without_location_info = ('ena24','missouri-camera-traps')
-
-md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
-validate_urls = False
-
-
-#%% Read input data
-
-df = pd.read_csv(input_csv_file)
-for s in md_results_column_names:
-    df[s] = ''
-
-
-#%% Find matching files locally, and create URLs
-
-local_files = os.listdir(md_results_local_folder)
-local_files = [fn for fn in local_files if fn.endswith('.zip')]
-
-# i_row = 0; row = df.iloc[i_row]
-for i_row,row in df.iterrows():
-
-    if not isinstance(row['name'],str):
-        continue
-
-    dataset_shortname = row['short_name']
-    matching_files = [fn for fn in local_files if dataset_shortname in fn]
-
-    # No RDE files for datasets with no location information
-    if dataset_shortname in datasets_without_location_info:
-        assert len(matching_files) == 2
-        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
-        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
-        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
-        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-    else:
-        # Exclude single-season files for snapshot-serengeti
-        if dataset_shortname == 'snapshot-serengeti':
-            matching_files = [fn for fn in matching_files if '_S' not in fn]
-            assert len(matching_files) == 2
-            assert all(['mdv4' in fn for fn in matching_files])
-            rde_files = [fn for fn in matching_files if 'rde' in fn]
-            raw_files = [fn for fn in matching_files if 'rde' not in fn]
-            assert len(rde_files) == 1 and len(raw_files) == 1
-            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
-            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-        else:
-            assert len(matching_files) == 3
-            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
-            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
-            rde_files = [fn for fn in matching_files if 'rde' in fn]
-            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
-            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-
-    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
-
-# ...for each row
-
-
-#%% Validate URLs
-
-if validate_urls:
-
-    from md_utils.url_utils import test_urls
-
-    urls = set()
-
-    for i_row,row in df.iterrows():
-        for column_name in md_results_column_names:
-            if len(row[column_name]) > 0:
-                assert row[column_name] not in urls
-                urls.add(row[column_name])
-
-    test_urls(urls,error_on_failure=True)
-
-    print('Validated {} URLs'.format(len(urls)))
-
-
-#%% Write new .csv file
-
-df.to_csv(output_csv_file,header=True,index=False)
+"""
+
+create_links_to_md_results_files.py
+
+One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+"""
+
+#%% Imports and constants
+
+import os
+
+import pandas as pd
+
+input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+md_results_local_folder = r'g:\temp\lila-md-results'
+md_base_url = 'https://lila.science/public/lila-md-results/'
+assert md_base_url.endswith('/')
+
+# No RDE files for datasets with no location information
+datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+validate_urls = False
+
+
+#%% Read input data
+
+df = pd.read_csv(input_csv_file)
+for s in md_results_column_names:
+    df[s] = ''
+
+
+#%% Find matching files locally, and create URLs
+
+local_files = os.listdir(md_results_local_folder)
+local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+# i_row = 0; row = df.iloc[i_row]
+for i_row,row in df.iterrows():
+
+    if not isinstance(row['name'],str):
+        continue
+
+    dataset_shortname = row['short_name']
+    matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+    # No RDE files for datasets with no location information
+    if dataset_shortname in datasets_without_location_info:
+        assert len(matching_files) == 2
+        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+    else:
+        # Exclude single-season files for snapshot-serengeti
+        if dataset_shortname == 'snapshot-serengeti':
+            matching_files = [fn for fn in matching_files if '_S' not in fn]
+            assert len(matching_files) == 2
+            assert all(['mdv4' in fn for fn in matching_files])
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            raw_files = [fn for fn in matching_files if 'rde' not in fn]
+            assert len(rde_files) == 1 and len(raw_files) == 1
+            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+        else:
+            assert len(matching_files) == 3
+            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+# ...for each row
+
+
+#%% Validate URLs
+
+if validate_urls:
+
+    from md_utils.url_utils import test_urls
+
+    urls = set()
+
+    for i_row,row in df.iterrows():
+        for column_name in md_results_column_names:
+            if len(row[column_name]) > 0:
+                assert row[column_name] not in urls
+                urls.add(row[column_name])
+
+    test_urls(urls,error_on_failure=True)
+
+    print('Validated {} URLs'.format(len(urls)))
+
+
+#%% Write new .csv file
+
+df.to_csv(output_csv_file,header=True,index=False)
```
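For reference, the validation cell in the script above checks its generated links with `test_urls` from `md_utils.url_utils`. A minimal standalone sketch of that call, assuming hypothetical result-file names (only the function name, the `error_on_failure` argument, and the base URL are taken from the script; the zip filenames are placeholders):

```python
# Sketch only: verify that a couple of (hypothetical) MD-results URLs resolve,
# mirroring the validate_urls cell in the script above.
from md_utils.url_utils import test_urls

urls = {
    'https://lila.science/public/lila-md-results/example-dataset.mdv5a.json.zip',  # placeholder
    'https://lila.science/public/lila-md-results/example-dataset.mdv5b.json.zip',  # placeholder
}

# error_on_failure=True is how the script above invokes it; presumably this
# raises on the first failing URL rather than just reporting it.
test_urls(urls, error_on_failure=True)
```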
data_management/lila/download_lila_subset.py

```diff
@@ -1,17 +1,11 @@
-
-
-
-
-
-
-
-
-# what you want to query for, etc., is very application-specific; this is just meant as a
-# demo.
-#
-# Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
-#
-########
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
 
 #%% Constants and imports
 
@@ -19,11 +13,9 @@ import os
 import random
 
 from tqdm import tqdm
-from multiprocessing.pool import ThreadPool
 from collections import defaultdict
 
 from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
-from md_utils.url_utils import download_url
 
 for s in lila_base_urls.values():
     assert s.endswith('/')
@@ -43,70 +35,22 @@ os.makedirs(output_dir,exist_ok=True)
 # Number of concurrent download threads
 n_download_threads = 20
 
-verbose = False
-
 max_images_per_dataset = 10 # None
 
-# This impacts the data download, but not the metadata download
-#
-# Setting this to "Azure" really means "Azure if available"; some datasets are
-# not available on Azure.
 preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
 
 random.seed(0)
 
 
-#%% Support functions
-
-def download_relative_url(relative_url, output_base, provider='gcp',
-                          verbose=False, overwrite=False):
-    """
-    Download a URL to output_base, preserving the path relative to the common LILA root.
-    """
-
-    assert not relative_url.startswith('/')
-
-    # Not all datasets are available on Azure, fall back in these cases. The decision
-    # to fall back to GCP rather than AWS is arbitrary.
-    if provider == 'azure':
-        nominal_provider = relative_url_to_nominal_provider[relative_url]
-        if nominal_provider != 'azure':
-            if verbose:
-                print('URL {} not available on Azure, falling back to GCP'.format(
-                    relative_url))
-            provider = 'gcp'
-
-    url = lila_base_urls[provider] + relative_url
-
-    result = {'status':'unknown','url':url,'destination_filename':None}
-
-    destination_filename = os.path.join(output_base,relative_url)
-    result['destination_filename'] = destination_filename
-
-    if ((os.path.isfile(destination_filename)) and (not overwrite)):
-        result['status'] = 'skipped'
-        return result
-    try:
-        download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
-    except Exception as e:
-        print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
-        result['status'] = 'error: {}'.format(str(e))
-        return result
-
-    result['status'] = 'success'
-    return result
-
-
 #%% Download and open the giant table of image URLs and labels
 
-# ~60 seconds to download, unzip, and open
+# Takes ~60 seconds to download, unzip, and open
 df = read_lila_all_images_file(metadata_dir)
 
 
 #%% Find all the images we want to download
 
-# ~2 minutes
+# Takes ~2 minutes
 
 common_name_to_count = defaultdict(int)
 
@@ -119,6 +63,8 @@ def find_items(row):
 
     match = False
 
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
     for species_name in species_of_interest:
         if species_name in row['common_name']:
             match = True
@@ -126,7 +72,7 @@ def find_items(row):
             break
 
     if match:
-        ds_name_to_urls[row['dataset_name']].append(row['
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
 
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)
@@ -154,58 +100,47 @@ else:
     ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
 
 
-#%%
+#%% Choose target files for each URL
 
-
-all_urls = [item for sublist in all_urls for item in sublist]
+from data_management.lila.lila_common import lila_base_urls
 
-
+# We have a list of URLs per dataset, flatten that into a single list of URLs
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
 
-#
-# is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
-# GCP and AWS.
+# A URL might look like this:
 #
-#
-
-
-
-
-
-
-
-
-    all_urls_relative.append(relative_url)
-    relative_url_to_nominal_provider[relative_url] = provider
-    found_base = True
-    break
-assert found_base
-
-assert len(all_urls) == len(all_urls_relative)
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
 
+url_to_target_file = {}
 
-
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
 
-print('Downloading {} images on {} workers, preferred provider is {}'.format(
-    len(all_urls),n_download_threads,preferred_provider))
 
-
+#%% Download image files
 
-
-
-# url_relative = all_urls_relative[0]
-for url_relative in tqdm(all_urls_relative):
-    result = download_relative_url(url_relative,
-                                   output_base=output_dir,
-                                   provider=preferred_provider,
-                                   verbose=verbose)
-    results.append(result)
-
-else:
+from md_utils.url_utils import parallel_download_urls
 
-
-
-
-
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
 
 
 #%% Scrap
@@ -240,4 +175,3 @@ if False:
     print('\nDatasets by count:\n')
     for k in dataset_to_count:
         print('{} ({})'.format(k,dataset_to_count[k]))
-
```
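The main functional change in download_lila_subset.py is that the hand-rolled `download_relative_url` helper is gone, replaced by `parallel_download_urls` from `md_utils.url_utils`, driven by a URL-to-target-file dictionary. A condensed sketch of that pattern, under the same assumptions the updated script makes (the example URL and the base-URL-stripping step come from the diff's own comments; `output_dir` is a placeholder):

```python
import os

from data_management.lila.lila_common import lila_base_urls
from md_utils.url_utils import parallel_download_urls

output_dir = os.path.expanduser('~/lila/downloads')  # placeholder path
preferred_provider = 'gcp'  # 'azure', 'gcp', 'aws'

# Absolute URLs to fetch; in the real script this list is built from the
# per-dataset URL lists assembled earlier.
urls_to_download = [
    'https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg'
]

# Map each URL to a local target file, preserving the path relative to the
# provider's base URL (e.g. wcs-unzipped/animals/0667/0302.jpg).
base_url = lila_base_urls[preferred_provider]
assert base_url.endswith('/')
url_to_target_file = {url: os.path.join(output_dir, url.replace(base_url, ''))
                      for url in urls_to_download}

# Threaded parallel download, as in the updated script
download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
                                          verbose=False,
                                          overwrite=False,
                                          n_workers=20,
                                          pool_type='thread')
```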
data_management/lila/generate_lila_per_image_labels.py

```diff
@@ -1,19 +1,19 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+"""
+
+generate_lila_per_image_labels.py
+
+Generate a .csv file with one row per annotation, containing full URLs to every
+camera trap image on LILA, with taxonomically expanded labels.
+
+Typically there will be one row per image, though images with multiple annotations
+will have multiple rows.
+
+Some images may not physically exist, particularly images that are labeled as "human".
+This script does not validate image URLs.
+
+Does not include bounding box annotations.
+
+"""
 
 #%% Constants and imports
 
@@ -23,8 +23,6 @@ import pandas as pd
 import numpy as np
 import dateparser
 import csv
-import urllib
-import urllib.request
 
 from collections import defaultdict
 from tqdm import tqdm
@@ -36,7 +34,6 @@ from data_management.lila.lila_common import read_lila_metadata, \
 from md_utils import write_html_image_list
 from md_utils.path_utils import zip_file
 from md_utils.path_utils import open_file
-from md_utils.url_utils import download_url
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -107,12 +104,15 @@ for i_row,row in taxonomy_df.iterrows():
 
 # Takes several hours
 
-
-
+# The order of these headers needs to match the order in which fields are added later in this cell;
+# don't mess with this order.
+header = ['dataset_name','url_gcp','url_aws','url_azure',
+          'image_id','sequence_id','location_id','frame_num',
+          'original_label','scientific_name','common_name','datetime','annotation_level']
 
 taxonomy_levels_to_include = \
     ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies'
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
      'variety']
 
 header.extend(taxonomy_levels_to_include)
@@ -179,10 +179,17 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
             break
 
         file_name = im['file_name'].replace('\\','/')
-
-
-
+        base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
+        base_url_aws = metadata_table[ds_name]['image_base_url_aws']
+        base_url_azure = metadata_table[ds_name]['image_base_url_azure']
+        assert not base_url_gcp.endswith('/')
+        assert not base_url_aws.endswith('/')
+        assert not base_url_azure.endswith('/')
 
+        url_gcp = base_url_gcp + '/' + file_name
+        url_aws = base_url_aws + '/' + file_name
+        url_azure = base_url_azure + '/' + file_name
+
        for k in im.keys():
            if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                raise ValueError('Unrecognized datetime field')
@@ -297,7 +304,9 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
 
         row = []
         row.append(ds_name)
-        row.append(
+        row.append(url_gcp)
+        row.append(url_aws)
+        row.append(url_azure)
         row.append(image_id)
         row.append(sequence_id)
         row.append(location_id)
@@ -365,7 +374,8 @@ dataset_name_to_locations = defaultdict(set)
 def check_row(row):
 
     assert row['dataset_name'] in metadata_table.keys()
-
+    for url_column in ['url_gcp','url_aws','url_azure']:
+        assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
     assert ' : ' in row['image_id']
     assert 'seq' not in row['location_id'].lower()
     assert row['annotation_level'] in valid_annotation_levels
@@ -446,28 +456,31 @@ for ds_name in metadata_table.keys():
 print('Selected {} total images'.format(len(images_to_download)))
 
 
-#%% Download images
+#%% Download images (prep)
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-
-
+preferred_cloud = 'aws'
+
+url_to_target_file = {}
+
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
 
-    url = image['
+    url = image['url_' + preferred_cloud]
     ext = os.path.splitext(url)[1]
-
-
-
-
-
-
-
-
-    image['relative_file'] = None
+    fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
+    fn_abs = os.path.join(preview_folder,fn_relative)
+    image['relative_file'] = fn_relative
+    image['url'] = url
+    url_to_target_file[url] = fn_abs
+
+
+#%% Download images (execution)
 
-
+from md_utils.url_utils import parallel_download_urls
+download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
+                                          n_workers=20,pool_type='thread')
 
 
 #%% Write preview HTML
@@ -499,4 +512,4 @@ open_file(html_filename)
 
 zipped_output_file = zip_file(output_file,verbose=True)
 
-print('Zipped {} to {}'.format(output_file,zipped_output_file))
+print('Zipped {} to {}'.format(output_file,zipped_output_file))
```
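generate_lila_per_image_labels.py now writes three per-row URL columns (`url_gcp`, `url_aws`, `url_azure`) instead of a single URL column. A small sketch of how a consumer of the generated .csv might select one of them; the filename is a placeholder, and only the column names come from the header list in the diff:

```python
import pandas as pd

per_image_csv = 'lila_image_urls_and_labels.csv'  # placeholder filename
preferred_cloud = 'gcp'  # 'gcp', 'aws', or 'azure'

df = pd.read_csv(per_image_csv)

# Each row now carries one URL per cloud provider; pick the preferred one.
url_column = 'url_' + preferred_cloud
assert url_column in ('url_gcp', 'url_aws', 'url_azure')

some_urls = df[url_column].head(5).tolist()
print('\n'.join(some_urls))
```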
data_management/lila/get_lila_annotation_counts.py

```diff
@@ -1,16 +1,16 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+"""
+
+get_lila_annotation_counts.py
+
+Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+that exist for that dataset, with counts for the number of occurrences of each category
+(the number of *annotations* for each category, not the number of *images*).
+
+Also loads the taxonomy mapping file, to include scientific names for each category.
+
+get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+"""
 
 #%% Constants and imports
 
@@ -20,6 +20,9 @@ import os
 from data_management.lila.lila_common import read_lila_metadata,\
     read_metadata_file_for_dataset, read_lila_taxonomy_mapping
 
+# cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+preferred_cloud = 'gcp'
+
 # array to fill for output
 category_list = []
 
@@ -96,9 +99,9 @@ for ds_name in metadata_table.keys():
     print('Warning: taxonomy mapping not available for {}'.format(ds_name))
 
     print('Finding categories in {}'.format(ds_name))
-
+
     json_filename = metadata_table[ds_name]['json_filename']
-    base_url = metadata_table[ds_name]['
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
     assert not base_url.endswith('/')
 
     # Open the metadata file
```