megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.9-py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
data_management/lila/get_lila_image_counts.py
@@ -1,14 +1,14 @@
-########
-#
-# get_lila_image_counts.py
-#
-# Count the number of images and bounding boxes with each label in one or more LILA datasets.
-#
-# This script doesn't write these counts out anywhere other than the console, it's just intended
-# as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
-# information out to a .json file, but it counts *annotations*, not *images*, for each category.
-#
-########
+"""
+
+get_lila_image_counts.py
+
+Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+This script doesn't write these counts out anywhere other than the console, it's just intended
+as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+"""
 
 #%% Constants and imports
 
data_management/lila/lila_common.py
@@ -1,10 +1,10 @@
-########
-#
-# lila_common.py
-#
-# Common constants and functions related to LILA data management/retrieval.
-#
-########
+"""
+
+lila_common.py
+
+Common constants and functions related to LILA data management/retrieval.
+
+"""
 
 #%% Imports and constants
 
@@ -12,12 +12,12 @@ import os
 import json
 import zipfile
 import pandas as pd
-import numpy as np
 
 from urllib.parse import urlparse
 
 from md_utils.url_utils import download_url
 from md_utils.path_utils import unzip_file
+from md_utils.ct_utils import is_empty
 
 # LILA camera trap primary metadata file
 lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
@@ -33,11 +33,19 @@ wildlife_insights_taxonomy_local_csv_filename = \
 
 # Filenames are consistent across clouds relative to these URLs
 lila_base_urls = {
-    'azure':'https://lilablobssc.blob.core.windows.net/',
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
     'gcp':'https://storage.googleapis.com/public-datasets-lila/',
    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
 }
 
+lila_cloud_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'gs://public-datasets-lila/',
+    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+}
+
+for url in lila_base_urls.values():
+    assert url.endswith('/')
 
 
 #%% Common functions
@@ -46,7 +54,11 @@ def read_wildlife_insights_taxonomy_mapping(metadata_dir):
     """
     Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.
 
-    Returns a Pandas dataframe.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.dataframe: A DataFrame with taxonomy information
     """
 
     wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
@@ -85,7 +97,11 @@ def read_lila_taxonomy_mapping(metadata_dir):
     """
     Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.
 
-    Returns a Pandas dataframe, with one row per identification.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with one row per identification
     """
 
     p = urlparse(lila_taxonomy_mapping_url)
@@ -97,24 +113,38 @@ def read_lila_taxonomy_mapping(metadata_dir):
     return df
 
 
-def is_empty(v):
-    if v is None:
-        return True
-    if isinstance(v,str) and v == '':
-        return True
-    if isinstance(v,float) and np.isnan(v):
-        return True
-    return False
-
-
 def read_lila_metadata(metadata_dir):
     """
-    Reads LILA metadata (URLs to each dataset), downloading the txt file if necessary.
+    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.
 
-    Returns a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
-    with keys corresponding to the headers in the .csv file, currently:
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
 
-    name,image_base_url,metadata_url,bbox_url,continent,country,region
+    Returns:
+        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+        with keys corresponding to the headers in the .csv file, currently:
+
+        - name
+        - short_name
+        - continent
+        - country
+        - region
+        - image_base_url_relative
+        - metadata_url_relative
+        - bbox_url_relative
+        - image_base_url_gcp
+        - metadata_url_gcp
+        - bbox_url_gcp
+        - image_base_url_aws
+        - metadata_url_aws
+        - bbox_url_aws
+        - image_base_url_azure
+        - metadata_url_azure
+        - box_url_azure
+        - mdv4_results_raw
+        - mdv5b_results_raw
+        - md_results_with_rde
+        - json_filename
     """
 
     # Put the master metadata file in the same folder where we're putting images
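The is_empty() helper removed in the hunk above now lives in md_utils.ct_utils (see the import added in the earlier lila_common.py hunk). A minimal usage sketch, assuming the relocated helper keeps the semantics of the removed definition:

    import numpy as np
    from md_utils.ct_utils import is_empty

    # Mirrors the checks in the removed definition: None, empty string, and NaN are "empty"
    assert is_empty(None)
    assert is_empty('')
    assert is_empty(np.nan)
    assert not is_empty('caltech-camera-traps')
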
data_management/lila/lila_common.py
@@ -148,6 +178,12 @@ def read_lila_all_images_file(metadata_dir):
     """
     Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
     all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
     """
 
     p = urlparse(lila_all_images_url)
@@ -169,18 +205,37 @@ def read_lila_all_images_file(metadata_dir):
     return df
 
 
-def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json_url=None):
+def read_metadata_file_for_dataset(ds_name,
+                                   metadata_dir,
+                                   metadata_table=None,
+                                   json_url=None,
+                                   preferred_cloud='gcp'):
     """
     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
-    Returns the .json filename on the local disk.
+
+    Args:
+        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+            "Caltech Camera Traps")
+        metadata_dir (str): folder to use for temporary LILA metadata files
+        metadata_table (dict, optional): an optional dictionary already loaded via
+            read_lila_metadata()
+        json_url (str, optional): the URL of the metadata file, if None will be retrieved
+            via read_lila_metadata()
+        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+
+    Returns:
+        str: the .json filename on the local disk
+
     """
 
+    assert preferred_cloud in lila_base_urls.keys()
+
     if json_url is None:
 
         if metadata_table is None:
             metadata_table = read_lila_metadata(metadata_dir)
 
-        json_url = metadata_table[ds_name]['metadata_url']
+        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]
 
     p = urlparse(json_url)
     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
@@ -215,7 +270,8 @@ if False:
 
     from md_utils import url_utils
 
-    status_codes = url_utils.test_urls(urls)
+    status_codes = url_utils.test_urls(urls,timeout=2.0)
+    assert all([code == 200 for code in status_codes])
 
 
     #%% Verify that the metadata URLs exist for individual datasets
@@ -225,13 +281,20 @@ if False:
     dataset_metadata = read_lila_metadata(metadata_dir)
 
     urls_to_test = []
+
     # ds_name = next(iter(dataset_metadata.keys()))
     for ds_name in dataset_metadata.keys():
 
         ds_info = dataset_metadata[ds_name]
-        urls_to_test.append(ds_info['metadata_url'])
-        if ds_info['bbox_url'] != None:
-            urls_to_test.append(ds_info['bbox_url'])
+        for cloud_name in lila_base_urls.keys():
+            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+            if ds_info['bbox_url_relative'] != None:
+                urls_to_test.append(ds_info['bbox_url_' + cloud_name])
 
-    status_codes = url_utils.test_urls(urls_to_test)
+    status_codes = url_utils.test_urls(urls_to_test,
+                                       error_on_failure=True,
+                                       n_workers=10,
+                                       pool_type='process',
+                                       timeout=2.0)
+    assert all([code == 200 for code in status_codes])
 
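Taken together, the lila_common.py changes above replace the single metadata_url / bbox_url fields with per-cloud keys and add a preferred_cloud argument to read_metadata_file_for_dataset(). A minimal sketch of the new calling pattern, assuming only the function names, key names, and example dataset shown in this diff; the local metadata folder is arbitrary:

    import os
    from data_management.lila.lila_common import read_lila_metadata, read_metadata_file_for_dataset

    metadata_dir = os.path.expanduser('~/lila/metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    metadata_table = read_lila_metadata(metadata_dir)
    ds_name = 'Caltech Camera Traps'

    # Metadata URLs are now stored per cloud: 'metadata_url_gcp', 'metadata_url_aws', 'metadata_url_azure'
    print(metadata_table[ds_name]['metadata_url_gcp'])

    # read_metadata_file_for_dataset() picks the URL for the requested cloud ('gcp' by default)
    json_filename = read_metadata_file_for_dataset(ds_name=ds_name,
                                                   metadata_dir=metadata_dir,
                                                   metadata_table=metadata_table,
                                                   preferred_cloud='gcp')
    print(json_filename)
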
data_management/lila/test_lila_metadata_urls.py
@@ -1,116 +1,132 @@
-########
-#
-# test_lila_metadata_urls.py
-#
-# Test that all the metadata URLs for LILA camera trap datasets are valid, and
-# test that at least one image within each URL is valid, including MegaDetector results
-# files.
-#
-########
-
-#%% Constants and imports
-
-import json
-import os
-
-from data_management.lila.lila_common import read_lila_metadata,\
-    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
-
-# We'll write images, metadata downloads, and temporary files here
-lila_local_base = os.path.expanduser('~/lila')
-
-output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
-os.makedirs(output_dir,exist_ok=True)
-
-metadata_dir = os.path.join(lila_local_base,'metadata')
-os.makedirs(metadata_dir,exist_ok=True)
-
-md_results_dir = os.path.join(lila_local_base,'md_results')
-os.makedirs(md_results_dir,exist_ok=True)
-
-md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
-
-#%% Load category and taxonomy files
-
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
-
-
-#%% Download and parse the metadata file
-
-metadata_table = read_lila_metadata(metadata_dir)
-
-print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
-
-
-#%% Download and extract metadata and MD results for each dataset
-
-for ds_name in metadata_table.keys():
-
-    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                              metadata_dir=metadata_dir,
-                                                                              metadata_table=metadata_table)
-    for k in md_results_keys:
-        md_results_url = metadata_table[ds_name][k]
-        if md_results_url is None:
-            metadata_table[ds_name][k + '_filename'] = None
-        else:
-            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                                      metadata_dir=md_results_dir,
-                                                                                      json_url=md_results_url)
-
-
-#%% Build up a list of URLs to test
-
-url_to_source = {}
-
-# The first image in a dataset is disproportionately likely to be human (and thus 404)
-image_index = 1000
-
-# ds_name = list(metadata_table.keys())[0]
-for ds_name in metadata_table.keys():
-
-    if 'bbox' in ds_name:
-        print('Skipping bbox dataset {}'.format(ds_name))
-        continue
-
-    print('Processing dataset {}'.format(ds_name))
-
-    json_filename = metadata_table[ds_name]['json_filename']
-    with open(json_filename, 'r') as f:
-        data = json.load(f)
-
-    image_base_url = metadata_table[ds_name]['image_base_url']
-    assert not image_base_url.endswith('/')
-    # Download a test image
-    test_image_relative_path = data['images'][image_index]['file_name']
-    test_image_url = image_base_url + '/' + test_image_relative_path
-
-    url_to_source[test_image_url] = ds_name + ' metadata'
-
-    # k = md_results_keys[2]
-    for k in md_results_keys:
-        k_fn = k + '_filename'
-        if metadata_table[ds_name][k_fn] is not None:
-            with open(metadata_table[ds_name][k_fn],'r') as f:
-                md_results = json.load(f)
-            im = md_results['images'][image_index]
-            md_image_url = image_base_url + '/' + im['file']
-            url_to_source[md_image_url] = ds_name + ' ' + k
-
-# ...for each dataset
-
-
-#%% Test URLs
-
-from md_utils.url_utils import test_urls
-
-urls_to_test = sorted(url_to_source.keys())
-urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
-
-status_codes = test_urls(urls_to_test,error_on_failure=False)
-
-for i_url,url in enumerate(urls_to_test):
-    if status_codes[i_url] != 200:
-        print('Status {} for {} ({})'.format(
-            status_codes[i_url],url,url_to_source[url]))
+"""
+
+test_lila_metadata_urls.py
+
+Test that all the metadata URLs for LILA camera trap datasets are valid, including MegaDetector
+results files.
+
+Also pick an arbitrary image from each dataset and make sure that URL is valid.
+
+Also picks an arbitrary image from each dataset's MD results and make sure the corresponding URL is valid.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+preferred_cloud = 'gcp' # 'azure', 'aws'
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+# Takes ~15 mins, since it has to open all the giant .json files
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404),
+# so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not image_base_url.endswith('/')
+    # Download a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # Grab an image from the MegaDetector results
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+            del md_results
+    del data
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from md_utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,
+                         error_on_failure=False,
+                         pool_type='thread',
+                         n_workers=10,
+                         timeout=2.0)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))
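The URL check at the end of the new script relies on the expanded test_urls() signature (error_on_failure, pool_type, n_workers, timeout). A minimal sketch of the same call against a single URL taken from lila_common.py above; the choice of URL and the printing are illustrative only:

    from md_utils.url_utils import test_urls

    # The LILA master metadata .csv referenced in lila_common.py
    urls = ['http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv']

    status_codes = test_urls(urls,
                             error_on_failure=False,
                             pool_type='thread',
                             n_workers=10,
                             timeout=2.0)

    for url, code in zip(urls, status_codes):
        if code != 200:
            print('Status {} for {}'.format(code, url))
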