megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
megadetector/utils/url_utils.py
CHANGED
@@ -11,8 +11,11 @@ Frequently-used functions for downloading or manipulating URLs
 import os
 import re
 import urllib
-import
+import urllib.request
+import urllib.error
 import requests
+import shutil
+import pytest
 
 from functools import partial
 from tqdm import tqdm
@@ -20,111 +23,105 @@ from urllib.parse import urlparse
 from multiprocessing.pool import ThreadPool
 from multiprocessing.pool import Pool
 
-
+from megadetector.utils.ct_utils import make_test_folder
+from megadetector.utils.ct_utils import make_temp_folder
+
 max_path_len = 255
 
 
 #%% Download functions
 
-class DownloadProgressBar
+class DownloadProgressBar:
     """
     Progress updater based on the progressbar2 package.
-
+
     https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
     """
-
+
+
     def __init__(self):
+
         self.pbar = None
 
-    def __call__(self, block_num, block_size, total_size):
-        if not self.pbar:
-            # This is a pretty random import I'd rather not depend on outside of the
-            # rare case where it's used, so importing locally
-            # pip install progressbar2
-            import progressbar
-            self.pbar = progressbar.ProgressBar(max_value=total_size)
-            self.pbar.start()
-
-        downloaded = block_num * block_size
-        if downloaded < total_size:
-            self.pbar.update(downloaded)
-        else:
-            self.pbar.finish()
-
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def download_url(url,
-                 destination_filename=None,
-                 progress_updater=None,
-                 force_download=False,
+    def __call__(self, block_num, block_size, total_size): # noqa
+
+        if not self.pbar:
+            try:
+                import progressbar # type: ignore
+                self.pbar = progressbar.ProgressBar(max_value=total_size)
+                self.pbar.start()
+            except ImportError:
+                self.pbar = None
+                # print("ProgressBar not available, install 'progressbar2' for visual progress.")
+
+        if self.pbar:
+            downloaded = block_num * block_size
+            if downloaded < total_size:
+                self.pbar.update(downloaded)
+            else:
+                self.pbar.finish()
+
+
+def download_url(url,
+                 destination_filename=None,
+                 progress_updater=None,
+                 force_download=False,
                  verbose=True,
                  escape_spaces=True):
     """
-    Downloads a URL to a file. If no file is specified, creates a temporary file,
+    Downloads a URL to a file. If no file is specified, creates a temporary file,
     making a best effort to avoid filename collisions.
-
+
     Prints some diagnostic information and makes sure to omit SAS tokens from printouts.
-
+
     Args:
         url (str): the URL to download
         destination_filename (str, optional): the target filename; if None, will create
-            a file in system temp space
-        progress_updater (object or bool, optional): can be "None", "False", "True", or a
-            specific callable object. If None or False, no progress updated will be
+            a file in system temp space
+        progress_updater (object or bool, optional): can be "None", "False", "True", or a
+            specific callable object. If None or False, no progress updated will be
            displayed. If True, a default progress bar will be created.
         force_download (bool, optional): download this file even if [destination_filename]
             exists.
         verbose (bool, optional): enable additional debug console output
         escape_spaces (bool, optional): replace ' ' with '%20'
-
+
     Returns:
         str: the filename to which [url] was downloaded, the same as [destination_filename]
         if [destination_filename] was not None
     """
-
+
     if progress_updater is not None and isinstance(progress_updater,bool):
         if not progress_updater:
             progress_updater = None
         else:
             progress_updater = DownloadProgressBar()
-
+
     url_no_sas = url.split('?')[0]
-
+
     if destination_filename is None:
-
-        target_folder =
+
+        target_folder = make_temp_folder(subfolder='url_utils',append_guid=False)
         url_without_sas = url.split('?', 1)[0]
-
+
         # This does not guarantee uniqueness, hence "semi-best-effort"
         url_as_filename = re.sub(r'\W+', '', url_without_sas)
-
-
+
+        n_folder_chars = len(target_folder)
+
+        if (len(url_as_filename) + n_folder_chars) >= max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
-
+            max_fn_len = max_path_len - (n_folder_chars + 1)
+            url_as_filename = url_as_filename[-1 * max_fn_len:]
         destination_filename = \
             os.path.join(target_folder,url_as_filename)
-
+
+    # ...if the destination filename wasn't specified
+
     if escape_spaces:
         url = url.replace(' ','%20')
-
+
     if (not force_download) and (os.path.isfile(destination_filename)):
         if verbose:
             print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url_no_sas)))
@@ -133,12 +130,12 @@ def download_url(url,
         print('Downloading file {} to {}'.format(os.path.basename(url_no_sas),destination_filename),end='')
     target_dir = os.path.dirname(destination_filename)
     os.makedirs(target_dir,exist_ok=True)
-    urllib.request.urlretrieve(url, destination_filename, progress_updater)
+    urllib.request.urlretrieve(url, destination_filename, progress_updater)
     assert(os.path.isfile(destination_filename))
-
+    n_bytes = os.path.getsize(destination_filename)
     if verbose:
-        print('...done, {} bytes.'.format(
-
+        print('...done, {} bytes.'.format(n_bytes))
+
     return destination_filename
 
 # ...def download_url(...)
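
The hunks above change download_url() to create its temporary target folder via make_temp_folder() and to report the downloaded size. A minimal usage sketch of the updated function (illustrative URL; assumes megadetector 10.0 is installed and the URL is reachable):

from megadetector.utils.url_utils import download_url

# Hypothetical URL for illustration; escape_spaces defaults to True, so the
# space is rewritten to %20 before the request.
url = 'https://example.com/camera trap image.jpg'

# With destination_filename=None, the file lands in a folder created by
# make_temp_folder(); progress_updater=True asks for the optional
# progressbar2-based progress bar, which is skipped if that package is absent.
local_path = download_url(url,
                          destination_filename=None,
                          progress_updater=True,
                          force_download=False,
                          verbose=True)
print(local_path)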
@@ -146,24 +143,24 @@ def download_url(url,
 
 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path. Path is relative to
+    Download a URL to output_base, preserving relative path. Path is relative to
     the site, so:
-
+
     https://abc.com/xyz/123.txt
-
+
     ...will get downloaded to:
-
-    output_base/xyz/123.txt
-
+
+    output_base/xyz/123.txt
+
     Args:
         url (str): the URL to download
         output_base (str): the base folder to which we should download this file
         verbose (bool, optional): enable additional debug console output
-
+
     Returns:
         str: the local destination filename
     """
-
+
     p = urlparse(url)
     # remove the leading '/'
     assert p.path.startswith('/'); relative_filename = p.path[1:]
@@ -177,123 +174,139 @@ def _do_parallelized_download(download_info,overwrite=False,verbose=False):
     """
     Internal function for download parallelization.
     """
-
+
     url = download_info['url']
     target_file = download_info['target_file']
     result = {'status':'unknown','url':url,'target_file':target_file}
-
+
     if ((os.path.isfile(target_file)) and (not overwrite)):
         if verbose:
             print('Skipping existing file {}'.format(target_file))
         result['status'] = 'skipped'
         return result
     try:
-        download_url(url=url,
+        download_url(url=url,
                      destination_filename=target_file,
-                     verbose=verbose,
+                     verbose=verbose,
                      force_download=overwrite)
     except Exception as e:
         print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
+            url,str(e)))
         result['status'] = 'error: {}'.format(str(e))
         return result
-
+
     result['status'] = 'success'
     return result
 
 # ...def _do_parallelized_download(...)
 
 
-def parallel_download_urls(url_to_target_file,
-
+def parallel_download_urls(url_to_target_file,
+                           verbose=False,
+                           overwrite=False,
+                           n_workers=20,
+                           pool_type='thread'):
     """
     Downloads a list of URLs to local files.
-
-    Catches exceptions and reports them in the returned "results" array.
-
+
+    Catches exceptions and reports them in the returned "results" array.
+
     Args:
-        url_to_target_file: a dict mapping URLs to local filenames.
+        url_to_target_file (dict): a dict mapping URLs to local filenames.
         verbose (bool, optional): enable additional debug console output
         overwrite (bool, optional): whether to overwrite existing local files
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-
+
     Returns:
         list: list of dicts with keys:
             - 'url': the url this item refers to
             - 'status': 'skipped', 'success', or a string starting with 'error'
-            - 'target_file': the local filename to which we downloaded (or tried to
-              download) this URL
+            - 'target_file': the local filename to which we downloaded (or tried to
+              download) this URL
     """
-
+
     all_download_info = []
-
-
-
+
+    if verbose:
+        print('Preparing download list')
+    for url in tqdm(url_to_target_file, disable=(not verbose)):
         download_info = {}
         download_info['url'] = url
         download_info['target_file'] = url_to_target_file[url]
         all_download_info.append(download_info)
-
-
-
+
+    if verbose:
+        print('Downloading {} images on {} workers'.format(
+            len(all_download_info),n_workers))
 
     if n_workers <= 1:
 
         results = []
-
-        for download_info in tqdm(all_download_info):
+
+        for download_info in tqdm(all_download_info, disable=(not verbose)):
             result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
             results.append(result)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            results = list(tqdm(pool.imap(
+                partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
+                all_download_info), total=len(all_download_info), disable=(not verbose)))
+
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for parallel URL downloads")
+
     return results
 
 # ...def parallel_download_urls(...)
 
 
+@pytest.mark.skip(reason="This is not a test function")
 def test_url(url,error_on_failure=True,timeout=None):
     """
     Tests the availability of [url], returning an http status code.
-
+
     Args:
         url (str): URL to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
             error code) if accessing this URL fails
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: http status code (200 for success)
     """
-
-    # r = requests.get(url, stream=True, verify=True, timeout=timeout)
+
     r = requests.head(url, stream=True, verify=True, timeout=timeout)
-
-    if error_on_failure and r.status_code != 200:
+
+    if error_on_failure and r.status_code != 200:
         raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
     return r.status_code
-
 
-
+
+@pytest.mark.skip(reason="This is not a test function")
+def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None,verbose=False):
     """
     Verify that URLs are available (i.e., returns status 200). By default,
-    errors if any URL is unavailable.
-
+    errors if any URL is unavailable.
+
     Args:
         urls (list): list of URLs to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
@@ -301,39 +314,48 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         list: a list of http status codes, the same length and order as [urls]
     """
-
+
     if n_workers <= 1:
 
         status_codes = []
-
-        for url in tqdm(urls):
-
+
+        for url in tqdm(urls,disable=(not verbose)):
+
             r = requests.get(url, timeout=timeout)
-
-            if error_on_failure and r.status_code != 200:
+
+            if error_on_failure and r.status_code != 200:
                 raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
             status_codes.append(r.status_code)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            status_codes = list(tqdm(pool.imap(
+                partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL tests')
+
     return status_codes
 
 # ...def test_urls(...)
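
parallel_download_urls() and test_urls() now take a verbose flag and build their worker pools inside try/finally blocks, so the pools are always closed and joined. A short usage sketch with illustrative URLs and target paths (not taken from this package):

from megadetector.utils.url_utils import parallel_download_urls, test_urls

# Illustrative inputs only
url_to_target_file = {
    'https://example.com/images/a.jpg': '/tmp/downloads/a.jpg',
    'https://example.com/images/b.jpg': '/tmp/downloads/b.jpg'
}

# Each result dict carries 'url', 'target_file', and a 'status' of
# 'success', 'skipped', or 'error: ...'
results = parallel_download_urls(url_to_target_file,
                                 verbose=True,
                                 overwrite=False,
                                 n_workers=4,
                                 pool_type='thread')
failures = [r for r in results if r['status'].startswith('error')]

# With error_on_failure=False, per-URL status codes are returned instead of
# raising on the first non-200 response
status_codes = test_urls(list(url_to_target_file.keys()),
                         error_on_failure=False,
                         n_workers=4,
                         verbose=True)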
@@ -341,16 +363,16 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
 
 def get_url_size(url,verbose=False,timeout=None):
     """
-    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
-    URL is not available, or the Content-Length property is not available, or the content-Length
-    property is not an integer, returns None.
-
+    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
+    URL is not available, or the Content-Length property is not available, or the content-Length
+    property is not an integer, returns None.
+
     Args:
         url (str): the url to test
         verbose (bool, optional): enable additional debug output
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: the file size in bytes, or None if it can't be retrieved
     """
@@ -362,13 +384,18 @@ def get_url_size(url,verbose=False,timeout=None):
             if verbose:
                 print('Status {} retrieving file size for {}'.format(f.status,url))
             return None
-
+        size_bytes_str = f.headers.get('Content-Length')
+        if size_bytes_str is None:
+            if verbose:
+                print('No Content-Length header for {}'.format(url))
+            return None
+        size_bytes = int(size_bytes_str)
         return size_bytes
     except Exception as e:
         if verbose:
             print('Error retrieving file size for {}:\n{}'.format(url,str(e)))
         return None
-
+
 # ...def get_url_size(...)
 
 
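
get_url_size() now reads Content-Length through headers.get() and returns None explicitly when the header is absent. A brief sketch of the resulting behavior (illustrative URLs):

from megadetector.utils.url_utils import get_url_size, get_url_sizes

# None is returned for an unreachable URL, a missing Content-Length header,
# or a non-integer Content-Length value
size = get_url_size('https://example.com/file.bin', verbose=True, timeout=10)
if size is None:
    print('Size unavailable')

# get_url_sizes() maps each URL to its size (or None), optionally in parallel
url_to_size = get_url_sizes(['https://example.com/a.bin',
                             'https://example.com/b.bin'],
                            n_workers=2, pool_type='thread', timeout=10)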
@@ -376,45 +403,331 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     """
     Retrieve file sizes for the URLs specified by [urls]. Returns None for any URLs
     that we can't access, or URLs for which the Content-Length property is not set.
-
+
     Args:
         urls (list): list of URLs for which we should retrieve sizes
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
         verbose (bool, optional): print additional debug information
-
+
     Returns:
         dict: maps urls to file sizes, which will be None for URLs for which we were unable
-            to retrieve a valid size.
+            to retrieve a valid size.
     """
-
+
     url_to_size = {}
-
-    if n_workers <= 1:
-
-        for url in tqdm(urls):
+
+    if n_workers <= 1:
+
+        for url in tqdm(urls, disable=(not verbose)):
             url_to_size[url] = get_url_size(url,verbose=verbose,timeout=timeout)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            file_sizes = list(tqdm(pool.imap(
+                partial(get_url_size,verbose=verbose,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+
+            for i_url,url in enumerate(urls):
+                url_to_size[url] = file_sizes[i_url]
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL size checks')
+
     return url_to_size
 
-
+
+#%% Tests
+
+# Constants for tests
+
+SMALL_FILE_URL = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
+REDIRECT_SRC_URL = "http://google.com"
+REDIRECT_DEST_URL = "https://www.google.com/"
+NON_EXISTENT_URL = "https://example.com/non_existent_page_404.html"
+DEFINITELY_NON_EXISTENT_DOMAIN_URL = "https://thisshouldnotexist1234567890.com/file.txt"
+RELATIVE_DOWNLOAD_URL = "https://raw.githubusercontent.com/agentmorris/MegaDetector/main/README.md"
+RELATIVE_DOWNLOAD_CONTAIN_TOKEN = 'agentmorris'
+RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN = 'github'
+
+
+class TestUrlUtils:
+    """
+    Tests for url_utils.py
+    """
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='url_utils_tests')
+        self.download_target_dir = os.path.join(self.test_dir, 'downloads')
+        os.makedirs(self.download_target_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests and restore module temp_dir.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
+    def test_download_url_to_specified_file(self):
+        """
+        Test download_url with a specified destination filename.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "downloaded_google_logo.png")
+        returned_filename = download_url(SMALL_FILE_URL,
+                                         destination_filename=dest_filename,
+                                         verbose=False)
+        assert returned_filename == dest_filename
+        assert os.path.exists(dest_filename)
+        assert os.path.getsize(dest_filename) > 1000
+
+
+    def test_download_url_to_temp_file(self):
+        """
+        Test download_url when destination_filename is None.
+        """
+
+        returned_filename = download_url(SMALL_FILE_URL,
+                                         destination_filename=None,
+                                         verbose=False)
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 1000
+
+
+    def test_download_url_non_existent(self):
+        """
+        Test download_url with a non-existent URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "non_existent.html")
+        try:
+            download_url(NON_EXISTENT_URL, destination_filename=dest_filename, verbose=False)
+            raise AssertionError("urllib.error.HTTPError not raised for 404")
+        except urllib.error.HTTPError:
+            pass
+
+        try:
+            download_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
+                         destination_filename=dest_filename,
+                         verbose=False)
+            raise AssertionError(
+                "urllib.error.URLError or requests.exceptions.ConnectionError not raised for DNS failure")
+        except urllib.error.URLError:
+            pass
+        except requests.exceptions.ConnectionError:
+            pass
+
+
+    def test_download_url_force_download(self):
+        """
+        Test the force_download parameter of download_url.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "force_test.png")
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert os.path.exists(dest_filename)
+        initial_mtime = os.path.getmtime(dest_filename)
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=True)
+        assert os.path.getmtime(dest_filename) == initial_mtime
+
+        download_url(SMALL_FILE_URL,
+                     destination_filename=dest_filename,
+                     force_download=True,
+                     verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_url_escape_spaces(self):
+        """
+        Test download_url with spaces in the URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "escape_test.png")
+        download_url(SMALL_FILE_URL,
+                     destination_filename=dest_filename,
+                     escape_spaces=True,
+                     verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_relative_filename(self):
+        """
+        Test download_relative_filename.
+        """
+
+        output_base = os.path.join(self.download_target_dir, "relative_dl")
+        returned_filename = download_relative_filename(RELATIVE_DOWNLOAD_URL, output_base, verbose=False)
+        assert RELATIVE_DOWNLOAD_CONTAIN_TOKEN in returned_filename
+        assert RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN not in returned_filename
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 100
+
+
+    def test_parallel_download_urls(self):
+        """
+        Test parallel_download_urls (with n_workers=1 for simplicity).
+        """
+
+        url1_target = os.path.join(self.download_target_dir, "parallel_dl_1.png")
+        url2_target = os.path.join(self.download_target_dir, "parallel_dl_2_nonexistent.html")
+
+        url_to_target_file = {
+            SMALL_FILE_URL: url1_target,
+            NON_EXISTENT_URL: url2_target
+        }
+
+        results = parallel_download_urls(url_to_target_file, n_workers=1, verbose=False)
+
+        assert len(results) == 2
+
+        status_map = {res['url']: res for res in results}
+
+        assert status_map[SMALL_FILE_URL]['status'] == 'success'
+        assert status_map[SMALL_FILE_URL]['target_file'] == url1_target
+        assert os.path.exists(url1_target)
+
+        assert status_map[NON_EXISTENT_URL]['status'].startswith('error: HTTP Error 404')
+        assert status_map[NON_EXISTENT_URL]['target_file'] == url2_target
+        assert not os.path.exists(url2_target)
+
+        if not os.path.exists(url1_target):
+            download_url(SMALL_FILE_URL, url1_target, verbose=False)
+        results_skip = parallel_download_urls({SMALL_FILE_URL: url1_target},
+                                              n_workers=1,
+                                              overwrite=False,
+                                              verbose=True)
+        assert results_skip[0]['status'] == 'skipped'
+
+        results_overwrite = parallel_download_urls({SMALL_FILE_URL: url1_target},
+                                                   n_workers=1,
+                                                   overwrite=True,
+                                                   verbose=False)
+        assert results_overwrite[0]['status'] == 'success'
+
+
+    def test_test_url_and_test_urls(self):
+        """
+        Test test_url and test_urls functions.
+        """
+
+        assert test_url(SMALL_FILE_URL, error_on_failure=False, timeout=10) == 200
+        assert test_url(REDIRECT_SRC_URL, error_on_failure=False, timeout=10) in (200,301)
+
+        status_non_existent = test_url(NON_EXISTENT_URL, error_on_failure=False, timeout=5)
+        assert status_non_existent == 404
+
+        try:
+            test_url(NON_EXISTENT_URL, error_on_failure=True, timeout=5)
+            raise AssertionError("ValueError not raised for NON_EXISTENT_URL")
+        except ValueError:
+            pass
+
+        try:
+            test_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
+                     error_on_failure=True,
+                     timeout=5)
+            raise AssertionError("requests.exceptions.ConnectionError or urllib.error.URLError not raised")
+        except requests.exceptions.ConnectionError:
+            pass
+        except urllib.error.URLError:
+            pass
+
+
+        urls_to_test = [SMALL_FILE_URL, NON_EXISTENT_URL]
+        status_codes = test_urls(urls_to_test, error_on_failure=False, n_workers=1, timeout=10)
+        assert len(status_codes) == 2
+        assert status_codes[0] == 200
+        assert status_codes[1] == 404
+
+        try:
+            test_urls(urls_to_test, error_on_failure=True, n_workers=1, timeout=5)
+            raise AssertionError("ValueError not raised for urls_to_test")
+        except ValueError:
+            pass
+
+        good_urls = [SMALL_FILE_URL, REDIRECT_SRC_URL]
+        good_status_codes = test_urls(good_urls, error_on_failure=True, n_workers=1, timeout=10)
+        assert good_status_codes == [200, 200]
+
+
+    def test_get_url_size_and_sizes(self):
+        """
+        Test get_url_size and get_url_sizes functions.
+        """
+
+        size = get_url_size(SMALL_FILE_URL, timeout=10)
+        assert size is not None
+        assert size > 1000
+
+        size_dynamic = get_url_size(REDIRECT_DEST_URL, timeout=10, verbose=True)
+        if size_dynamic is not None:
+            assert isinstance(size_dynamic, int)
+
+        size_non_existent = get_url_size(NON_EXISTENT_URL, timeout=5)
+        assert size_non_existent is None
+
+        size_bad_domain = get_url_size(DEFINITELY_NON_EXISTENT_DOMAIN_URL, timeout=5)
+        assert size_bad_domain is None
+
+        urls_for_size = [SMALL_FILE_URL, NON_EXISTENT_URL, REDIRECT_DEST_URL]
+        sizes_map = get_url_sizes(urls_for_size, n_workers=1, timeout=10)
+
+        assert SMALL_FILE_URL in sizes_map
+        assert sizes_map[SMALL_FILE_URL] == size
+
+        assert NON_EXISTENT_URL in sizes_map
+        assert sizes_map[NON_EXISTENT_URL] is None
+
+        assert REDIRECT_DEST_URL in sizes_map
+        assert sizes_map[REDIRECT_DEST_URL] == size_dynamic
+
+
+def _test_url_utils():
+    """
+    Runs all tests in the TestUrlUtils class. I generally disable this during testing
+    because it creates irritating nondeterminism, and this is neither a core module nor
+    a module that changes often.
+    """
+
+    test_instance = TestUrlUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_download_url_to_specified_file()
+        test_instance.test_download_url_to_temp_file()
+        test_instance.test_download_url_non_existent()
+        test_instance.test_download_url_force_download()
+        test_instance.test_download_url_escape_spaces()
+        test_instance.test_download_relative_filename()
+        test_instance.test_parallel_download_urls()
+        test_instance.test_test_url_and_test_urls()
+        test_instance.test_get_url_size_and_sizes()
+    finally:
+        test_instance.tear_down()
+
+# from IPython import embed; embed()
+# test_url_utils()