megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
megadetector/utils/url_utils.py
CHANGED
@@ -11,8 +11,11 @@ Frequently-used functions for downloading or manipulating URLs
 import os
 import re
 import urllib
-import
+import urllib.request
+import urllib.error
 import requests
+import shutil
+import pytest
 
 from functools import partial
 from tqdm import tqdm
@@ -20,111 +23,105 @@ from urllib.parse import urlparse
 from multiprocessing.pool import ThreadPool
 from multiprocessing.pool import Pool
 
-
+from megadetector.utils.ct_utils import make_test_folder
+from megadetector.utils.ct_utils import make_temp_folder
+
 max_path_len = 255
 
 
 #%% Download functions
 
-class DownloadProgressBar
+class DownloadProgressBar:
     """
     Progress updater based on the progressbar2 package.
-
+
     https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
     """
-
+
+
     def __init__(self):
+
         self.pbar = None
 
-    def __call__(self, block_num, block_size, total_size):
-        if not self.pbar:
-            # This is a pretty random import I'd rather not depend on outside of the
-            # rare case where it's used, so importing locally
-            # pip install progressbar2
-            import progressbar
-            self.pbar = progressbar.ProgressBar(max_value=total_size)
-            self.pbar.start()
-
-        downloaded = block_num * block_size
-        if downloaded < total_size:
-            self.pbar.update(downloaded)
-        else:
-            self.pbar.finish()
-
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def download_url(url,
-                 destination_filename=None,
-                 progress_updater=None,
-                 force_download=False,
+    def __call__(self, block_num, block_size, total_size): # noqa
+
+        if not self.pbar:
+            try:
+                import progressbar # type: ignore
+                self.pbar = progressbar.ProgressBar(max_value=total_size)
+                self.pbar.start()
+            except ImportError:
+                self.pbar = None
+                # print("ProgressBar not available, install 'progressbar2' for visual progress.")
+
+        if self.pbar:
+            downloaded = block_num * block_size
+            if downloaded < total_size:
+                self.pbar.update(downloaded)
+            else:
+                self.pbar.finish()
+
+
+def download_url(url,
+                 destination_filename=None,
+                 progress_updater=None,
+                 force_download=False,
                  verbose=True,
                  escape_spaces=True):
     """
-    Downloads a URL to a file. If no file is specified, creates a temporary file,
+    Downloads a URL to a file. If no file is specified, creates a temporary file,
     making a best effort to avoid filename collisions.
-
+
     Prints some diagnostic information and makes sure to omit SAS tokens from printouts.
-
+
     Args:
         url (str): the URL to download
         destination_filename (str, optional): the target filename; if None, will create
-            a file in system temp space
-        progress_updater (object or bool, optional): can be "None", "False", "True", or a
-            specific callable object. If None or False, no progress updated will be
+            a file in system temp space
+        progress_updater (object or bool, optional): can be "None", "False", "True", or a
+            specific callable object. If None or False, no progress updated will be
            displayed. If True, a default progress bar will be created.
         force_download (bool, optional): download this file even if [destination_filename]
             exists.
         verbose (bool, optional): enable additional debug console output
         escape_spaces (bool, optional): replace ' ' with '%20'
-
+
     Returns:
         str: the filename to which [url] was downloaded, the same as [destination_filename]
         if [destination_filename] was not None
     """
-
+
     if progress_updater is not None and isinstance(progress_updater,bool):
         if not progress_updater:
             progress_updater = None
         else:
             progress_updater = DownloadProgressBar()
-
+
     url_no_sas = url.split('?')[0]
-
+
     if destination_filename is None:
-
-        target_folder =
+
+        target_folder = make_temp_folder(subfolder='url_utils',append_guid=False)
         url_without_sas = url.split('?', 1)[0]
-
+
         # This does not guarantee uniqueness, hence "semi-best-effort"
         url_as_filename = re.sub(r'\W+', '', url_without_sas)
-
-
+
+        n_folder_chars = len(target_folder)
+
+        if (len(url_as_filename) + n_folder_chars) >= max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
-
+            max_fn_len = max_path_len - (n_folder_chars + 1)
+            url_as_filename = url_as_filename[-1 * max_fn_len:]
         destination_filename = \
             os.path.join(target_folder,url_as_filename)
-
+
+    # ...if the destination filename wasn't specified
+
     if escape_spaces:
         url = url.replace(' ','%20')
-
+
     if (not force_download) and (os.path.isfile(destination_filename)):
         if verbose:
             print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url_no_sas)))
@@ -133,12 +130,12 @@ def download_url(url,
         print('Downloading file {} to {}'.format(os.path.basename(url_no_sas),destination_filename),end='')
     target_dir = os.path.dirname(destination_filename)
     os.makedirs(target_dir,exist_ok=True)
-    urllib.request.urlretrieve(url, destination_filename, progress_updater)
+    urllib.request.urlretrieve(url, destination_filename, progress_updater)
     assert(os.path.isfile(destination_filename))
-
+    n_bytes = os.path.getsize(destination_filename)
     if verbose:
-        print('...done, {} bytes.'.format(
-
+        print('...done, {} bytes.'.format(n_bytes))
+
     return destination_filename
 
 # ...def download_url(...)
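
The hunks above change download_url() to create its temporary target folder via make_temp_folder() and to report the downloaded size. A minimal usage sketch of the updated function (illustrative URL; assumes megadetector 10.0 is installed and the URL is reachable):

from megadetector.utils.url_utils import download_url

# Hypothetical URL for illustration; escape_spaces defaults to True, so the
# space is rewritten to %20 before the request.
url = 'https://example.com/camera trap image.jpg'

# With destination_filename=None, the file lands in a folder created by
# make_temp_folder(); progress_updater=True asks for the optional
# progressbar2-based progress bar, which is skipped if that package is absent.
local_path = download_url(url,
                          destination_filename=None,
                          progress_updater=True,
                          force_download=False,
                          verbose=True)
print(local_path)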
@@ -146,24 +143,24 @@ def download_url(url,
 
 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path. Path is relative to
+    Download a URL to output_base, preserving relative path. Path is relative to
     the site, so:
-
+
     https://abc.com/xyz/123.txt
-
+
     ...will get downloaded to:
-
-    output_base/xyz/123.txt
-
+
+    output_base/xyz/123.txt
+
     Args:
         url (str): the URL to download
         output_base (str): the base folder to which we should download this file
         verbose (bool, optional): enable additional debug console output
-
+
     Returns:
         str: the local destination filename
     """
-
+
     p = urlparse(url)
     # remove the leading '/'
     assert p.path.startswith('/'); relative_filename = p.path[1:]
@@ -177,123 +174,139 @@ def _do_parallelized_download(download_info,overwrite=False,verbose=False):
     """
     Internal function for download parallelization.
     """
-
+
     url = download_info['url']
     target_file = download_info['target_file']
     result = {'status':'unknown','url':url,'target_file':target_file}
-
+
     if ((os.path.isfile(target_file)) and (not overwrite)):
         if verbose:
             print('Skipping existing file {}'.format(target_file))
         result['status'] = 'skipped'
         return result
     try:
-        download_url(url=url,
+        download_url(url=url,
                      destination_filename=target_file,
-                     verbose=verbose,
+                     verbose=verbose,
                      force_download=overwrite)
     except Exception as e:
         print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
+            url,str(e)))
         result['status'] = 'error: {}'.format(str(e))
         return result
-
+
     result['status'] = 'success'
     return result
 
 # ...def _do_parallelized_download(...)
 
 
-def parallel_download_urls(url_to_target_file,
-
+def parallel_download_urls(url_to_target_file,
+                           verbose=False,
+                           overwrite=False,
+                           n_workers=20,
+                           pool_type='thread'):
     """
     Downloads a list of URLs to local files.
-
-    Catches exceptions and reports them in the returned "results" array.
-
+
+    Catches exceptions and reports them in the returned "results" array.
+
     Args:
-        url_to_target_file: a dict mapping URLs to local filenames.
+        url_to_target_file (dict): a dict mapping URLs to local filenames.
         verbose (bool, optional): enable additional debug console output
         overwrite (bool, optional): whether to overwrite existing local files
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-
+
     Returns:
         list: list of dicts with keys:
             - 'url': the url this item refers to
             - 'status': 'skipped', 'success', or a string starting with 'error'
-            - 'target_file': the local filename to which we downloaded (or tried to
-              download) this URL
+            - 'target_file': the local filename to which we downloaded (or tried to
+              download) this URL
     """
-
+
     all_download_info = []
-
-
-
+
+    if verbose:
+        print('Preparing download list')
+    for url in tqdm(url_to_target_file, disable=(not verbose)):
         download_info = {}
         download_info['url'] = url
         download_info['target_file'] = url_to_target_file[url]
         all_download_info.append(download_info)
-
-
-
+
+    if verbose:
+        print('Downloading {} images on {} workers'.format(
+            len(all_download_info),n_workers))
 
     if n_workers <= 1:
 
         results = []
-
-        for download_info in tqdm(all_download_info):
+
+        for download_info in tqdm(all_download_info, disable=(not verbose)):
             result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
             results.append(result)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            results = list(tqdm(pool.imap(
+                partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
+                all_download_info), total=len(all_download_info), disable=(not verbose)))
+
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for parallel URL downloads")
+
     return results
 
 # ...def parallel_download_urls(...)
 
 
+@pytest.mark.skip(reason="This is not a test function")
 def test_url(url,error_on_failure=True,timeout=None):
     """
     Tests the availability of [url], returning an http status code.
-
+
     Args:
         url (str): URL to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
             error code) if accessing this URL fails
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: http status code (200 for success)
     """
-
-    # r = requests.get(url, stream=True, verify=True, timeout=timeout)
+
     r = requests.head(url, stream=True, verify=True, timeout=timeout)
-
-    if error_on_failure and r.status_code != 200:
+
+    if error_on_failure and r.status_code != 200:
         raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
     return r.status_code
-
 
-
+
+@pytest.mark.skip(reason="This is not a test function")
+def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None,verbose=False):
     """
     Verify that URLs are available (i.e., returns status 200). By default,
-    errors if any URL is unavailable.
-
+    errors if any URL is unavailable.
+
     Args:
         urls (list): list of URLs to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
@@ -301,39 +314,48 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         list: a list of http status codes, the same length and order as [urls]
     """
-
+
     if n_workers <= 1:
 
         status_codes = []
-
-        for url in tqdm(urls):
-
+
+        for url in tqdm(urls,disable=(not verbose)):
+
             r = requests.get(url, timeout=timeout)
-
-            if error_on_failure and r.status_code != 200:
+
+            if error_on_failure and r.status_code != 200:
                 raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
             status_codes.append(r.status_code)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            status_codes = list(tqdm(pool.imap(
+                partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL tests')
+
     return status_codes
 
 # ...def test_urls(...)
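
parallel_download_urls() and test_urls() now take a verbose flag and build their worker pools inside try/finally blocks, so the pools are always closed and joined. A short usage sketch with illustrative URLs and target paths (not taken from this package):

from megadetector.utils.url_utils import parallel_download_urls, test_urls

# Illustrative inputs only
url_to_target_file = {
    'https://example.com/images/a.jpg': '/tmp/downloads/a.jpg',
    'https://example.com/images/b.jpg': '/tmp/downloads/b.jpg'
}

# Each result dict carries 'url', 'target_file', and a 'status' of
# 'success', 'skipped', or 'error: ...'
results = parallel_download_urls(url_to_target_file,
                                 verbose=True,
                                 overwrite=False,
                                 n_workers=4,
                                 pool_type='thread')
failures = [r for r in results if r['status'].startswith('error')]

# With error_on_failure=False, per-URL status codes are returned instead of
# raising on the first non-200 response
status_codes = test_urls(list(url_to_target_file.keys()),
                         error_on_failure=False,
                         n_workers=4,
                         verbose=True)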
@@ -341,16 +363,16 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
 
 def get_url_size(url,verbose=False,timeout=None):
     """
-    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
-    URL is not available, or the Content-Length property is not available, or the content-Length
-    property is not an integer, returns None.
-
+    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
+    URL is not available, or the Content-Length property is not available, or the content-Length
+    property is not an integer, returns None.
+
     Args:
         url (str): the url to test
         verbose (bool, optional): enable additional debug output
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: the file size in bytes, or None if it can't be retrieved
     """
@@ -362,13 +384,18 @@ def get_url_size(url,verbose=False,timeout=None):
             if verbose:
                 print('Status {} retrieving file size for {}'.format(f.status,url))
             return None
-
+        size_bytes_str = f.headers.get('Content-Length')
+        if size_bytes_str is None:
+            if verbose:
+                print('No Content-Length header for {}'.format(url))
+            return None
+        size_bytes = int(size_bytes_str)
         return size_bytes
     except Exception as e:
         if verbose:
             print('Error retrieving file size for {}:\n{}'.format(url,str(e)))
         return None
-
+
 # ...def get_url_size(...)
 
 
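
get_url_size() now reads Content-Length through headers.get() and returns None explicitly when the header is absent. A brief sketch of the resulting behavior (illustrative URLs):

from megadetector.utils.url_utils import get_url_size, get_url_sizes

# None is returned for an unreachable URL, a missing Content-Length header,
# or a non-integer Content-Length value
size = get_url_size('https://example.com/file.bin', verbose=True, timeout=10)
if size is None:
    print('Size unavailable')

# get_url_sizes() maps each URL to its size (or None), optionally in parallel
url_to_size = get_url_sizes(['https://example.com/a.bin',
                             'https://example.com/b.bin'],
                            n_workers=2, pool_type='thread', timeout=10)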
@@ -376,45 +403,331 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     """
     Retrieve file sizes for the URLs specified by [urls]. Returns None for any URLs
     that we can't access, or URLs for which the Content-Length property is not set.
-
+
     Args:
         urls (list): list of URLs for which we should retrieve sizes
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
         verbose (bool, optional): print additional debug information
-
+
     Returns:
         dict: maps urls to file sizes, which will be None for URLs for which we were unable
-            to retrieve a valid size.
+            to retrieve a valid size.
     """
-
+
     url_to_size = {}
-
-    if n_workers <= 1:
-
-        for url in tqdm(urls):
+
+    if n_workers <= 1:
+
+        for url in tqdm(urls, disable=(not verbose)):
             url_to_size[url] = get_url_size(url,verbose=verbose,timeout=timeout)
-
+
     else:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            file_sizes = list(tqdm(pool.imap(
+                partial(get_url_size,verbose=verbose,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+
+            for i_url,url in enumerate(urls):
+                url_to_size[url] = file_sizes[i_url]
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL size checks')
+
     return url_to_size
 
-
+
+#%% Tests
+
+# Constants for tests
+
+SMALL_FILE_URL = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
+REDIRECT_SRC_URL = "http://google.com"
+REDIRECT_DEST_URL = "https://www.google.com/"
+NON_EXISTENT_URL = "https://example.com/non_existent_page_404.html"
+DEFINITELY_NON_EXISTENT_DOMAIN_URL = "https://thisshouldnotexist1234567890.com/file.txt"
+RELATIVE_DOWNLOAD_URL = "https://raw.githubusercontent.com/agentmorris/MegaDetector/main/README.md"
+RELATIVE_DOWNLOAD_CONTAIN_TOKEN = 'agentmorris'
+RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN = 'github'
+
+
+class TestUrlUtils:
+    """
+    Tests for url_utils.py
+    """
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='url_utils_tests')
+        self.download_target_dir = os.path.join(self.test_dir, 'downloads')
+        os.makedirs(self.download_target_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests and restore module temp_dir.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
+    def test_download_url_to_specified_file(self):
+        """
+        Test download_url with a specified destination filename.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "downloaded_google_logo.png")
+        returned_filename = download_url(SMALL_FILE_URL,
+                                         destination_filename=dest_filename,
+                                         verbose=False)
+        assert returned_filename == dest_filename
+        assert os.path.exists(dest_filename)
+        assert os.path.getsize(dest_filename) > 1000
+
+
+    def test_download_url_to_temp_file(self):
+        """
+        Test download_url when destination_filename is None.
+        """
+
+        returned_filename = download_url(SMALL_FILE_URL,
+                                         destination_filename=None,
+                                         verbose=False)
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 1000
+
+
+    def test_download_url_non_existent(self):
+        """
+        Test download_url with a non-existent URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "non_existent.html")
+        try:
+            download_url(NON_EXISTENT_URL, destination_filename=dest_filename, verbose=False)
+            raise AssertionError("urllib.error.HTTPError not raised for 404")
+        except urllib.error.HTTPError:
+            pass
+
+        try:
+            download_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
+                         destination_filename=dest_filename,
+                         verbose=False)
+            raise AssertionError(
+                "urllib.error.URLError or requests.exceptions.ConnectionError not raised for DNS failure")
+        except urllib.error.URLError:
+            pass
+        except requests.exceptions.ConnectionError:
+            pass
+
+
+    def test_download_url_force_download(self):
+        """
+        Test the force_download parameter of download_url.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "force_test.png")
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert os.path.exists(dest_filename)
+        initial_mtime = os.path.getmtime(dest_filename)
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=True)
+        assert os.path.getmtime(dest_filename) == initial_mtime
+
+        download_url(SMALL_FILE_URL,
+                     destination_filename=dest_filename,
+                     force_download=True,
+                     verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_url_escape_spaces(self):
+        """
+        Test download_url with spaces in the URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "escape_test.png")
+        download_url(SMALL_FILE_URL,
+                     destination_filename=dest_filename,
+                     escape_spaces=True,
+                     verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_relative_filename(self):
+        """
+        Test download_relative_filename.
+        """
+
+        output_base = os.path.join(self.download_target_dir, "relative_dl")
+        returned_filename = download_relative_filename(RELATIVE_DOWNLOAD_URL, output_base, verbose=False)
+        assert RELATIVE_DOWNLOAD_CONTAIN_TOKEN in returned_filename
+        assert RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN not in returned_filename
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 100
+
+
+    def test_parallel_download_urls(self):
+        """
+        Test parallel_download_urls (with n_workers=1 for simplicity).
+        """
+
+        url1_target = os.path.join(self.download_target_dir, "parallel_dl_1.png")
+        url2_target = os.path.join(self.download_target_dir, "parallel_dl_2_nonexistent.html")
+
+        url_to_target_file = {
+            SMALL_FILE_URL: url1_target,
+            NON_EXISTENT_URL: url2_target
+        }
+
+        results = parallel_download_urls(url_to_target_file, n_workers=1, verbose=False)
+
+        assert len(results) == 2
+
+        status_map = {res['url']: res for res in results}
+
+        assert status_map[SMALL_FILE_URL]['status'] == 'success'
+        assert status_map[SMALL_FILE_URL]['target_file'] == url1_target
+        assert os.path.exists(url1_target)
+
+        assert status_map[NON_EXISTENT_URL]['status'].startswith('error: HTTP Error 404')
+        assert status_map[NON_EXISTENT_URL]['target_file'] == url2_target
+        assert not os.path.exists(url2_target)
+
+        if not os.path.exists(url1_target):
+            download_url(SMALL_FILE_URL, url1_target, verbose=False)
+        results_skip = parallel_download_urls({SMALL_FILE_URL: url1_target},
+                                              n_workers=1,
+                                              overwrite=False,
+                                              verbose=True)
+        assert results_skip[0]['status'] == 'skipped'
+
+        results_overwrite = parallel_download_urls({SMALL_FILE_URL: url1_target},
+                                                   n_workers=1,
+                                                   overwrite=True,
+                                                   verbose=False)
+        assert results_overwrite[0]['status'] == 'success'
+
+
+    def test_test_url_and_test_urls(self):
+        """
+        Test test_url and test_urls functions.
+        """
+
+        assert test_url(SMALL_FILE_URL, error_on_failure=False, timeout=10) == 200
+        assert test_url(REDIRECT_SRC_URL, error_on_failure=False, timeout=10) in (200,301)
+
+        status_non_existent = test_url(NON_EXISTENT_URL, error_on_failure=False, timeout=5)
+        assert status_non_existent == 404
+
+        try:
+            test_url(NON_EXISTENT_URL, error_on_failure=True, timeout=5)
+            raise AssertionError("ValueError not raised for NON_EXISTENT_URL")
+        except ValueError:
+            pass
+
+        try:
+            test_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
+                     error_on_failure=True,
+                     timeout=5)
+            raise AssertionError("requests.exceptions.ConnectionError or urllib.error.URLError not raised")
+        except requests.exceptions.ConnectionError:
+            pass
+        except urllib.error.URLError:
+            pass
+
+
+        urls_to_test = [SMALL_FILE_URL, NON_EXISTENT_URL]
+        status_codes = test_urls(urls_to_test, error_on_failure=False, n_workers=1, timeout=10)
+        assert len(status_codes) == 2
+        assert status_codes[0] == 200
+        assert status_codes[1] == 404
+
+        try:
+            test_urls(urls_to_test, error_on_failure=True, n_workers=1, timeout=5)
+            raise AssertionError("ValueError not raised for urls_to_test")
+        except ValueError:
+            pass
+
+        good_urls = [SMALL_FILE_URL, REDIRECT_SRC_URL]
+        good_status_codes = test_urls(good_urls, error_on_failure=True, n_workers=1, timeout=10)
+        assert good_status_codes == [200, 200]
+
+
+    def test_get_url_size_and_sizes(self):
+        """
+        Test get_url_size and get_url_sizes functions.
+        """
+
+        size = get_url_size(SMALL_FILE_URL, timeout=10)
+        assert size is not None
+        assert size > 1000
+
+        size_dynamic = get_url_size(REDIRECT_DEST_URL, timeout=10, verbose=True)
+        if size_dynamic is not None:
+            assert isinstance(size_dynamic, int)
+
+        size_non_existent = get_url_size(NON_EXISTENT_URL, timeout=5)
+        assert size_non_existent is None
+
+        size_bad_domain = get_url_size(DEFINITELY_NON_EXISTENT_DOMAIN_URL, timeout=5)
+        assert size_bad_domain is None
+
+        urls_for_size = [SMALL_FILE_URL, NON_EXISTENT_URL, REDIRECT_DEST_URL]
+        sizes_map = get_url_sizes(urls_for_size, n_workers=1, timeout=10)
+
+        assert SMALL_FILE_URL in sizes_map
+        assert sizes_map[SMALL_FILE_URL] == size
+
+        assert NON_EXISTENT_URL in sizes_map
+        assert sizes_map[NON_EXISTENT_URL] is None
+
+        assert REDIRECT_DEST_URL in sizes_map
+        assert sizes_map[REDIRECT_DEST_URL] == size_dynamic
+
+
+def _test_url_utils():
+    """
+    Runs all tests in the TestUrlUtils class. I generally disable this during testing
+    because it creates irritating nondeterminism, and this is neither a core module nor
+    a module that changes often.
+    """
+
+    test_instance = TestUrlUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_download_url_to_specified_file()
+        test_instance.test_download_url_to_temp_file()
+        test_instance.test_download_url_non_existent()
+        test_instance.test_download_url_force_download()
+        test_instance.test_download_url_escape_spaces()
+        test_instance.test_download_relative_filename()
+        test_instance.test_parallel_download_urls()
+        test_instance.test_test_url_and_test_urls()
+        test_instance.test_get_url_size_and_sizes()
+    finally:
+        test_instance.tear_down()
+
+# from IPython import embed; embed()
+# test_url_utils()