megadetector-5.0.27-py3-none-any.whl → megadetector-5.0.29-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +232 -223
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +341 -338
- megadetector/detection/pytorch_detector.py +308 -266
- megadetector/detection/run_detector.py +186 -166
- megadetector/detection/run_detector_batch.py +366 -364
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +312 -253
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +291 -283
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +808 -311
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +220 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -0
- megadetector/postprocessing/load_api_results.py +25 -22
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -302
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1019 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1511 -406
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +73 -60
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2868
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +424 -404
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +126 -98
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.27.dist-info/RECORD +0 -208
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
megadetector/utils/url_utils.py
CHANGED
@@ -11,8 +11,12 @@ Frequently-used functions for downloading or manipulating URLs
 import os
 import re
 import urllib
+import urllib.request
+import urllib.error
 import tempfile
-import requests
+import requests
+import shutil
+import pytest

 from functools import partial
 from tqdm import tqdm
@@ -20,111 +24,105 @@ from urllib.parse import urlparse
 from multiprocessing.pool import ThreadPool
 from multiprocessing.pool import Pool

-
+from megadetector.utils.ct_utils import make_test_folder
+from megadetector.utils.ct_utils import make_temp_folder
+
 max_path_len = 255


 #%% Download functions

-class DownloadProgressBar
+class DownloadProgressBar:
     """
     Progress updater based on the progressbar2 package.
-
+
     https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
     """
-
+
+
     def __init__(self):
+
         self.pbar = None

-    def __call__(self, block_num, block_size, total_size):
-        if not self.pbar:
-            # This is a pretty random import I'd rather not depend on outside of the
-            # rare case where it's used, so importing locally
-            # pip install progressbar2
-            import progressbar
-            self.pbar = progressbar.ProgressBar(max_value=total_size)
-            self.pbar.start()
-
-        downloaded = block_num * block_size
-        if downloaded < total_size:
-            self.pbar.update(downloaded)
-        else:
-            self.pbar.finish()
-

-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def download_url(url,
-                 destination_filename=None,
-                 progress_updater=None,
-                 force_download=False,
+    def __call__(self, block_num, block_size, total_size): # noqa
+
+        if not self.pbar:
+            try:
+                import progressbar # type: ignore
+                self.pbar = progressbar.ProgressBar(max_value=total_size)
+                self.pbar.start()
+            except ImportError:
+                self.pbar = None
+                # print("ProgressBar not available, install 'progressbar2' for visual progress.")
+
+        if self.pbar:
+            downloaded = block_num * block_size
+            if downloaded < total_size:
+                self.pbar.update(downloaded)
+            else:
+                self.pbar.finish()
+
+
+def download_url(url,
+                 destination_filename=None,
+                 progress_updater=None,
+                 force_download=False,
                  verbose=True,
                  escape_spaces=True):
     """
-    Downloads a URL to a file. If no file is specified, creates a temporary file,
+    Downloads a URL to a file. If no file is specified, creates a temporary file,
     making a best effort to avoid filename collisions.
-
+
     Prints some diagnostic information and makes sure to omit SAS tokens from printouts.
-
+
     Args:
         url (str): the URL to download
         destination_filename (str, optional): the target filename; if None, will create
-            a file in system temp space
-        progress_updater (object or bool, optional): can be "None", "False", "True", or a
-            specific callable object. If None or False, no progress updated will be
+            a file in system temp space
+        progress_updater (object or bool, optional): can be "None", "False", "True", or a
+            specific callable object. If None or False, no progress updated will be
             displayed. If True, a default progress bar will be created.
         force_download (bool, optional): download this file even if [destination_filename]
             exists.
         verbose (bool, optional): enable additional debug console output
         escape_spaces (bool, optional): replace ' ' with '%20'
-
+
     Returns:
         str: the filename to which [url] was downloaded, the same as [destination_filename]
         if [destination_filename] was not None
     """
-
+
     if progress_updater is not None and isinstance(progress_updater,bool):
         if not progress_updater:
             progress_updater = None
         else:
             progress_updater = DownloadProgressBar()
-
+
     url_no_sas = url.split('?')[0]
-
+
     if destination_filename is None:
-
-        target_folder =
+
+        target_folder = make_temp_folder(subfolder='url_utils',append_guid=False)
         url_without_sas = url.split('?', 1)[0]
-
+
         # This does not guarantee uniqueness, hence "semi-best-effort"
         url_as_filename = re.sub(r'\W+', '', url_without_sas)
-
-
+
+        n_folder_chars = len(target_folder)
+
+        if (len(url_as_filename) + n_folder_chars) >= max_path_len:
             print('Warning: truncating filename target to {} characters'.format(max_path_len))
-
+            max_fn_len = max_path_len - (n_folder_chars + 1)
+            url_as_filename = url_as_filename[-1 * max_fn_len:]
         destination_filename = \
             os.path.join(target_folder,url_as_filename)
-
+
+    # ...if the destination filename wasn't specified
+
     if escape_spaces:
         url = url.replace(' ','%20')
-
+
     if (not force_download) and (os.path.isfile(destination_filename)):
         if verbose:
             print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url_no_sas)))
@@ -133,12 +131,12 @@ def download_url(url,
         print('Downloading file {} to {}'.format(os.path.basename(url_no_sas),destination_filename),end='')
     target_dir = os.path.dirname(destination_filename)
     os.makedirs(target_dir,exist_ok=True)
-    urllib.request.urlretrieve(url, destination_filename, progress_updater)
+    urllib.request.urlretrieve(url, destination_filename, progress_updater)
     assert(os.path.isfile(destination_filename))
-
+    n_bytes = os.path.getsize(destination_filename)
     if verbose:
-        print('...done, {} bytes.'.format(
-
+        print('...done, {} bytes.'.format(n_bytes))
+
     return destination_filename

 # ...def download_url(...)
@@ -146,24 +144,24 @@ def download_url(url,

 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path. Path is relative to
+    Download a URL to output_base, preserving relative path. Path is relative to
     the site, so:
-
+
     https://abc.com/xyz/123.txt
-
+
     ...will get downloaded to:
-
-    output_base/xyz/123.txt
-
+
+    output_base/xyz/123.txt
+
     Args:
         url (str): the URL to download
         output_base (str): the base folder to which we should download this file
         verbose (bool, optional): enable additional debug console output
-
+
     Returns:
         str: the local destination filename
     """
-
+
     p = urlparse(url)
     # remove the leading '/'
     assert p.path.startswith('/'); relative_filename = p.path[1:]
@@ -177,40 +175,40 @@ def _do_parallelized_download(download_info,overwrite=False,verbose=False):
     """
     Internal function for download parallelization.
     """
-
+
     url = download_info['url']
     target_file = download_info['target_file']
     result = {'status':'unknown','url':url,'target_file':target_file}
-
+
     if ((os.path.isfile(target_file)) and (not overwrite)):
         if verbose:
             print('Skipping existing file {}'.format(target_file))
         result['status'] = 'skipped'
         return result
     try:
-        download_url(url=url,
+        download_url(url=url,
                      destination_filename=target_file,
-                     verbose=verbose,
+                     verbose=verbose,
                      force_download=overwrite)
     except Exception as e:
         print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
+            url,str(e)))
         result['status'] = 'error: {}'.format(str(e))
         return result
-
+
     result['status'] = 'success'
     return result

 # ...def _do_parallelized_download(...)


-def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
-                           n_workers=20,pool_type='thread'):
+def parallel_download_urls(url_to_target_file, verbose=False, overwrite=False,
+                           n_workers=20, pool_type='thread'):
     """
     Downloads a list of URLs to local files.
-
-    Catches exceptions and reports them in the returned "results" array.
-
+
+    Catches exceptions and reports them in the returned "results" array.
+
     Args:
         url_to_target_file: a dict mapping URLs to local filenames.
         verbose (bool, optional): enable additional debug console output
@@ -218,82 +216,95 @@ def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-
+
     Returns:
         list: list of dicts with keys:
             - 'url': the url this item refers to
            - 'status': 'skipped', 'success', or a string starting with 'error'
-            - 'target_file': the local filename to which we downloaded (or tried to
-              download) this URL
+            - 'target_file': the local filename to which we downloaded (or tried to
+              download) this URL
     """
-
+
     all_download_info = []
-
-
-
+
+    if verbose:
+        print('Preparing download list')
+    for url in tqdm(url_to_target_file, disable=(not verbose)):
         download_info = {}
         download_info['url'] = url
         download_info['target_file'] = url_to_target_file[url]
         all_download_info.append(download_info)
-
-
-
+
+    if verbose:
+        print('Downloading {} images on {} workers'.format(
+            len(all_download_info),n_workers))

     if n_workers <= 1:

         results = []
-
-        for download_info in tqdm(all_download_info):
+
+        for download_info in tqdm(all_download_info, disable=(not verbose)):
             result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
             results.append(result)
-
+
     else:

-
-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            results = list(tqdm(pool.imap(
+                partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
+                all_download_info), total=len(all_download_info), disable=(not verbose)))
+
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for parallel URL downloads")
+
     return results

 # ...def parallel_download_urls(...)


+@pytest.mark.skip(reason="This is not a test function")
 def test_url(url,error_on_failure=True,timeout=None):
     """
     Tests the availability of [url], returning an http status code.
-
+
     Args:
         url (str): URL to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
             error code) if accessing this URL fails
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: http status code (200 for success)
     """
-
-    # r = requests.get(url, stream=True, verify=True, timeout=timeout)
+
     r = requests.head(url, stream=True, verify=True, timeout=timeout)
-
-    if error_on_failure and r.status_code != 200:
+
+    if error_on_failure and r.status_code != 200:
         raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
     return r.status_code
-

-
+
+@pytest.mark.skip(reason="This is not a test function")
+def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None,verbose=False):
     """
     Verify that URLs are available (i.e., returns status 200). By default,
-    errors if any URL is unavailable.
-
+    errors if any URL is unavailable.
+
     Args:
         urls (list): list of URLs to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
@@ -301,39 +312,48 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         list: a list of http status codes, the same length and order as [urls]
     """
-
+
     if n_workers <= 1:

         status_codes = []
-
-        for url in tqdm(urls):
-
+
+        for url in tqdm(urls,disable=(not verbose)):
+
             r = requests.get(url, timeout=timeout)
-
-            if error_on_failure and r.status_code != 200:
+
+            if error_on_failure and r.status_code != 200:
                 raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
             status_codes.append(r.status_code)
-
+
     else:

-
-
-
-
-
-
-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            status_codes = list(tqdm(pool.imap(
+                partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL tests')
+
     return status_codes

 # ...def test_urls(...)
@@ -341,16 +361,16 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=

 def get_url_size(url,verbose=False,timeout=None):
     """
-    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
-    URL is not available, or the Content-Length property is not available, or the content-Length
-    property is not an integer, returns None.
-
+    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
+    URL is not available, or the Content-Length property is not available, or the content-Length
+    property is not an integer, returns None.
+
     Args:
         url (str): the url to test
         verbose (bool, optional): enable additional debug output
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: the file size in bytes, or None if it can't be retrieved
     """
@@ -362,13 +382,18 @@ def get_url_size(url,verbose=False,timeout=None):
             if verbose:
                 print('Status {} retrieving file size for {}'.format(f.status,url))
             return None
-
+        size_bytes_str = f.headers.get('Content-Length')
+        if size_bytes_str is None:
+            if verbose:
+                print('No Content-Length header for {}'.format(url))
+            return None
+        size_bytes = int(size_bytes_str)
         return size_bytes
     except Exception as e:
         if verbose:
             print('Error retrieving file size for {}:\n{}'.format(url,str(e)))
         return None
-
+
 # ...def get_url_size(...)


@@ -376,45 +401,310 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     """
     Retrieve file sizes for the URLs specified by [urls]. Returns None for any URLs
     that we can't access, or URLs for which the Content-Length property is not set.
-
+
     Args:
         urls (list): list of URLs for which we should retrieve sizes
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
         verbose (bool, optional): print additional debug information
-
+
     Returns:
         dict: maps urls to file sizes, which will be None for URLs for which we were unable
-            to retrieve a valid size.
+            to retrieve a valid size.
     """
-
+
     url_to_size = {}
-
-    if n_workers <= 1:
-
-        for url in tqdm(urls):
+
+    if n_workers <= 1:
+
+        for url in tqdm(urls, disable=(not verbose)):
             url_to_size[url] = get_url_size(url,verbose=verbose,timeout=timeout)
-
+
     else:

-
-
-
-
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            file_sizes = list(tqdm(pool.imap(
+                partial(get_url_size,verbose=verbose,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+
+            for i_url,url in enumerate(urls):
+                url_to_size[url] = file_sizes[i_url]
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL size checks')
+
+    return url_to_size
+
+
+#%% Tests
+
+# Constants for tests
+
+SMALL_FILE_URL = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
+REDIRECT_SRC_URL = "http://google.com"
+REDIRECT_DEST_URL = "https://www.google.com/"
+NON_EXISTENT_URL = "https://example.com/non_existent_page_404.html"
+DEFINITELY_NON_EXISTENT_DOMAIN_URL = "https://thisshouldnotexist1234567890.com/file.txt"
+RELATIVE_DOWNLOAD_URL = "https://raw.githubusercontent.com/agentmorris/MegaDetector/main/README.md"
+RELATIVE_DOWNLOAD_CONTAIN_TOKEN = 'agentmorris'
+RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN = 'github'
+
+
+class TestUrlUtils:
+    """
+    Tests for url_utils.py
+    """
+
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='url_utils_tests')
+        self.download_target_dir = os.path.join(self.test_dir, 'downloads')
+        os.makedirs(self.download_target_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests and restore module temp_dir.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
+    def test_download_url_to_specified_file(self):
+        """
+        Test download_url with a specified destination filename.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "downloaded_google_logo.png")
+        returned_filename = download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert returned_filename == dest_filename
+        assert os.path.exists(dest_filename)
+        assert os.path.getsize(dest_filename) > 1000
+
+
+    def test_download_url_to_temp_file(self):
+        """
+        Test download_url when destination_filename is None.
+        """
+
+        returned_filename = download_url(SMALL_FILE_URL, destination_filename=None, verbose=False)
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 1000

-
+
+    def test_download_url_non_existent(self):
+        """
+        Test download_url with a non-existent URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "non_existent.html")
+        try:
+            download_url(NON_EXISTENT_URL, destination_filename=dest_filename, verbose=False)
+            assert False, "urllib.error.HTTPError not raised for 404"
+        except urllib.error.HTTPError:
+            pass

-
-
-
+        try:
+            download_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL, destination_filename=dest_filename, verbose=False)
+            assert False, \
+                "urllib.error.URLError or requests.exceptions.ConnectionError not raised for DNS failure"
+        except urllib.error.URLError:
+            pass
+        except requests.exceptions.ConnectionError:
+            pass
+
+
+    def test_download_url_force_download(self):
+        """
+        Test the force_download parameter of download_url.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "force_test.png")

-
-
-
-
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert os.path.exists(dest_filename)
+        initial_mtime = os.path.getmtime(dest_filename)
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=True)
+        assert os.path.getmtime(dest_filename) == initial_mtime
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, force_download=True, verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_url_escape_spaces(self):
+        """
+        Test download_url with spaces in the URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "escape_test.png")
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, escape_spaces=True, verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_relative_filename(self):
+        """
+        Test download_relative_filename.
+        """
+
+        output_base = os.path.join(self.download_target_dir, "relative_dl")
+        returned_filename = download_relative_filename(RELATIVE_DOWNLOAD_URL, output_base, verbose=False)
+        assert RELATIVE_DOWNLOAD_CONTAIN_TOKEN in returned_filename
+        assert RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN not in returned_filename
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 100
+
+
+    def test_parallel_download_urls(self):
+        """
+        Test parallel_download_urls (with n_workers=1 for simplicity).
+        """
+
+        url1_target = os.path.join(self.download_target_dir, "parallel_dl_1.png")
+        url2_target = os.path.join(self.download_target_dir, "parallel_dl_2_nonexistent.html")
+
+        url_to_target_file = {
+            SMALL_FILE_URL: url1_target,
+            NON_EXISTENT_URL: url2_target
+        }
+
+        results = parallel_download_urls(url_to_target_file, n_workers=1, verbose=False)
+
+        assert len(results) == 2
+
+        status_map = {res['url']: res for res in results}
+
+        assert status_map[SMALL_FILE_URL]['status'] == 'success'
+        assert status_map[SMALL_FILE_URL]['target_file'] == url1_target
+        assert os.path.exists(url1_target)
+
+        assert status_map[NON_EXISTENT_URL]['status'].startswith('error: HTTP Error 404')
+        assert status_map[NON_EXISTENT_URL]['target_file'] == url2_target
+        assert not os.path.exists(url2_target)
+
+        if not os.path.exists(url1_target):
+            download_url(SMALL_FILE_URL, url1_target, verbose=False)
+        results_skip = parallel_download_urls({SMALL_FILE_URL: url1_target}, n_workers=1, overwrite=False, verbose=True)
+        assert results_skip[0]['status'] == 'skipped'

-
+        results_overwrite = parallel_download_urls({SMALL_FILE_URL: url1_target}, n_workers=1, overwrite=True, verbose=False)
+        assert results_overwrite[0]['status'] == 'success'
+
+
+    def test_test_url_and_test_urls(self):
+        """
+        Test test_url and test_urls functions.
+        """
+
+        assert test_url(SMALL_FILE_URL, error_on_failure=False, timeout=10) == 200
+        assert test_url(REDIRECT_SRC_URL, error_on_failure=False, timeout=10) in (200,301)
+
+        status_non_existent = test_url(NON_EXISTENT_URL, error_on_failure=False, timeout=5)
+        assert status_non_existent == 404
+
+        try:
+            test_url(NON_EXISTENT_URL, error_on_failure=True, timeout=5)
+            assert False, "ValueError not raised for NON_EXISTENT_URL"
+        except ValueError:
+            pass
+
+        try:
+            test_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL, error_on_failure=True, timeout=5)
+            assert False, "requests.exceptions.ConnectionError or urllib.error.URLError not raised"
+        except requests.exceptions.ConnectionError:
+            pass
+        except urllib.error.URLError:
+            pass
+
+
+        urls_to_test = [SMALL_FILE_URL, NON_EXISTENT_URL]
+        status_codes = test_urls(urls_to_test, error_on_failure=False, n_workers=1, timeout=10)
+        assert len(status_codes) == 2
+        assert status_codes[0] == 200
+        assert status_codes[1] == 404
+
+        try:
+            test_urls(urls_to_test, error_on_failure=True, n_workers=1, timeout=5)
+            assert False, "ValueError not raised for urls_to_test"
+        except ValueError:
+            pass
+
+        good_urls = [SMALL_FILE_URL, REDIRECT_SRC_URL]
+        good_status_codes = test_urls(good_urls, error_on_failure=True, n_workers=1, timeout=10)
+        assert good_status_codes == [200, 200]
+
+
+    def test_get_url_size_and_sizes(self):
+        """
+        Test get_url_size and get_url_sizes functions.
+        """
+
+        size = get_url_size(SMALL_FILE_URL, timeout=10)
+        assert size is not None
+        assert size > 1000
+
+        size_dynamic = get_url_size(REDIRECT_DEST_URL, timeout=10, verbose=True)
+        if size_dynamic is not None:
+            assert isinstance(size_dynamic, int)
+
+        size_non_existent = get_url_size(NON_EXISTENT_URL, timeout=5)
+        assert size_non_existent is None
+
+        size_bad_domain = get_url_size(DEFINITELY_NON_EXISTENT_DOMAIN_URL, timeout=5)
+        assert size_bad_domain is None
+
+        urls_for_size = [SMALL_FILE_URL, NON_EXISTENT_URL, REDIRECT_DEST_URL]
+        sizes_map = get_url_sizes(urls_for_size, n_workers=1, timeout=10)
+
+        assert SMALL_FILE_URL in sizes_map
+        assert sizes_map[SMALL_FILE_URL] == size
+
+        assert NON_EXISTENT_URL in sizes_map
+        assert sizes_map[NON_EXISTENT_URL] is None
+
+        assert REDIRECT_DEST_URL in sizes_map
+        assert sizes_map[REDIRECT_DEST_URL] == size_dynamic
+
+
+def test_url_utils():
+    """
+    Runs all tests in the TestUrlUtils class.
+    """
+
+    test_instance = TestUrlUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_download_url_to_specified_file()
+        test_instance.test_download_url_to_temp_file()
+        test_instance.test_download_url_non_existent()
+        test_instance.test_download_url_force_download()
+        test_instance.test_download_url_escape_spaces()
+        test_instance.test_download_relative_filename()
+        test_instance.test_parallel_download_urls()
+        test_instance.test_test_url_and_test_urls()
+        test_instance.test_get_url_size_and_sizes()
+    finally:
+        test_instance.tear_down()
+
+# from IPython import embed; embed()
+# test_url_utils()