megadetector-5.0.28-py3-none-any.whl → megadetector-5.0.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic.

Files changed (176)
  1. megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
  2. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
  3. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  7. megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
  8. megadetector/classification/aggregate_classifier_probs.py +3 -3
  9. megadetector/classification/analyze_failed_images.py +5 -5
  10. megadetector/classification/cache_batchapi_outputs.py +5 -5
  11. megadetector/classification/create_classification_dataset.py +11 -12
  12. megadetector/classification/crop_detections.py +10 -10
  13. megadetector/classification/csv_to_json.py +8 -8
  14. megadetector/classification/detect_and_crop.py +13 -15
  15. megadetector/classification/evaluate_model.py +7 -7
  16. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  17. megadetector/classification/json_to_azcopy_list.py +1 -1
  18. megadetector/classification/json_validator.py +29 -32
  19. megadetector/classification/map_classification_categories.py +9 -9
  20. megadetector/classification/merge_classification_detection_output.py +12 -9
  21. megadetector/classification/prepare_classification_script.py +19 -19
  22. megadetector/classification/prepare_classification_script_mc.py +23 -23
  23. megadetector/classification/run_classifier.py +4 -4
  24. megadetector/classification/save_mislabeled.py +6 -6
  25. megadetector/classification/train_classifier.py +1 -1
  26. megadetector/classification/train_classifier_tf.py +9 -9
  27. megadetector/classification/train_utils.py +10 -10
  28. megadetector/data_management/annotations/annotation_constants.py +1 -1
  29. megadetector/data_management/camtrap_dp_to_coco.py +45 -45
  30. megadetector/data_management/cct_json_utils.py +101 -101
  31. megadetector/data_management/cct_to_md.py +49 -49
  32. megadetector/data_management/cct_to_wi.py +33 -33
  33. megadetector/data_management/coco_to_labelme.py +75 -75
  34. megadetector/data_management/coco_to_yolo.py +189 -189
  35. megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
  36. megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
  37. megadetector/data_management/databases/integrity_check_json_db.py +202 -188
  38. megadetector/data_management/databases/subset_json_db.py +33 -33
  39. megadetector/data_management/generate_crops_from_cct.py +38 -38
  40. megadetector/data_management/get_image_sizes.py +54 -49
  41. megadetector/data_management/labelme_to_coco.py +130 -124
  42. megadetector/data_management/labelme_to_yolo.py +78 -72
  43. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  44. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  45. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  46. megadetector/data_management/lila/download_lila_subset.py +21 -24
  47. megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
  48. megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
  49. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  50. megadetector/data_management/lila/lila_common.py +70 -70
  51. megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
  52. megadetector/data_management/mewc_to_md.py +339 -340
  53. megadetector/data_management/ocr_tools.py +258 -252
  54. megadetector/data_management/read_exif.py +231 -224
  55. megadetector/data_management/remap_coco_categories.py +26 -26
  56. megadetector/data_management/remove_exif.py +31 -20
  57. megadetector/data_management/rename_images.py +187 -187
  58. megadetector/data_management/resize_coco_dataset.py +41 -41
  59. megadetector/data_management/speciesnet_to_md.py +41 -41
  60. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  61. megadetector/data_management/yolo_output_to_md_output.py +117 -120
  62. megadetector/data_management/yolo_to_coco.py +195 -188
  63. megadetector/detection/change_detection.py +831 -0
  64. megadetector/detection/process_video.py +340 -337
  65. megadetector/detection/pytorch_detector.py +304 -262
  66. megadetector/detection/run_detector.py +177 -164
  67. megadetector/detection/run_detector_batch.py +364 -363
  68. megadetector/detection/run_inference_with_yolov5_val.py +328 -325
  69. megadetector/detection/run_tiled_inference.py +256 -249
  70. megadetector/detection/tf_detector.py +24 -24
  71. megadetector/detection/video_utils.py +290 -282
  72. megadetector/postprocessing/add_max_conf.py +15 -11
  73. megadetector/postprocessing/categorize_detections_by_size.py +44 -44
  74. megadetector/postprocessing/classification_postprocessing.py +415 -415
  75. megadetector/postprocessing/combine_batch_outputs.py +20 -21
  76. megadetector/postprocessing/compare_batch_results.py +528 -517
  77. megadetector/postprocessing/convert_output_format.py +97 -97
  78. megadetector/postprocessing/create_crop_folder.py +219 -146
  79. megadetector/postprocessing/detector_calibration.py +173 -168
  80. megadetector/postprocessing/generate_csv_report.py +508 -499
  81. megadetector/postprocessing/load_api_results.py +23 -20
  82. megadetector/postprocessing/md_to_coco.py +129 -98
  83. megadetector/postprocessing/md_to_labelme.py +89 -83
  84. megadetector/postprocessing/md_to_wi.py +40 -40
  85. megadetector/postprocessing/merge_detections.py +87 -114
  86. megadetector/postprocessing/postprocess_batch_results.py +313 -298
  87. megadetector/postprocessing/remap_detection_categories.py +36 -36
  88. megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
  89. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  90. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  91. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
  92. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  93. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  94. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  95. megadetector/postprocessing/validate_batch_results.py +70 -70
  96. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  97. megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
  98. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
  99. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
  100. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  101. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  102. megadetector/taxonomy_mapping/species_lookup.py +33 -33
  103. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  104. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  105. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  106. megadetector/utils/azure_utils.py +22 -22
  107. megadetector/utils/ct_utils.py +1018 -200
  108. megadetector/utils/directory_listing.py +21 -77
  109. megadetector/utils/gpu_test.py +22 -22
  110. megadetector/utils/md_tests.py +541 -518
  111. megadetector/utils/path_utils.py +1457 -398
  112. megadetector/utils/process_utils.py +41 -41
  113. megadetector/utils/sas_blob_utils.py +53 -49
  114. megadetector/utils/split_locations_into_train_val.py +61 -61
  115. megadetector/utils/string_utils.py +147 -26
  116. megadetector/utils/url_utils.py +463 -173
  117. megadetector/utils/wi_utils.py +2629 -2526
  118. megadetector/utils/write_html_image_list.py +137 -137
  119. megadetector/visualization/plot_utils.py +21 -21
  120. megadetector/visualization/render_images_with_thumbnails.py +37 -73
  121. megadetector/visualization/visualization_utils.py +401 -397
  122. megadetector/visualization/visualize_db.py +197 -190
  123. megadetector/visualization/visualize_detector_output.py +79 -73
  124. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
  125. megadetector-5.0.29.dist-info/RECORD +163 -0
  126. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
  127. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
  128. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
  129. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  130. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  131. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  132. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  133. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  134. megadetector/data_management/importers/awc_to_json.py +0 -191
  135. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  136. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  137. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  138. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  139. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  140. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  141. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  142. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  143. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  144. megadetector/data_management/importers/ena24_to_json.py +0 -276
  145. megadetector/data_management/importers/filenames_to_json.py +0 -386
  146. megadetector/data_management/importers/helena_to_cct.py +0 -283
  147. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  148. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  149. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  150. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  151. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  152. megadetector/data_management/importers/missouri_to_json.py +0 -490
  153. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  154. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  155. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  156. megadetector/data_management/importers/pc_to_json.py +0 -365
  157. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  158. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  159. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  160. megadetector/data_management/importers/rspb_to_json.py +0 -356
  161. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  162. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  163. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  164. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  165. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  166. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  167. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  168. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  169. megadetector/data_management/importers/ubc_to_json.py +0 -399
  170. megadetector/data_management/importers/umn_to_json.py +0 -507
  171. megadetector/data_management/importers/wellington_to_json.py +0 -263
  172. megadetector/data_management/importers/wi_to_json.py +0 -442
  173. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  174. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  175. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  176. megadetector-5.0.28.dist-info/RECORD +0 -209
megadetector/utils/url_utils.py

@@ -11,8 +11,12 @@ Frequently-used functions for downloading or manipulating URLs
 import os
 import re
 import urllib
+import urllib.request
+import urllib.error
 import tempfile
-import requests
+import requests
+import shutil
+import pytest
 
 from functools import partial
 from tqdm import tqdm
@@ -20,111 +24,105 @@ from urllib.parse import urlparse
 from multiprocessing.pool import ThreadPool
 from multiprocessing.pool import Pool
 
-url_utils_temp_dir = None
+from megadetector.utils.ct_utils import make_test_folder
+from megadetector.utils.ct_utils import make_temp_folder
+
 max_path_len = 255
 
 
 #%% Download functions
 
-class DownloadProgressBar():
+class DownloadProgressBar:
     """
     Progress updater based on the progressbar2 package.
-
+
     https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
     """
-
+
+
     def __init__(self):
+
         self.pbar = None
 
-    def __call__(self, block_num, block_size, total_size):
-        if not self.pbar:
-            # This is a pretty random import I'd rather not depend on outside of the
-            # rare case where it's used, so importing locally
-            # pip install progressbar2
-            import progressbar
-            self.pbar = progressbar.ProgressBar(max_value=total_size)
-            self.pbar.start()
-
-        downloaded = block_num * block_size
-        if downloaded < total_size:
-            self.pbar.update(downloaded)
-        else:
-            self.pbar.finish()
-
 
-def get_temp_folder(preferred_name='url_utils'):
-    """
-    Gets a temporary folder for use within this module.
-
-    Args:
-        preferred_name (str, optional): subfolder to use within the system temp folder
-
-    Returns:
-        str: the full path to the temporary subfolder
-    """
-    global url_utils_temp_dir
-
-    if url_utils_temp_dir is None:
-        url_utils_temp_dir = os.path.join(tempfile.gettempdir(),preferred_name)
-        os.makedirs(url_utils_temp_dir,exist_ok=True)
-
-    return url_utils_temp_dir
-
-
-def download_url(url,
-                 destination_filename=None,
-                 progress_updater=None,
-                 force_download=False,
+    def __call__(self, block_num, block_size, total_size): # noqa
+
+        if not self.pbar:
+            try:
+                import progressbar # type: ignore
+                self.pbar = progressbar.ProgressBar(max_value=total_size)
+                self.pbar.start()
+            except ImportError:
+                self.pbar = None
+                # print("ProgressBar not available, install 'progressbar2' for visual progress.")
+
+        if self.pbar:
+            downloaded = block_num * block_size
+            if downloaded < total_size:
+                self.pbar.update(downloaded)
+            else:
+                self.pbar.finish()
+
+
+def download_url(url,
+                 destination_filename=None,
+                 progress_updater=None,
+                 force_download=False,
                  verbose=True,
                  escape_spaces=True):
     """
-    Downloads a URL to a file. If no file is specified, creates a temporary file,
+    Downloads a URL to a file. If no file is specified, creates a temporary file,
     making a best effort to avoid filename collisions.
-
+
     Prints some diagnostic information and makes sure to omit SAS tokens from printouts.
-
+
     Args:
         url (str): the URL to download
         destination_filename (str, optional): the target filename; if None, will create
-            a file in system temp space
-        progress_updater (object or bool, optional): can be "None", "False", "True", or a
-            specific callable object. If None or False, no progress updated will be
+            a file in system temp space
+        progress_updater (object or bool, optional): can be "None", "False", "True", or a
+            specific callable object. If None or False, no progress updated will be
             displayed. If True, a default progress bar will be created.
         force_download (bool, optional): download this file even if [destination_filename]
             exists.
         verbose (bool, optional): enable additional debug console output
         escape_spaces (bool, optional): replace ' ' with '%20'
-
+
     Returns:
         str: the filename to which [url] was downloaded, the same as [destination_filename]
            if [destination_filename] was not None
     """
-
+
     if progress_updater is not None and isinstance(progress_updater,bool):
         if not progress_updater:
             progress_updater = None
         else:
             progress_updater = DownloadProgressBar()
-
+
     url_no_sas = url.split('?')[0]
-
+
     if destination_filename is None:
-
-        target_folder = get_temp_folder()
+
+        target_folder = make_temp_folder(subfolder='url_utils',append_guid=False)
         url_without_sas = url.split('?', 1)[0]
-
+
         # This does not guarantee uniqueness, hence "semi-best-effort"
         url_as_filename = re.sub(r'\W+', '', url_without_sas)
-        n_folder_chars = len(url_utils_temp_dir)
-        if len(url_as_filename) + n_folder_chars > max_path_len:
+
+        n_folder_chars = len(target_folder)
+
+        if (len(url_as_filename) + n_folder_chars) >= max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
-            url_as_filename = url_as_filename[-1*(max_path_len-n_folder_chars):]
+            max_fn_len = max_path_len - (n_folder_chars + 1)
+            url_as_filename = url_as_filename[-1 * max_fn_len:]
         destination_filename = \
             os.path.join(target_folder,url_as_filename)
-
+
+    # ...if the destination filename wasn't specified
+
     if escape_spaces:
         url = url.replace(' ','%20')
-
+
     if (not force_download) and (os.path.isfile(destination_filename)):
         if verbose:
             print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url_no_sas)))
@@ -133,12 +131,12 @@ def download_url(url,
             print('Downloading file {} to {}'.format(os.path.basename(url_no_sas),destination_filename),end='')
         target_dir = os.path.dirname(destination_filename)
         os.makedirs(target_dir,exist_ok=True)
-        urllib.request.urlretrieve(url, destination_filename, progress_updater)
+        urllib.request.urlretrieve(url, destination_filename, progress_updater)
         assert(os.path.isfile(destination_filename))
-        nBytes = os.path.getsize(destination_filename)
+        n_bytes = os.path.getsize(destination_filename)
         if verbose:
-            print('...done, {} bytes.'.format(nBytes))
-
+            print('...done, {} bytes.'.format(n_bytes))
+
     return destination_filename
 
 # ...def download_url(...)
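
For reference, a minimal illustrative sketch (not part of the package diff) of calling the updated download_url; the signature and defaults are taken from the hunks above, the import path follows the megadetector/utils/url_utils.py entry in the file list, and the URL is a placeholder:

from megadetector.utils.url_utils import download_url

# With destination_filename=None, the new code writes to a temp folder created by
# make_temp_folder(subfolder='url_utils', append_guid=False) instead of the old
# module-level url_utils_temp_dir.
local_path = download_url('https://example.com/camera/IMG_0001.JPG',
                          destination_filename=None,
                          progress_updater=True,  # True -> default DownloadProgressBar
                          force_download=False,
                          verbose=True,
                          escape_spaces=True)
print(local_path)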
@@ -146,24 +144,24 @@ def download_url(url,
 
 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path. Path is relative to
+    Download a URL to output_base, preserving relative path. Path is relative to
     the site, so:
-
+
         https://abc.com/xyz/123.txt
-
+
     ...will get downloaded to:
-
-        output_base/xyz/123.txt
-
+
+        output_base/xyz/123.txt
+
     Args:
         url (str): the URL to download
         output_base (str): the base folder to which we should download this file
         verbose (bool, optional): enable additional debug console output
-
+
     Returns:
         str: the local destination filename
     """
-
+
     p = urlparse(url)
     # remove the leading '/'
     assert p.path.startswith('/'); relative_filename = p.path[1:]
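
Illustrative only (not part of the diff): per the docstring above, download_relative_filename mirrors a URL's path under output_base, so a URL like the docstring's https://abc.com/xyz/123.txt lands at output_base/xyz/123.txt. A minimal sketch with placeholder paths:

from megadetector.utils.url_utils import download_relative_filename

# Downloads https://abc.com/xyz/123.txt (placeholder URL) to /tmp/mirror/xyz/123.txt
local_file = download_relative_filename('https://abc.com/xyz/123.txt',
                                        output_base='/tmp/mirror',
                                        verbose=True)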
@@ -177,40 +175,40 @@ def _do_parallelized_download(download_info,overwrite=False,verbose=False):
     """
     Internal function for download parallelization.
     """
-
+
     url = download_info['url']
     target_file = download_info['target_file']
     result = {'status':'unknown','url':url,'target_file':target_file}
-
+
     if ((os.path.isfile(target_file)) and (not overwrite)):
         if verbose:
             print('Skipping existing file {}'.format(target_file))
         result['status'] = 'skipped'
         return result
     try:
-        download_url(url=url,
+        download_url(url=url,
                      destination_filename=target_file,
-                     verbose=verbose,
+                     verbose=verbose,
                      force_download=overwrite)
     except Exception as e:
         print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
+            url,str(e)))
         result['status'] = 'error: {}'.format(str(e))
         return result
-
+
     result['status'] = 'success'
     return result
 
 # ...def _do_parallelized_download(...)
 
 
-def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
-                           n_workers=20,pool_type='thread'):
+def parallel_download_urls(url_to_target_file, verbose=False, overwrite=False,
+                           n_workers=20, pool_type='thread'):
     """
     Downloads a list of URLs to local files.
-
-    Catches exceptions and reports them in the returned "results" array.
-
+
+    Catches exceptions and reports them in the returned "results" array.
+
     Args:
         url_to_target_file: a dict mapping URLs to local filenames.
         verbose (bool, optional): enable additional debug console output
@@ -218,82 +216,95 @@ def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-
+
     Returns:
         list: list of dicts with keys:
             - 'url': the url this item refers to
             - 'status': 'skipped', 'success', or a string starting with 'error'
-            - 'target_file': the local filename to which we downloaded (or tried to
-              download) this URL
+            - 'target_file': the local filename to which we downloaded (or tried to
+              download) this URL
     """
-
+
     all_download_info = []
-
-    print('Preparing download list')
-    for url in tqdm(url_to_target_file):
+
+    if verbose:
+        print('Preparing download list')
+    for url in tqdm(url_to_target_file, disable=(not verbose)):
         download_info = {}
         download_info['url'] = url
         download_info['target_file'] = url_to_target_file[url]
         all_download_info.append(download_info)
-
-    print('Downloading {} images on {} workers'.format(
-        len(all_download_info),n_workers))
+
+    if verbose:
+        print('Downloading {} images on {} workers'.format(
+            len(all_download_info),n_workers))
 
     if n_workers <= 1:
 
         results = []
-
-        for download_info in tqdm(all_download_info):
+
+        for download_info in tqdm(all_download_info, disable=(not verbose)):
             result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
             results.append(result)
-
+
     else:
 
-        if pool_type == 'thread':
-            pool = ThreadPool(n_workers)
-        else:
-            assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
-            pool = Pool(n_workers)
-
-        print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
-
-        results = list(tqdm(pool.imap(
-            partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
-            all_download_info), total=len(all_download_info)))
-
+        pool = None
+
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            results = list(tqdm(pool.imap(
+                partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
+                all_download_info), total=len(all_download_info), disable=(not verbose)))
+
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print("Pool closed and joined for parallel URL downloads")
+
     return results
 
 # ...def parallel_download_urls(...)
 
 
+@pytest.mark.skip(reason="This is not a test function")
 def test_url(url,error_on_failure=True,timeout=None):
     """
     Tests the availability of [url], returning an http status code.
-
+
     Args:
         url (str): URL to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
             error code) if accessing this URL fails
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: http status code (200 for success)
     """
-
-    # r = requests.get(url, stream=True, verify=True, timeout=timeout)
+
     r = requests.head(url, stream=True, verify=True, timeout=timeout)
-
-    if error_on_failure and r.status_code != 200:
+
+    if error_on_failure and r.status_code != 200:
         raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
     return r.status_code
-
 
-def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None):
+
+@pytest.mark.skip(reason="This is not a test function")
+def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None,verbose=False):
     """
     Verify that URLs are available (i.e., returns status 200). By default,
-    errors if any URL is unavailable.
-
+    errors if any URL is unavailable.
+
     Args:
         urls (list): list of URLs to test
         error_on_failure (bool, optional): whether to error (vs. just returning an
@@ -301,39 +312,48 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         list: a list of http status codes, the same length and order as [urls]
     """
-
+
     if n_workers <= 1:
 
         status_codes = []
-
-        for url in tqdm(urls):
-
+
+        for url in tqdm(urls,disable=(not verbose)):
+
             r = requests.get(url, timeout=timeout)
-
-            if error_on_failure and r.status_code != 200:
+
+            if error_on_failure and r.status_code != 200:
                 raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
             status_codes.append(r.status_code)
-
+
     else:
 
-        if pool_type == 'thread':
-            pool = ThreadPool(n_workers)
-        else:
-            assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
-            pool = Pool(n_workers)
-
-        print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
-
-        status_codes = list(tqdm(pool.imap(
-            partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
-            urls), total=len(urls)))
-
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            status_codes = list(tqdm(pool.imap(
+                partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL tests')
+
     return status_codes
 
 # ...def test_urls(...)
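
As a reference point for the two functions reworked above (worker pools are now created in a try/finally and always closed and joined, and progress output is gated on verbose), here is a short illustrative sketch, not part of the package diff; the URLs and paths are placeholders:

from megadetector.utils.url_utils import parallel_download_urls, test_urls

url_to_target_file = {
    'https://example.com/a.jpg': '/tmp/downloads/a.jpg',
    'https://example.com/b.jpg': '/tmp/downloads/b.jpg'
}

# Each result dict carries 'url', 'status' ('skipped', 'success', or 'error: ...'),
# and 'target_file'; n_workers <= 1 disables parallelization.
results = parallel_download_urls(url_to_target_file,
                                 verbose=True,
                                 overwrite=False,
                                 n_workers=4,
                                 pool_type='thread')
failed = [r for r in results if r['status'].startswith('error')]

# test_urls now accepts a verbose argument and returns one status code per URL,
# raising ValueError on the first non-200 response when error_on_failure=True.
status_codes = test_urls(list(url_to_target_file.keys()),
                         error_on_failure=False,
                         n_workers=1,
                         timeout=10,
                         verbose=True)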
@@ -341,16 +361,16 @@ def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=
 
 def get_url_size(url,verbose=False,timeout=None):
     """
-    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
-    URL is not available, or the Content-Length property is not available, or the content-Length
-    property is not an integer, returns None.
-
+    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
+    URL is not available, or the Content-Length property is not available, or the content-Length
+    property is not an integer, returns None.
+
     Args:
         url (str): the url to test
         verbose (bool, optional): enable additional debug output
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
             access attempt to be a failure; see requests.head() for precise documentation
-
+
     Returns:
         int: the file size in bytes, or None if it can't be retrieved
     """
@@ -362,13 +382,18 @@ def get_url_size(url,verbose=False,timeout=None):
             if verbose:
                 print('Status {} retrieving file size for {}'.format(f.status,url))
             return None
-        size_bytes = int(f.headers['Content-Length'])
+        size_bytes_str = f.headers.get('Content-Length')
+        if size_bytes_str is None:
+            if verbose:
+                print('No Content-Length header for {}'.format(url))
+            return None
+        size_bytes = int(size_bytes_str)
         return size_bytes
     except Exception as e:
         if verbose:
             print('Error retrieving file size for {}:\n{}'.format(url,str(e)))
         return None
-
+
 # ...def get_url_size(...)
 
 
@@ -376,45 +401,310 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     """
     Retrieve file sizes for the URLs specified by [urls]. Returns None for any URLs
     that we can't access, or URLs for which the Content-Length property is not set.
-
+
     Args:
         urls (list): list of URLs for which we should retrieve sizes
         n_workers (int, optional): number of concurrent workers, set to <=1 to disable
             parallelization
         pool_type (str, optional): worker type to use; should be 'thread' or 'process'
-        timeout (int, optional): timeout in seconds to wait before considering this
+        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation
         verbose (bool, optional): print additional debug information
-
+
     Returns:
         dict: maps urls to file sizes, which will be None for URLs for which we were unable
-            to retrieve a valid size.
+            to retrieve a valid size.
     """
-
+
     url_to_size = {}
-
-    if n_workers <= 1:
-
-        for url in tqdm(urls):
+
+    if n_workers <= 1:
+
+        for url in tqdm(urls, disable=(not verbose)):
             url_to_size[url] = get_url_size(url,verbose=verbose,timeout=timeout)
-
+
     else:
 
-        if pool_type == 'thread':
-            pool = ThreadPool(n_workers)
-        else:
-            assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
-            pool = Pool(n_workers)
+        pool = None
+        try:
+            if pool_type == 'thread':
+                pool = ThreadPool(n_workers)
+            else:
+                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
+                pool = Pool(n_workers)
+
+            if verbose:
+                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+            file_sizes = list(tqdm(pool.imap(
+                partial(get_url_size,verbose=verbose,timeout=timeout),
+                urls), total=len(urls), disable=(not verbose)))
+
+            for i_url,url in enumerate(urls):
+                url_to_size[url] = file_sizes[i_url]
+        finally:
+            if pool:
+                pool.close()
+                pool.join()
+                print('Pool closed and joined for URL size checks')
+
+    return url_to_size
+
+
+#%% Tests
+
+# Constants for tests
+
+SMALL_FILE_URL = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
+REDIRECT_SRC_URL = "http://google.com"
+REDIRECT_DEST_URL = "https://www.google.com/"
+NON_EXISTENT_URL = "https://example.com/non_existent_page_404.html"
+DEFINITELY_NON_EXISTENT_DOMAIN_URL = "https://thisshouldnotexist1234567890.com/file.txt"
+RELATIVE_DOWNLOAD_URL = "https://raw.githubusercontent.com/agentmorris/MegaDetector/main/README.md"
+RELATIVE_DOWNLOAD_CONTAIN_TOKEN = 'agentmorris'
+RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN = 'github'
+
+
+class TestUrlUtils:
+    """
+    Tests for url_utils.py
+    """
+
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='url_utils_tests')
+        self.download_target_dir = os.path.join(self.test_dir, 'downloads')
+        os.makedirs(self.download_target_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests and restore module temp_dir.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
+    def test_download_url_to_specified_file(self):
+        """
+        Test download_url with a specified destination filename.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "downloaded_google_logo.png")
+        returned_filename = download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert returned_filename == dest_filename
+        assert os.path.exists(dest_filename)
+        assert os.path.getsize(dest_filename) > 1000
+
+
+    def test_download_url_to_temp_file(self):
+        """
+        Test download_url when destination_filename is None.
+        """
+
+        returned_filename = download_url(SMALL_FILE_URL, destination_filename=None, verbose=False)
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 1000
 
-        print('Starting a {} pool with {} workers'.format(pool_type,n_workers))
+
+    def test_download_url_non_existent(self):
+        """
+        Test download_url with a non-existent URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "non_existent.html")
+        try:
+            download_url(NON_EXISTENT_URL, destination_filename=dest_filename, verbose=False)
+            assert False, "urllib.error.HTTPError not raised for 404"
+        except urllib.error.HTTPError:
+            pass
 
-        file_sizes = list(tqdm(pool.imap(
-            partial(get_url_size,verbose=verbose,timeout=timeout),
-            urls), total=len(urls)))
+        try:
+            download_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL, destination_filename=dest_filename, verbose=False)
+            assert False, \
+                "urllib.error.URLError or requests.exceptions.ConnectionError not raised for DNS failure"
+        except urllib.error.URLError:
+            pass
+        except requests.exceptions.ConnectionError:
+            pass
+
+
+    def test_download_url_force_download(self):
+        """
+        Test the force_download parameter of download_url.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "force_test.png")
 
-        for i_url,url in enumerate(urls):
-            url_to_size[url] = file_sizes[i_url]
-
-    return url_to_size
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
+        assert os.path.exists(dest_filename)
+        initial_mtime = os.path.getmtime(dest_filename)
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=True)
+        assert os.path.getmtime(dest_filename) == initial_mtime
+
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, force_download=True, verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_url_escape_spaces(self):
+        """
+        Test download_url with spaces in the URL.
+        """
+
+        dest_filename = os.path.join(self.download_target_dir, "escape_test.png")
+        download_url(SMALL_FILE_URL, destination_filename=dest_filename, escape_spaces=True, verbose=False)
+        assert os.path.exists(dest_filename)
+
+
+    def test_download_relative_filename(self):
+        """
+        Test download_relative_filename.
+        """
+
+        output_base = os.path.join(self.download_target_dir, "relative_dl")
+        returned_filename = download_relative_filename(RELATIVE_DOWNLOAD_URL, output_base, verbose=False)
+        assert RELATIVE_DOWNLOAD_CONTAIN_TOKEN in returned_filename
+        assert RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN not in returned_filename
+        assert os.path.exists(returned_filename)
+        assert os.path.getsize(returned_filename) > 100
+
+
+    def test_parallel_download_urls(self):
+        """
+        Test parallel_download_urls (with n_workers=1 for simplicity).
+        """
+
+        url1_target = os.path.join(self.download_target_dir, "parallel_dl_1.png")
+        url2_target = os.path.join(self.download_target_dir, "parallel_dl_2_nonexistent.html")
+
+        url_to_target_file = {
+            SMALL_FILE_URL: url1_target,
+            NON_EXISTENT_URL: url2_target
+        }
+
+        results = parallel_download_urls(url_to_target_file, n_workers=1, verbose=False)
+
+        assert len(results) == 2
+
+        status_map = {res['url']: res for res in results}
+
+        assert status_map[SMALL_FILE_URL]['status'] == 'success'
+        assert status_map[SMALL_FILE_URL]['target_file'] == url1_target
+        assert os.path.exists(url1_target)
+
+        assert status_map[NON_EXISTENT_URL]['status'].startswith('error: HTTP Error 404')
+        assert status_map[NON_EXISTENT_URL]['target_file'] == url2_target
+        assert not os.path.exists(url2_target)
+
+        if not os.path.exists(url1_target):
+            download_url(SMALL_FILE_URL, url1_target, verbose=False)
+        results_skip = parallel_download_urls({SMALL_FILE_URL: url1_target}, n_workers=1, overwrite=False, verbose=True)
+        assert results_skip[0]['status'] == 'skipped'
 
-# ...get_url_sizes(...)
+        results_overwrite = parallel_download_urls({SMALL_FILE_URL: url1_target}, n_workers=1, overwrite=True, verbose=False)
+        assert results_overwrite[0]['status'] == 'success'
+
+
+    def test_test_url_and_test_urls(self):
+        """
+        Test test_url and test_urls functions.
+        """
+
+        assert test_url(SMALL_FILE_URL, error_on_failure=False, timeout=10) == 200
+        assert test_url(REDIRECT_SRC_URL, error_on_failure=False, timeout=10) in (200,301)
+
+        status_non_existent = test_url(NON_EXISTENT_URL, error_on_failure=False, timeout=5)
+        assert status_non_existent == 404
+
+        try:
+            test_url(NON_EXISTENT_URL, error_on_failure=True, timeout=5)
+            assert False, "ValueError not raised for NON_EXISTENT_URL"
+        except ValueError:
+            pass
+
+        try:
+            test_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL, error_on_failure=True, timeout=5)
+            assert False, "requests.exceptions.ConnectionError or urllib.error.URLError not raised"
+        except requests.exceptions.ConnectionError:
+            pass
+        except urllib.error.URLError:
+            pass
+
+
+        urls_to_test = [SMALL_FILE_URL, NON_EXISTENT_URL]
+        status_codes = test_urls(urls_to_test, error_on_failure=False, n_workers=1, timeout=10)
+        assert len(status_codes) == 2
+        assert status_codes[0] == 200
+        assert status_codes[1] == 404
+
+        try:
+            test_urls(urls_to_test, error_on_failure=True, n_workers=1, timeout=5)
+            assert False, "ValueError not raised for urls_to_test"
+        except ValueError:
+            pass
+
+        good_urls = [SMALL_FILE_URL, REDIRECT_SRC_URL]
+        good_status_codes = test_urls(good_urls, error_on_failure=True, n_workers=1, timeout=10)
+        assert good_status_codes == [200, 200]
+
+
+    def test_get_url_size_and_sizes(self):
+        """
+        Test get_url_size and get_url_sizes functions.
+        """
+
+        size = get_url_size(SMALL_FILE_URL, timeout=10)
+        assert size is not None
+        assert size > 1000
+
+        size_dynamic = get_url_size(REDIRECT_DEST_URL, timeout=10, verbose=True)
+        if size_dynamic is not None:
+            assert isinstance(size_dynamic, int)
+
+        size_non_existent = get_url_size(NON_EXISTENT_URL, timeout=5)
+        assert size_non_existent is None
+
+        size_bad_domain = get_url_size(DEFINITELY_NON_EXISTENT_DOMAIN_URL, timeout=5)
+        assert size_bad_domain is None
+
+        urls_for_size = [SMALL_FILE_URL, NON_EXISTENT_URL, REDIRECT_DEST_URL]
+        sizes_map = get_url_sizes(urls_for_size, n_workers=1, timeout=10)
+
+        assert SMALL_FILE_URL in sizes_map
+        assert sizes_map[SMALL_FILE_URL] == size
+
+        assert NON_EXISTENT_URL in sizes_map
+        assert sizes_map[NON_EXISTENT_URL] is None
+
+        assert REDIRECT_DEST_URL in sizes_map
+        assert sizes_map[REDIRECT_DEST_URL] == size_dynamic
+
+
+def test_url_utils():
+    """
+    Runs all tests in the TestUrlUtils class.
+    """
+
+    test_instance = TestUrlUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_download_url_to_specified_file()
+        test_instance.test_download_url_to_temp_file()
+        test_instance.test_download_url_non_existent()
+        test_instance.test_download_url_force_download()
+        test_instance.test_download_url_escape_spaces()
+        test_instance.test_download_relative_filename()
+        test_instance.test_parallel_download_urls()
+        test_instance.test_test_url_and_test_urls()
+        test_instance.test_get_url_size_and_sizes()
+    finally:
+        test_instance.tear_down()
+
+# from IPython import embed; embed()
+# test_url_utils()
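
The remainder of the new lines in this file are the test constants, the TestUrlUtils class, and the test_url_utils() driver shown above. Illustrative only (not part of the diff), a minimal sketch of get_url_size/get_url_sizes and the new driver, with placeholder URLs:

from megadetector.utils import url_utils

# get_url_size returns an int (bytes) or None; a missing Content-Length header is
# now handled explicitly rather than falling through to the generic exception handler.
size_bytes = url_utils.get_url_size('https://example.com/some_file.zip', verbose=True, timeout=10)

# get_url_sizes returns a dict mapping each URL to an int or None
sizes = url_utils.get_url_sizes(['https://example.com/a.jpg', 'https://example.com/b.jpg'],
                                n_workers=1, timeout=10, verbose=True)

# The new module-level driver runs every TestUrlUtils method and removes its
# temporary folder afterwards (note that these tests hit the network).
url_utils.test_url_utils()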