megadetector-10.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
megadetector/utils/url_utils.py
@@ -0,0 +1,825 @@
"""

url_utils.py

Frequently-used functions for downloading, manipulating, or serving URLs

"""

#%% Imports and constants

import os
import re
import urllib
import urllib.request
import urllib.error
import requests
import shutil
import pytest
import socketserver
import threading
import http.server

from functools import partial
from tqdm import tqdm
from urllib.parse import urlparse
from multiprocessing.pool import ThreadPool
from multiprocessing.pool import Pool

from megadetector.utils.ct_utils import make_test_folder
from megadetector.utils.ct_utils import make_temp_folder

max_path_len = 255


#%% Download functions

class DownloadProgressBar:
    """
    Progress updater based on the progressbar2 package.

    https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
    """

    def __init__(self):

        self.pbar = None


    def __call__(self, block_num, block_size, total_size): # noqa

        if not self.pbar:
            try:
                import progressbar # type: ignore
                self.pbar = progressbar.ProgressBar(max_value=total_size)
                self.pbar.start()
            except ImportError:
                self.pbar = None
                # print("ProgressBar not available, install 'progressbar2' for visual progress.")

        if self.pbar:
            downloaded = block_num * block_size
            if downloaded < total_size:
                self.pbar.update(downloaded)
            else:
                self.pbar.finish()


def download_url(url,
                 destination_filename=None,
                 progress_updater=None,
                 force_download=False,
                 verbose=True,
                 escape_spaces=True):
    """
    Downloads a URL to a file. If no file is specified, creates a temporary file,
    making a best effort to avoid filename collisions.

    Prints some diagnostic information and makes sure to omit SAS tokens from printouts.

    Args:
        url (str): the URL to download
        destination_filename (str, optional): the target filename; if None, will create
            a file in system temp space
        progress_updater (object or bool, optional): can be None, False, True, or a
            specific callable object. If None or False, no progress update will be
            displayed. If True, a default progress bar will be created.
        force_download (bool, optional): download this file even if [destination_filename]
            exists.
        verbose (bool, optional): enable additional debug console output
        escape_spaces (bool, optional): replace ' ' with '%20'

    Returns:
        str: the filename to which [url] was downloaded, the same as [destination_filename]
        if [destination_filename] was not None
    """

    if progress_updater is not None and isinstance(progress_updater,bool):
        if not progress_updater:
            progress_updater = None
        else:
            progress_updater = DownloadProgressBar()

    url_no_sas = url.split('?')[0]

    if destination_filename is None:

        target_folder = make_temp_folder(subfolder='url_utils',append_guid=False)
        url_without_sas = url.split('?', 1)[0]

        # This does not guarantee uniqueness, hence "semi-best-effort"
        url_as_filename = re.sub(r'\W+', '', url_without_sas)

        n_folder_chars = len(target_folder)

        if (len(url_as_filename) + n_folder_chars) >= max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
            max_fn_len = max_path_len - (n_folder_chars + 1)
            url_as_filename = url_as_filename[-1 * max_fn_len:]
        destination_filename = \
            os.path.join(target_folder,url_as_filename)

    # ...if the destination filename wasn't specified

    if escape_spaces:
        url = url.replace(' ','%20')

    if (not force_download) and (os.path.isfile(destination_filename)):
        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url_no_sas)))
    else:
        if verbose:
            print('Downloading file {} to {}'.format(os.path.basename(url_no_sas),destination_filename),end='')
        target_dir = os.path.dirname(destination_filename)
        if len(target_dir) > 0:
            os.makedirs(target_dir,exist_ok=True)
        urllib.request.urlretrieve(url, destination_filename, progress_updater)
        assert(os.path.isfile(destination_filename))
        n_bytes = os.path.getsize(destination_filename)
        if verbose:
            print('...done, {} bytes.'.format(n_bytes))

    return destination_filename

# ...def download_url(...)


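# Illustrative usage sketch (editor's addition, not part of the released module): calling
# download_url() on a single file. The URL and the _example_* function name below are
# placeholders; any reachable file URL works.

def _example_download_url():
    """
    Downloads one file to system temp space (destination_filename=None), with a
    console progress bar.
    """

    url = 'https://example.com/data/sample_image.jpg'
    local_path = download_url(url, progress_updater=True, verbose=True)
    print('Downloaded {} to {}'.format(url, local_path))
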
def download_relative_filename(url, output_base, verbose=False):
    """
    Download a URL to output_base, preserving the relative path. The path is relative to
    the site, so:

    https://abc.com/xyz/123.txt

    ...will get downloaded to:

    output_base/xyz/123.txt

    Args:
        url (str): the URL to download
        output_base (str): the base folder to which we should download this file
        verbose (bool, optional): enable additional debug console output

    Returns:
        str: the local destination filename
    """

    p = urlparse(url)
    # Remove the leading '/'
    assert p.path.startswith('/')
    relative_filename = p.path[1:]
    destination_filename = os.path.join(output_base,relative_filename)
    return download_url(url, destination_filename, verbose=verbose)

# ...def download_relative_filename(...)


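# Illustrative usage sketch (editor's addition, not part of the released module): mirroring
# a URL's path under a local base folder with download_relative_filename(). The URL and
# output folder are placeholders.

def _example_download_relative_filename():
    """
    Downloads https://example.com/xyz/123.txt to /tmp/mirror/xyz/123.txt.
    """

    local_path = download_relative_filename('https://example.com/xyz/123.txt', '/tmp/mirror')
    print('Downloaded to {}'.format(local_path))
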
def _do_parallelized_download(download_info,overwrite=False,verbose=False):
    """
    Internal function for download parallelization.
    """

    url = download_info['url']
    target_file = download_info['target_file']
    result = {'status':'unknown','url':url,'target_file':target_file}

    if ((os.path.isfile(target_file)) and (not overwrite)):
        if verbose:
            print('Skipping existing file {}'.format(target_file))
        result['status'] = 'skipped'
        return result
    try:
        download_url(url=url,
                     destination_filename=target_file,
                     verbose=verbose,
                     force_download=overwrite)
    except Exception as e:
        print('Warning: error downloading URL {}: {}'.format(
            url,str(e)))
        result['status'] = 'error: {}'.format(str(e))
        return result

    result['status'] = 'success'
    return result

# ...def _do_parallelized_download(...)


def parallel_download_urls(url_to_target_file,
                           verbose=False,
                           overwrite=False,
                           n_workers=20,
                           pool_type='thread'):
    """
    Downloads a list of URLs to local files.

    Catches exceptions and reports them in the returned "results" array.

    Args:
        url_to_target_file (dict): a dict mapping URLs to local filenames.
        verbose (bool, optional): enable additional debug console output
        overwrite (bool, optional): whether to overwrite existing local files
        n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'

    Returns:
        list: list of dicts with keys:
            - 'url': the url this item refers to
            - 'status': 'skipped', 'success', or a string starting with 'error'
            - 'target_file': the local filename to which we downloaded (or tried to
              download) this URL
    """

    all_download_info = []

    if verbose:
        print('Preparing download list')
    for url in tqdm(url_to_target_file, disable=(not verbose)):
        download_info = {}
        download_info['url'] = url
        download_info['target_file'] = url_to_target_file[url]
        all_download_info.append(download_info)

    if verbose:
        print('Downloading {} files on {} workers'.format(
            len(all_download_info),n_workers))

    if n_workers <= 1:

        results = []

        for download_info in tqdm(all_download_info, disable=(not verbose)):
            result = _do_parallelized_download(download_info,overwrite=overwrite,verbose=verbose)
            results.append(result)

    else:

        pool = None

        try:
            if pool_type == 'thread':
                pool = ThreadPool(n_workers)
            else:
                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
                pool = Pool(n_workers)

            if verbose:
                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))

            results = list(tqdm(pool.imap(
                partial(_do_parallelized_download,overwrite=overwrite,verbose=verbose),
                all_download_info), total=len(all_download_info), disable=(not verbose)))

        finally:
            if pool:
                pool.close()
                pool.join()
                print('Pool closed and joined for parallel URL downloads')

    return results

# ...def parallel_download_urls(...)


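# Illustrative usage sketch (editor's addition, not part of the released module): fetching
# several URLs at once with parallel_download_urls() and summarizing the per-URL status
# values. The URLs and local paths are placeholders.

def _example_parallel_download_urls():
    """
    Downloads two hypothetical files on a small thread pool and prints a status summary.
    """

    url_to_target_file = {
        'https://example.com/images/a.jpg': '/tmp/downloads/a.jpg',
        'https://example.com/images/b.jpg': '/tmp/downloads/b.jpg'
    }
    results = parallel_download_urls(url_to_target_file,
                                     n_workers=2,
                                     pool_type='thread',
                                     verbose=True)
    for result in results:
        print('{}: {}'.format(result['url'], result['status']))
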
@pytest.mark.skip(reason="This is not a test function")
def test_url(url,error_on_failure=True,timeout=None):
    """
    Tests the availability of [url], returning an http status code.

    Args:
        url (str): URL to test
        error_on_failure (bool, optional): whether to error (vs. just returning an
            error code) if accessing this URL fails
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation

    Returns:
        int: http status code (200 for success)
    """

    r = requests.head(url, stream=True, verify=True, timeout=timeout)

    if error_on_failure and r.status_code != 200:
        raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
    return r.status_code


@pytest.mark.skip(reason="This is not a test function")
def test_urls(urls,error_on_failure=True,n_workers=1,pool_type='thread',timeout=None,verbose=False):
    """
    Verify that URLs are available (i.e., return status 200). By default,
    errors if any URL is unavailable.

    Args:
        urls (list): list of URLs to test
        error_on_failure (bool, optional): whether to error (vs. just returning an
            error code) if accessing a URL fails
        n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation
        verbose (bool, optional): enable additional debug output

    Returns:
        list: a list of http status codes, the same length and order as [urls]
    """

    if n_workers <= 1:

        status_codes = []

        for url in tqdm(urls,disable=(not verbose)):

            r = requests.get(url, timeout=timeout)

            if error_on_failure and r.status_code != 200:
                raise ValueError('Could not access {}: error {}'.format(url,r.status_code))
            status_codes.append(r.status_code)

    else:

        pool = None
        try:
            if pool_type == 'thread':
                pool = ThreadPool(n_workers)
            else:
                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
                pool = Pool(n_workers)

            if verbose:
                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))

            status_codes = list(tqdm(pool.imap(
                partial(test_url,error_on_failure=error_on_failure,timeout=timeout),
                urls), total=len(urls), disable=(not verbose)))
        finally:
            if pool:
                pool.close()
                pool.join()
                print('Pool closed and joined for URL tests')

    return status_codes

# ...def test_urls(...)


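# Illustrative usage sketch (editor's addition, not part of the released module): checking
# URL availability with test_url() and test_urls(). The URLs are placeholders; with
# error_on_failure=False, failures come back as status codes rather than exceptions.

def _example_test_urls():
    """
    Checks two hypothetical URLs and prints their HTTP status codes.
    """

    urls = ['https://example.com/index.html', 'https://example.com/missing.html']
    status_codes = test_urls(urls, error_on_failure=False, n_workers=1, timeout=10)
    for url, status_code in zip(urls, status_codes):
        print('{}: {}'.format(url, status_code))
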
def get_url_size(url,verbose=False,timeout=None):
    """
    Get the size of the file pointed to by a URL, based on the Content-Length property. If the
    URL is not available, or the Content-Length property is not available, or the Content-Length
    property is not an integer, returns None.

    Args:
        url (str): the url to test
        verbose (bool, optional): enable additional debug output
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation

    Returns:
        int: the file size in bytes, or None if it can't be retrieved
    """

    try:
        r = urllib.request.Request(url,method='HEAD')
        f = urllib.request.urlopen(r, timeout=timeout)
        if f.status != 200:
            if verbose:
                print('Status {} retrieving file size for {}'.format(f.status,url))
            return None
        size_bytes_str = f.headers.get('Content-Length')
        if size_bytes_str is None:
            if verbose:
                print('No Content-Length header for {}'.format(url))
            return None
        size_bytes = int(size_bytes_str)
        return size_bytes
    except Exception as e:
        if verbose:
            print('Error retrieving file size for {}:\n{}'.format(url,str(e)))
        return None

# ...def get_url_size(...)


def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False):
    """
    Retrieve file sizes for the URLs specified by [urls]. Returns None for any URLs
    that we can't access, or URLs for which the Content-Length property is not set.

    Args:
        urls (list): list of URLs for which we should retrieve sizes
        n_workers (int, optional): number of concurrent workers, set to <=1 to disable
            parallelization
        pool_type (str, optional): worker type to use; should be 'thread' or 'process'
        timeout (int, optional): timeout in seconds to wait before considering this
            access attempt to be a failure; see requests.head() for precise documentation
        verbose (bool, optional): print additional debug information

    Returns:
        dict: maps urls to file sizes, which will be None for URLs for which we were unable
        to retrieve a valid size.
    """

    url_to_size = {}

    if n_workers <= 1:

        for url in tqdm(urls, disable=(not verbose)):
            url_to_size[url] = get_url_size(url,verbose=verbose,timeout=timeout)

    else:

        pool = None
        try:
            if pool_type == 'thread':
                pool = ThreadPool(n_workers)
            else:
                assert pool_type == 'process', 'Unsupported pool type {}'.format(pool_type)
                pool = Pool(n_workers)

            if verbose:
                print('Starting a {} pool with {} workers'.format(pool_type,n_workers))

            file_sizes = list(tqdm(pool.imap(
                partial(get_url_size,verbose=verbose,timeout=timeout),
                urls), total=len(urls), disable=(not verbose)))

            for i_url,url in enumerate(urls):
                url_to_size[url] = file_sizes[i_url]
        finally:
            if pool:
                pool.close()
                pool.join()
                print('Pool closed and joined for URL size checks')

    return url_to_size


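# Illustrative usage sketch (editor's addition, not part of the released module): querying
# remote file sizes with get_url_size() and get_url_sizes(). The URLs are placeholders;
# sizes come back as None when the Content-Length header is unavailable.

def _example_get_url_sizes():
    """
    Prints the size of one hypothetical file, then sizes for a small batch of URLs.
    """

    size_bytes = get_url_size('https://example.com/data/archive.zip', timeout=10)
    print('Single file size: {}'.format(size_bytes))

    urls = ['https://example.com/data/a.zip', 'https://example.com/data/b.zip']
    url_to_size = get_url_sizes(urls, n_workers=2, pool_type='thread', timeout=10)
    for url in url_to_size:
        print('{}: {}'.format(url, url_to_size[url]))
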
#%% Singleton HTTP server

class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """
    SimpleHTTPRequestHandler subclass that suppresses console printouts
    """
    def __init__(self, *args, directory=None, **kwargs):
        super().__init__(*args, directory=directory, **kwargs)

    def log_message(self, format, *args): # noqa
        pass


class SingletonHTTPServer:
    """
    HTTP server that runs on a local port, serving a particular local folder. Runs as a
    singleton, so starting a server in a new folder closes the previous server. I use this
    primarily to serve MD/SpeciesNet previews from manage_local_batch, which can exceed
    the 260-character filename length limitation imposed by browsers on Windows, so really the
    point here is just to remove characters from the URL.
    """

    _server = None
    _thread = None

    @classmethod
    def start_server(cls, directory, port=8000, host='localhost'):
        """
        Start or restart the HTTP server with a specific directory

        Args:
            directory (str): the root folder served by the server
            port (int, optional): the port on which to create the server
            host (str, optional): the host on which to listen, typically
                either "localhost" (default) or "0.0.0.0"

        Returns:
            str: URL to the running host
        """

        # Stop the existing server instance if necessary
        cls.stop_server()

        # Create new server
        handler = partial(QuietHTTPRequestHandler, directory=directory)
        cls._server = socketserver.TCPServer((host, port), handler)

        # Start server in daemon thread (dies when parent process dies)
        cls._thread = threading.Thread(target=cls._server.serve_forever)
        cls._thread.daemon = True
        cls._thread.start()

        print(f"Serving {directory} at http://{host}:{port}")
        return f"http://{host}:{port}"


    @classmethod
    def stop_server(cls):
        """
        Stop the current server (if one is running)
        """

        if cls._server:
            cls._server.shutdown()
            cls._server.server_close()
            cls._server = None
        if cls._thread:
            cls._thread.join(timeout=1)
            cls._thread = None


    @classmethod
    def is_running(cls):
        """
        Check whether the server is currently running.

        Returns:
            bool: True if the server is running
        """

        return (cls._server is not None) and \
               (cls._thread is not None) and \
               (cls._thread.is_alive())

# ...class SingletonHTTPServer


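# Illustrative usage sketch (editor's addition, not part of the released module): serving a
# local folder over HTTP with SingletonHTTPServer. The folder path is a placeholder;
# calling start_server() again with a different folder shuts down the previous instance.

def _example_singleton_http_server():
    """
    Starts a local server for a hypothetical preview folder, then stops it.
    """

    base_url = SingletonHTTPServer.start_server('/tmp/md_previews', port=8000, host='localhost')
    print('Previews available at {}'.format(base_url))
    assert SingletonHTTPServer.is_running()
    SingletonHTTPServer.stop_server()
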
#%% Tests

# Constants for tests

SMALL_FILE_URL = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
REDIRECT_SRC_URL = "http://google.com"
REDIRECT_DEST_URL = "https://www.google.com/"
NON_EXISTENT_URL = "https://example.com/non_existent_page_404.html"
DEFINITELY_NON_EXISTENT_DOMAIN_URL = "https://thisshouldnotexist1234567890.com/file.txt"
RELATIVE_DOWNLOAD_URL = "https://raw.githubusercontent.com/agentmorris/MegaDetector/main/README.md"
RELATIVE_DOWNLOAD_CONTAIN_TOKEN = 'agentmorris'
RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN = 'github'


class TestUrlUtils:
    """
    Tests for url_utils.py
    """

    def set_up(self):
        """
        Create a temporary directory for testing.
        """

        self.test_dir = make_test_folder(subfolder='url_utils_tests')
        self.download_target_dir = os.path.join(self.test_dir, 'downloads')
        os.makedirs(self.download_target_dir, exist_ok=True)


    def tear_down(self):
        """
        Remove the temporary directory after tests.
        """

        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir)


    def test_download_url_to_specified_file(self):
        """
        Test download_url with a specified destination filename.
        """

        dest_filename = os.path.join(self.download_target_dir, "downloaded_google_logo.png")
        returned_filename = download_url(SMALL_FILE_URL,
                                         destination_filename=dest_filename,
                                         verbose=False)
        assert returned_filename == dest_filename
        assert os.path.exists(dest_filename)
        assert os.path.getsize(dest_filename) > 1000


    def test_download_url_to_temp_file(self):
        """
        Test download_url when destination_filename is None.
        """

        returned_filename = download_url(SMALL_FILE_URL,
                                         destination_filename=None,
                                         verbose=False)
        assert os.path.exists(returned_filename)
        assert os.path.getsize(returned_filename) > 1000


    def test_download_url_non_existent(self):
        """
        Test download_url with a non-existent URL.
        """

        dest_filename = os.path.join(self.download_target_dir, "non_existent.html")
        try:
            download_url(NON_EXISTENT_URL, destination_filename=dest_filename, verbose=False)
            raise AssertionError("urllib.error.HTTPError not raised for 404")
        except urllib.error.HTTPError:
            pass

        try:
            download_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
                         destination_filename=dest_filename,
                         verbose=False)
            raise AssertionError(
                "urllib.error.URLError or requests.exceptions.ConnectionError not raised for DNS failure")
        except urllib.error.URLError:
            pass
        except requests.exceptions.ConnectionError:
            pass


    def test_download_url_force_download(self):
        """
        Test the force_download parameter of download_url.
        """

        dest_filename = os.path.join(self.download_target_dir, "force_test.png")

        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=False)
        assert os.path.exists(dest_filename)
        initial_mtime = os.path.getmtime(dest_filename)

        download_url(SMALL_FILE_URL, destination_filename=dest_filename, verbose=True)
        assert os.path.getmtime(dest_filename) == initial_mtime

        download_url(SMALL_FILE_URL,
                     destination_filename=dest_filename,
                     force_download=True,
                     verbose=False)
        assert os.path.exists(dest_filename)


    def test_download_url_escape_spaces(self):
        """
        Test download_url with spaces in the URL.
        """

        dest_filename = os.path.join(self.download_target_dir, "escape_test.png")
        download_url(SMALL_FILE_URL,
                     destination_filename=dest_filename,
                     escape_spaces=True,
                     verbose=False)
        assert os.path.exists(dest_filename)


    def test_download_relative_filename(self):
        """
        Test download_relative_filename.
        """

        output_base = os.path.join(self.download_target_dir, "relative_dl")
        returned_filename = download_relative_filename(RELATIVE_DOWNLOAD_URL, output_base, verbose=False)
        assert RELATIVE_DOWNLOAD_CONTAIN_TOKEN in returned_filename
        assert RELATIVE_DOWNLOAD_NOT_CONTAIN_TOKEN not in returned_filename
        assert os.path.exists(returned_filename)
        assert os.path.getsize(returned_filename) > 100


    def test_parallel_download_urls(self):
        """
        Test parallel_download_urls (with n_workers=1 for simplicity).
        """

        url1_target = os.path.join(self.download_target_dir, "parallel_dl_1.png")
        url2_target = os.path.join(self.download_target_dir, "parallel_dl_2_nonexistent.html")

        url_to_target_file = {
            SMALL_FILE_URL: url1_target,
            NON_EXISTENT_URL: url2_target
        }

        results = parallel_download_urls(url_to_target_file, n_workers=1, verbose=False)

        assert len(results) == 2

        status_map = {res['url']: res for res in results}

        assert status_map[SMALL_FILE_URL]['status'] == 'success'
        assert status_map[SMALL_FILE_URL]['target_file'] == url1_target
        assert os.path.exists(url1_target)

        assert status_map[NON_EXISTENT_URL]['status'].startswith('error: HTTP Error 404')
        assert status_map[NON_EXISTENT_URL]['target_file'] == url2_target
        assert not os.path.exists(url2_target)

        if not os.path.exists(url1_target):
            download_url(SMALL_FILE_URL, url1_target, verbose=False)
        results_skip = parallel_download_urls({SMALL_FILE_URL: url1_target},
                                              n_workers=1,
                                              overwrite=False,
                                              verbose=True)
        assert results_skip[0]['status'] == 'skipped'

        results_overwrite = parallel_download_urls({SMALL_FILE_URL: url1_target},
                                                   n_workers=1,
                                                   overwrite=True,
                                                   verbose=False)
        assert results_overwrite[0]['status'] == 'success'


    def test_test_url_and_test_urls(self):
        """
        Test test_url and test_urls functions.
        """

        assert test_url(SMALL_FILE_URL, error_on_failure=False, timeout=10) == 200
        assert test_url(REDIRECT_SRC_URL, error_on_failure=False, timeout=10) in (200,301)

        status_non_existent = test_url(NON_EXISTENT_URL, error_on_failure=False, timeout=5)
        assert status_non_existent == 404

        try:
            test_url(NON_EXISTENT_URL, error_on_failure=True, timeout=5)
            raise AssertionError("ValueError not raised for NON_EXISTENT_URL")
        except ValueError:
            pass

        try:
            test_url(DEFINITELY_NON_EXISTENT_DOMAIN_URL,
                     error_on_failure=True,
                     timeout=5)
            raise AssertionError("requests.exceptions.ConnectionError or urllib.error.URLError not raised")
        except requests.exceptions.ConnectionError:
            pass
        except urllib.error.URLError:
            pass

        urls_to_test = [SMALL_FILE_URL, NON_EXISTENT_URL]
        status_codes = test_urls(urls_to_test, error_on_failure=False, n_workers=1, timeout=10)
        assert len(status_codes) == 2
        assert status_codes[0] == 200
        assert status_codes[1] == 404

        try:
            test_urls(urls_to_test, error_on_failure=True, n_workers=1, timeout=5)
            raise AssertionError("ValueError not raised for urls_to_test")
        except ValueError:
            pass

        good_urls = [SMALL_FILE_URL, REDIRECT_SRC_URL]
        good_status_codes = test_urls(good_urls, error_on_failure=True, n_workers=1, timeout=10)
        assert good_status_codes == [200, 200]


    def test_get_url_size_and_sizes(self):
        """
        Test get_url_size and get_url_sizes functions.
        """

        size = get_url_size(SMALL_FILE_URL, timeout=10)
        assert size is not None
        assert size > 1000

        size_dynamic = get_url_size(REDIRECT_DEST_URL, timeout=10, verbose=True)
        if size_dynamic is not None:
            assert isinstance(size_dynamic, int)

        size_non_existent = get_url_size(NON_EXISTENT_URL, timeout=5)
        assert size_non_existent is None

        size_bad_domain = get_url_size(DEFINITELY_NON_EXISTENT_DOMAIN_URL, timeout=5)
        assert size_bad_domain is None

        urls_for_size = [SMALL_FILE_URL, NON_EXISTENT_URL, REDIRECT_DEST_URL]
        sizes_map = get_url_sizes(urls_for_size, n_workers=1, timeout=10)

        assert SMALL_FILE_URL in sizes_map
        assert sizes_map[SMALL_FILE_URL] == size

        assert NON_EXISTENT_URL in sizes_map
        assert sizes_map[NON_EXISTENT_URL] is None

        assert REDIRECT_DEST_URL in sizes_map
        assert sizes_map[REDIRECT_DEST_URL] == size_dynamic


def _test_url_utils():
    """
    Runs all tests in the TestUrlUtils class. I generally disable this during testing
    because it creates irritating nondeterminism (because it depends on downloading
    stuff from the Internet), and this is neither a core module nor a module that changes
    often.
    """

    test_instance = TestUrlUtils()
    test_instance.set_up()
    try:
        test_instance.test_download_url_to_specified_file()
        test_instance.test_download_url_to_temp_file()
        test_instance.test_download_url_non_existent()
        test_instance.test_download_url_force_download()
        test_instance.test_download_url_escape_spaces()
        test_instance.test_download_relative_filename()
        test_instance.test_parallel_download_urls()
        test_instance.test_test_url_and_test_urls()
        test_instance.test_get_url_size_and_sizes()
    finally:
        test_instance.tear_down()

# from IPython import embed; embed()
# _test_url_utils()