ciocore 5.1.1__py2.py3-none-any.whl → 10.0.0b3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. ciocore/VERSION +1 -1
  2. ciocore/__init__.py +23 -1
  3. ciocore/api_client.py +655 -160
  4. ciocore/auth/__init__.py +5 -3
  5. ciocore/cli.py +501 -0
  6. ciocore/common.py +15 -13
  7. ciocore/conductor_submit.py +77 -60
  8. ciocore/config.py +127 -13
  9. ciocore/data.py +162 -77
  10. ciocore/docsite/404.html +746 -0
  11. ciocore/docsite/apidoc/api_client/index.html +3605 -0
  12. ciocore/docsite/apidoc/apidoc/index.html +909 -0
  13. ciocore/docsite/apidoc/config/index.html +1652 -0
  14. ciocore/docsite/apidoc/data/index.html +1553 -0
  15. ciocore/docsite/apidoc/hardware_set/index.html +2460 -0
  16. ciocore/docsite/apidoc/package_environment/index.html +1507 -0
  17. ciocore/docsite/apidoc/package_tree/index.html +2386 -0
  18. ciocore/docsite/assets/_mkdocstrings.css +16 -0
  19. ciocore/docsite/assets/images/favicon.png +0 -0
  20. ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js +29 -0
  21. ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js.map +7 -0
  22. ciocore/docsite/assets/javascripts/lunr/min/lunr.ar.min.js +1 -0
  23. ciocore/docsite/assets/javascripts/lunr/min/lunr.da.min.js +18 -0
  24. ciocore/docsite/assets/javascripts/lunr/min/lunr.de.min.js +18 -0
  25. ciocore/docsite/assets/javascripts/lunr/min/lunr.du.min.js +18 -0
  26. ciocore/docsite/assets/javascripts/lunr/min/lunr.el.min.js +1 -0
  27. ciocore/docsite/assets/javascripts/lunr/min/lunr.es.min.js +18 -0
  28. ciocore/docsite/assets/javascripts/lunr/min/lunr.fi.min.js +18 -0
  29. ciocore/docsite/assets/javascripts/lunr/min/lunr.fr.min.js +18 -0
  30. ciocore/docsite/assets/javascripts/lunr/min/lunr.he.min.js +1 -0
  31. ciocore/docsite/assets/javascripts/lunr/min/lunr.hi.min.js +1 -0
  32. ciocore/docsite/assets/javascripts/lunr/min/lunr.hu.min.js +18 -0
  33. ciocore/docsite/assets/javascripts/lunr/min/lunr.hy.min.js +1 -0
  34. ciocore/docsite/assets/javascripts/lunr/min/lunr.it.min.js +18 -0
  35. ciocore/docsite/assets/javascripts/lunr/min/lunr.ja.min.js +1 -0
  36. ciocore/docsite/assets/javascripts/lunr/min/lunr.jp.min.js +1 -0
  37. ciocore/docsite/assets/javascripts/lunr/min/lunr.kn.min.js +1 -0
  38. ciocore/docsite/assets/javascripts/lunr/min/lunr.ko.min.js +1 -0
  39. ciocore/docsite/assets/javascripts/lunr/min/lunr.multi.min.js +1 -0
  40. ciocore/docsite/assets/javascripts/lunr/min/lunr.nl.min.js +18 -0
  41. ciocore/docsite/assets/javascripts/lunr/min/lunr.no.min.js +18 -0
  42. ciocore/docsite/assets/javascripts/lunr/min/lunr.pt.min.js +18 -0
  43. ciocore/docsite/assets/javascripts/lunr/min/lunr.ro.min.js +18 -0
  44. ciocore/docsite/assets/javascripts/lunr/min/lunr.ru.min.js +18 -0
  45. ciocore/docsite/assets/javascripts/lunr/min/lunr.sa.min.js +1 -0
  46. ciocore/docsite/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +1 -0
  47. ciocore/docsite/assets/javascripts/lunr/min/lunr.sv.min.js +18 -0
  48. ciocore/docsite/assets/javascripts/lunr/min/lunr.ta.min.js +1 -0
  49. ciocore/docsite/assets/javascripts/lunr/min/lunr.te.min.js +1 -0
  50. ciocore/docsite/assets/javascripts/lunr/min/lunr.th.min.js +1 -0
  51. ciocore/docsite/assets/javascripts/lunr/min/lunr.tr.min.js +18 -0
  52. ciocore/docsite/assets/javascripts/lunr/min/lunr.vi.min.js +1 -0
  53. ciocore/docsite/assets/javascripts/lunr/min/lunr.zh.min.js +1 -0
  54. ciocore/docsite/assets/javascripts/lunr/tinyseg.js +206 -0
  55. ciocore/docsite/assets/javascripts/lunr/wordcut.js +6708 -0
  56. ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js +42 -0
  57. ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js.map +7 -0
  58. ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css +1 -0
  59. ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css.map +1 -0
  60. ciocore/docsite/assets/stylesheets/palette.06af60db.min.css +1 -0
  61. ciocore/docsite/assets/stylesheets/palette.06af60db.min.css.map +1 -0
  62. ciocore/docsite/cmdline/docs/index.html +871 -0
  63. ciocore/docsite/cmdline/downloader/index.html +934 -0
  64. ciocore/docsite/cmdline/packages/index.html +878 -0
  65. ciocore/docsite/cmdline/uploader/index.html +995 -0
  66. ciocore/docsite/how-to-guides/index.html +869 -0
  67. ciocore/docsite/index.html +895 -0
  68. ciocore/docsite/logo.png +0 -0
  69. ciocore/docsite/objects.inv +0 -0
  70. ciocore/docsite/search/search_index.json +1 -0
  71. ciocore/docsite/sitemap.xml +3 -0
  72. ciocore/docsite/sitemap.xml.gz +0 -0
  73. ciocore/docsite/stylesheets/extra.css +26 -0
  74. ciocore/docsite/stylesheets/tables.css +167 -0
  75. ciocore/downloader/base_downloader.py +644 -0
  76. ciocore/downloader/download_runner_base.py +47 -0
  77. ciocore/downloader/job_downloader.py +119 -0
  78. ciocore/{downloader.py → downloader/legacy_downloader.py} +12 -9
  79. ciocore/downloader/log.py +73 -0
  80. ciocore/downloader/logging_download_runner.py +87 -0
  81. ciocore/downloader/perpetual_downloader.py +63 -0
  82. ciocore/downloader/registry.py +97 -0
  83. ciocore/downloader/reporter.py +135 -0
  84. ciocore/exceptions.py +8 -2
  85. ciocore/file_utils.py +51 -50
  86. ciocore/hardware_set.py +449 -0
  87. ciocore/loggeria.py +89 -20
  88. ciocore/package_environment.py +110 -48
  89. ciocore/package_query.py +182 -0
  90. ciocore/package_tree.py +319 -258
  91. ciocore/retry.py +0 -0
  92. ciocore/uploader/_uploader.py +547 -364
  93. ciocore/uploader/thread_queue_job.py +176 -0
  94. ciocore/uploader/upload_stats/__init__.py +3 -4
  95. ciocore/uploader/upload_stats/stats_formats.py +10 -4
  96. ciocore/validator.py +34 -2
  97. ciocore/worker.py +174 -151
  98. ciocore-10.0.0b3.dist-info/METADATA +928 -0
  99. ciocore-10.0.0b3.dist-info/RECORD +128 -0
  100. {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/WHEEL +1 -1
  101. ciocore-10.0.0b3.dist-info/entry_points.txt +2 -0
  102. tests/instance_type_fixtures.py +175 -0
  103. tests/package_fixtures.py +205 -0
  104. tests/test_api_client.py +297 -12
  105. tests/test_base_downloader.py +104 -0
  106. tests/test_cli.py +149 -0
  107. tests/test_common.py +1 -7
  108. tests/test_config.py +40 -18
  109. tests/test_data.py +162 -173
  110. tests/test_downloader.py +118 -0
  111. tests/test_hardware_set.py +139 -0
  112. tests/test_job_downloader.py +213 -0
  113. tests/test_package_query.py +38 -0
  114. tests/test_package_tree.py +91 -291
  115. tests/test_submit.py +44 -18
  116. tests/test_uploader.py +1 -4
  117. ciocore/__about__.py +0 -10
  118. ciocore/cli/conductor.py +0 -191
  119. ciocore/compat.py +0 -15
  120. ciocore-5.1.1.data/scripts/conductor +0 -19
  121. ciocore-5.1.1.data/scripts/conductor.bat +0 -13
  122. ciocore-5.1.1.dist-info/METADATA +0 -408
  123. ciocore-5.1.1.dist-info/RECORD +0 -47
  124. tests/mocks/api_client_mock.py +0 -51
  125. /ciocore/{cli → downloader}/__init__.py +0 -0
  126. {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,644 @@
1
+ """
2
+ Contains the base class for both the JobDownloader and the PerpetualDownloader.
3
+
4
+
5
+ PAGING
6
+ Both the job downloader and the perpetual downloader get their lists of tasks to download in batches. In both cases they implement the get_some_tasks() method. This method is called repeatedly until it is interrupted or until it returns a falsy locator. The locator, if not falsy, is whatever the derived class finds useful. See the documentation for the derived classes for detailed information.
7
+
8
+ CALLBACKS
9
+ The intention is to keep the downloader simple and flexible. As such, some functionality is intentionally left out. For example, we do not report back to the Conductor API when tasks are complete. We do not format output, other than that provided by standard logging. We do not provide a GUI. Instead, we emit lifecycle events that can be used to do all of these things and more. The LoggingDownloadRunner class demonstrates this.
10
+
11
+ Callbacks are called with an 'evt' argument, which is a dictionary containing information about the event. The events are listed in the VALID_EVENTS list. To register a callback you use the `on` method - for example: `downloader.on("start", my_callback)`. The callback must be a function that accepts one argument named 'evt'. The callback can be a method of another class. Several callbacks may be registered for the same event type.
12
+
13
+ Since the downloader is multithreaded, the events are generated in different threads. We use a Queue to pass the events from the downloader threads to a dedicated dispatcher thread. The dispatch_events method, which runs in that thread, is responsible for reading events from the queue and calling the appropriate callbacks.
14
+
15
+ Most event types are emitted unchanged as they are received from the queue. However, if one or more callbacks are registered to handle the EVENT_TYPE_TASK_DONE event, then the dispatch_events method will also generate a TASK_DONE event when all files for a task have been downloaded. In order to do this, we make use of a registry of tasks and the number of files downloaded for each task. See the documentation for the Registry class for more information.
16
+
17
+ RETRIES
18
+ If an error occurs during download, the file will be retried with exponential backoff and jitter. We do not retry when the download is interrupted by the user.
19
+
20
+ MD5 HASHES
21
+ If force is False, and a file already exists on disk, the md5 hash of the file is compared to the md5 hash of the file on the server. If the hashes match, the file is skipped.
22
+ If force is True, then the file is downloaded regardless.
23
+
24
+ FILTERING
25
+ The regex parameter can be used to filter the files that are downloaded. If the regex parameter is provided, only files whose relative path matches the regex using `re.search` will be downloaded. This means users can give a literal string and the downloader will download all files whose relative path contains that string.
26
+ """
27
+
28
+ import base64
29
+ import contextlib
30
+ import hashlib
31
+ import logging
32
+ import os
33
+ import random
34
+ import re
35
+ import stat
36
+ import tempfile
37
+ import threading
38
+ import time
39
+ import traceback
40
+ import signal
41
+
42
+ from concurrent.futures import ThreadPoolExecutor
43
+ from queue import Queue
44
+
45
+ import requests
46
+ from ciocore import api_client
47
+ from ciocore.downloader.log import LOGGER_NAME
48
+ from ciocore.downloader.registry import Registry
49
+ from pathlib import Path
50
+
51
+ logger = logging.getLogger(LOGGER_NAME)
52
+
53
+ DEFAULT_PAGE_SIZE = 50  # default for the page_size argument of BaseDownloader.__init__
54
+ DEFAULT_NUM_THREADS = 4  # default number of worker threads in the download pool
55
+ DEFAULT_PROGRESS_INTERVAL = 0.5  # default progress_interval; NOTE(review): __init__ divides it by 1000.0 - confirm the intended unit
56
+ CHUNK_SIZE = 1024  # chunk_size passed to response.iter_content() while streaming a file
57
+ DEFAULT_MAX_ATTEMPTS = 3  # default number of download attempts per file
58
+ DEFAULT_DELAY = 1  # default initial retry delay; passed to time.sleep(), so seconds
59
+ DEFAULT_JITTER = 0.1  # default jitter: upper bound of the random extra delay, as a fraction of the retry delay
60
+
61
+ EVENT_TYPE_START = "start"
62
+ EVENT_TYPE_START_TASK = "start_task"
63
+ EVENT_TYPE_FILE_DONE = "file_done"
64
+ EVENT_TYPE_TASK_DONE = "task_done"
65
+ EVENT_TYPE_DONE = "done"
66
+ EVENT_TYPE_PROGRESS = "progress"
67
+
68
+ FALLBACK_DOWNLOADS_FOLDER = "CONDUCTOR_DOWNLOADS"  # folder name used under the home directory, or the temp directory, when the requested output directory cannot be used
69
+
70
class UserInterrupted(Exception):
    """Signal that a download was abandoned because the user interrupted it."""
72
+
73
@contextlib.contextmanager
def temp_file(filepath):
    """
    Yield a scratch path that replaces `filepath` when the block succeeds.

    The parent directory of `filepath` is created if needed, and the scratch
    file lives in that same directory. If the `with` body finishes without
    raising, the scratch file is moved over `filepath` (which need not exist
    beforehand) and the result is given mode 664. If the body raises, the
    scratch file is deleted and `filepath` is left as it was.
    """
    destination = Path(filepath)
    folder = destination.parent
    folder.mkdir(parents=True, exist_ok=True)

    # Only the generated name is needed. The handle is closed as soon as the
    # `with` exits; delete=False keeps the file on disk for the caller.
    handle = tempfile.NamedTemporaryFile(
        prefix=destination.name, dir=str(folder), delete=False
    )
    with handle:
        scratch = Path(handle.name)

    mode_664 = (
        stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH
    )

    try:
        yield scratch
        # Reached only when the caller's block did not raise.
        scratch.replace(destination)
        destination.chmod(mode_664)
    finally:
        # After a successful replace the scratch path no longer exists.
        if scratch.exists():
            scratch.unlink()
106
+
107
+ class BaseDownloader(object):  # shared base class; subclasses must implement get_some_tasks()
108
+
109
+ WORK_QUEUE_THROTTLE = 0.1  # seconds slept per poll while the executor's work queue exceeds max_queue_size
110
+ EVENT_DISPATCHER_PAUSE = 0.1  # NOTE(review): not referenced anywhere in the visible part of this module
111
+
112
+ VALID_EVENTS = [  # the event types that on() accepts
113
+ EVENT_TYPE_START,
114
+ EVENT_TYPE_START_TASK,
115
+ EVENT_TYPE_PROGRESS,
116
+ EVENT_TYPE_TASK_DONE,
117
+ EVENT_TYPE_FILE_DONE,
118
+ EVENT_TYPE_DONE,
119
+ ]
120
+
121
@contextlib.contextmanager
def start_end_events(self):
    """Bracket the wrapped block with a start event and a done event.

    The done event is emitted even when the wrapped block raises.
    """
    self.emit_start_event()
    try:
        yield
    finally:
        # Always emitted, whether the block finished or raised.
        self.emit_end_event()
129
+
130
+
131
@contextlib.contextmanager
def event_queue_context(self):
    """Provide a fresh event queue and a running dispatcher for the wrapped block.

    A new Queue is stored on self.event_queue and dispatch_events is started
    in its own thread. When the block exits, normally or through an
    exception, this waits for that thread to finish.
    """
    self.event_queue = Queue()
    dispatcher = threading.Thread(target=self.dispatch_events)
    dispatcher.start()
    try:
        yield
    finally:
        logger.debug("Waiting for event dispatcher thread to finish")
        dispatcher.join()
142
+
143
+
144
def __init__(
    self,
    output_path=None,
    num_threads=DEFAULT_NUM_THREADS,
    progress_interval=DEFAULT_PROGRESS_INTERVAL,
    page_size=DEFAULT_PAGE_SIZE,
    force=False,
    regex=None,
    max_attempts=DEFAULT_MAX_ATTEMPTS,
    delay=DEFAULT_DELAY,
    jitter=DEFAULT_JITTER,
    client=None,
):
    """Initialize the downloader.

    Args:
        output_path: Directory under which files are written, in a
            sub-folder named after the job id. When falsy, each file's own
            output_dir is used.
        num_threads: Number of worker threads in the download pool.
        progress_interval: Minimum time between progress events for a file.
            NOTE(review): the value is divided by 1000.0 before it is
            compared with time.time() differences, so it is effectively
            treated as milliseconds - confirm against the callers.
        page_size: Stored as-is when greater than 1, otherwise stored as
            None.
        force: When True, files are downloaded even if a matching copy
            already exists on disk.
        regex: Optional pattern; when given, only files whose relative path
            matches it via `re.search` are kept by filter_task().
        max_attempts: Number of download attempts per file.
        delay: Initial delay, in seconds, between attempts.
        jitter: Upper bound of the random extra delay, as a fraction of the
            current retry delay.
        client: An api_client.ApiClient to use. When None, a new one is
            created for this instance.
    """
    self.output_path = output_path
    self.force = force
    self.num_threads = num_threads
    self.max_queue_size = num_threads * 2
    self.progress_interval = progress_interval / 1000.0
    self.page_size = page_size if page_size > 1 else None
    # Build the client here, not in the signature: a default of
    # `api_client.ApiClient()` would be evaluated once, at import time, and
    # shared by every downloader instance.
    self.client = client if client is not None else api_client.ApiClient()
    self.max_attempts = max_attempts
    self.delay = delay
    self.jitter = jitter
    self.regex = re.compile(regex) if regex else None
    self.interrupt_flag = threading.Event()
    self.registry_lock = threading.Lock()

    # Created by event_queue_context() for the duration of a run.
    self.event_queue = None

    self.callbacks = {
        EVENT_TYPE_START: [],
        EVENT_TYPE_START_TASK: [],
        EVENT_TYPE_PROGRESS: [],
        EVENT_TYPE_TASK_DONE: [],
        EVENT_TYPE_FILE_DONE: [],
        EVENT_TYPE_DONE: [],
    }

    # A registry of tasks that are in progress.
    self.registry = Registry()

    logger.debug("Output_path: %s", self.output_path)
    logger.debug("Force download: %s", self.force)
    logger.debug("Num threads: %s", self.num_threads)
    logger.debug("Max queue size: %s", self.max_queue_size)
    logger.debug("Progress interval: %s seconds", self.progress_interval)
    logger.debug("Page limit: %s", self.page_size)
    logger.debug("Instantiated client: %s", self.client)
    logger.debug("Max attempts: %s", self.max_attempts)
    logger.debug("Delay: %s", self.delay)
    logger.debug("Jitter: %s", self.jitter)
    logger.debug("Regex: %s", self.regex)
197
+
198
+
199
def filter_task(self, task):
    """Return `task` restricted to the files whose relative path matches the regex.

    When no regex is configured the task is returned unchanged. Otherwise a
    new dict is built whose `files` holds only the matching entries and
    whose `size` is the sum of their sizes.
    """
    if not self.regex:
        return task

    kept = []
    for entry in task['files']:
        if self.regex.search(entry['relative_path']):
            kept.append(entry)

    return {
        'download_id': task['download_id'],
        'files': kept,
        'job_id': task['job_id'],
        'output_dir': task['output_dir'],
        'size': sum(entry['size'] for entry in kept),
        'task_id': task['task_id'],
    }
215
+
216
def filter(self, tasks):
    """Apply filter_task() to every task and return the results as a list."""
    return [self.filter_task(task) for task in tasks]
219
+
220
+ def handle_interrupt(self, *args):  # registered as the SIGINT handler by run(); *args are ignored
221
+ """
222
+ Handle the first interrupt signal: log a warning and set the interrupt flag.
223
+ """
224
+ if not self.interrupt_flag.is_set():
225
+ logger.warning("INTERRUPTED! CLEANING UP. PLEASE BE PATIENT...")
226
+ self.interrupt_flag.set()  # download() checks this flag for every chunk and raises UserInterrupted
227
+ # Ignore further SIGINT signals by setting the handler to a new function that just logs a message
228
+ signal.signal(signal.SIGINT, self.handle_subsequent_interrupts)
229
+
230
def handle_subsequent_interrupts(self, *args):
    """
    Respond to any further interrupt signal with a warning and nothing else.

    The arguments are ignored.
    """
    message = " I SAID BE PATIENT. THE DOWNLOAD HAS BEEN CANCELLED BUT I AM STILL CLEANING UP!"
    logger.warning(message)
237
+
238
def run(self):
    """Run the downloader.

    Pages of tasks are requested through get_some_tasks() and every file of
    every task is downloaded in a worker thread. The loop ends when
    get_some_tasks() returns a falsy locator, when the interrupt flag is
    set, or when an unexpected error occurs.

    An unexpected error is logged and turned into an interrupt; it is not
    re-raised. Start and done events are emitted around the whole run.
    """
    logger.debug("Running the downloader")
    self.interrupt_flag.clear()

    # Set the initial signal handler for (Ctrl+C) so that we can clean up
    # if the user interrupts the download.
    signal.signal(signal.SIGINT, self.handle_interrupt)

    with self.event_queue_context():
        with self.start_end_events():
            # Leaving this `with` waits for all submitted downloads, so no
            # explicit shutdown call is needed afterwards.
            with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
                try:
                    # next_locator is whatever get_some_tasks() finds
                    # useful. It is fed in and returned on each loop; a
                    # falsy value ends the loop.
                    next_locator = None
                    while not self.interrupt_flag.is_set():
                        tasks, next_locator = self.get_some_tasks(next_locator)
                        if tasks:
                            self.download_tasks(tasks, executor)
                        if not next_locator or self.interrupt_flag.is_set():
                            break
                except Exception:
                    # Handle the error INSIDE the executor context: the
                    # flag must be set before the context exit blocks on
                    # the workers, otherwise they run to completion first.
                    logger.exception(
                        "Unexpected error while fetching or scheduling downloads. Stopping."
                    )
                    self.interrupt_flag.set()
                finally:
                    logger.debug("Shutting down...")
274
+
275
def get_some_tasks(self, locator):
    """Fetch the next page of tasks; derived classes must override this.

    run() passes in the locator returned by the previous call (None on the
    first call) and expects a `(tasks, next_locator)` pair back.

    Raises:
        NotImplementedError: Always, in this base class.
    """
    raise NotImplementedError
278
+
279
+ def download_tasks(self, tasks, executor):
280
+ """Register each task, emit its start_task event, and submit one attempt_download job per file.
281
+
282
+ Parameters:
283
+ - tasks (list): A list of task dictionaries to be processed.
284
+ - executor (ThreadPoolExecutor): The executor for running download tasks concurrently.
285
+ """
286
+ logger.debug("Downloading page:")
287
+
288
+ for task_info in tasks:
289
+ if not self.registry.register_task(task_info):
290
+ # A falsy return from register_task is taken to mean the task is already in the registry.
291
+ continue
292
+
293
+ self.emit_start_task_event(task_info)
294
+ for file_info in task_info["files"]:
295
+
296
+ file_info["output_dir"] = self.ensure_writable_output_path(file_info, task_info)  # overwrites the incoming output_dir with the resolved directory
297
+ file_info["filepath"] = os.path.join(file_info["output_dir"], file_info["relative_path"])
298
+
299
+ future = executor.submit(self.attempt_download, file_info)
300
+ # Upon completion, put the result in the event queue.
301
+ future.add_done_callback(lambda f: self.event_queue.put(f.result()))  # f.result() re-raises if attempt_download raised, in which case nothing is queued
302
+
303
+ # pylint: disable=protected-access
304
+ while executor._work_queue.qsize() > self.max_queue_size:  # _work_queue is a private attribute of the executor
305
+ # Throttle to prevent the queue from growing too large.
306
+ time.sleep(self.WORK_QUEUE_THROTTLE)
307
+
308
+
309
def attempt_download(self, file_info):
    """
    Attempt to download a file, retrying with exponential backoff and jitter.

    Parameters:
    - file_info (dict): Information about the file to download. Must contain
      "filepath" plus whatever download() and generate_file_done_event() read.

    Returns:
    - file_done_event: The event returned by download() on success, or a
      file_done event whose "error" field is set when the user interrupted
      the download or when all attempts failed.

    No exception derived from Exception escapes this method.
    """
    filepath = file_info["filepath"]
    attempts_remaining = self.max_attempts
    retry_delay = self.delay

    while True:
        try:
            return self.download(file_info)

        except UserInterrupted as ex:
            # Never retry after a user interrupt.
            return self.generate_file_done_event(file_info, error=str(ex))

        except Exception as ex:
            attempts_remaining -= 1

            if attempts_remaining <= 0:
                # Out of attempts: report the failure with its traceback.
                error_str = f"{ex}\nTraceback:\n{traceback.format_exc()}"
                logger.exception(
                    "Failed to download %s after %d attempts.",
                    filepath,
                    self.max_attempts,
                )
                return self.generate_file_done_event(file_info, error=error_str)

            # Log BEFORE waiting, with the delay that is actually about to
            # be applied.
            logger.exception(
                "Failed to download %s. Retrying in %f seconds. %d attempts left.",
                filepath,
                retry_delay,
                attempts_remaining,
            )

            # Wait on the interrupt flag instead of sleeping, so a user
            # interrupt ends the pause immediately. wait() returns True
            # only if the flag was set.
            if self.interrupt_flag.wait(retry_delay):
                return self.generate_file_done_event(
                    file_info, error="Download interrupted by user."
                )

            # Next delay: exponential backoff with jitter.
            retry_delay *= 2
            retry_delay += random.uniform(0, retry_delay * self.jitter)
370
+
371
+
372
def ensure_writable_output_path(self, file_info, task_info):
    """
    Resolve, and try to create, the directory a file will be written to.

    Candidates, in order:
    1. On posix, if the file's output_dir looks like a Windows drive path,
       the fallback folder in the home directory plus the job id.
       Otherwise self.output_path plus the job id when self.output_path is
       set, else the file's own output_dir.
    2. If that directory cannot be created: the fallback folder in the home
       directory plus the job id.
    3. If that fails as well: the fallback folder in the system temp
       directory plus the job id. This path is returned without being
       created here.

    self.output_path is never modified, so one file with a Windows-style
    directory does not redirect the files that follow it.

    NOTE(review): a Windows-style output_dir takes precedence over an
    explicitly configured self.output_path, as it did before - confirm that
    this precedence is intended.
    """
    job_id = task_info["job_id"]
    output_path = file_info["output_dir"]

    # Decide the base directory locally instead of overwriting
    # self.output_path.
    base_path = self.output_path
    if os.name == "posix" and re.match(r"^[a-zA-Z]:", output_path):
        base_path = os.path.expanduser(os.path.join("~", FALLBACK_DOWNLOADS_FOLDER))

    if base_path:
        output_path = os.path.join(base_path, job_id)

    try:
        os.makedirs(output_path, exist_ok=True)
        return output_path
    except Exception:
        logger.exception("Can't use specified output directory %s. Trying fallback", output_path)

    output_path = os.path.expanduser(os.path.join("~", FALLBACK_DOWNLOADS_FOLDER, job_id))
    try:
        os.makedirs(output_path, exist_ok=True)
        return output_path
    except Exception:
        logger.exception("Can't use fallback output directory %s. Trying temp folder", output_path)

    return os.path.join(tempfile.gettempdir(), FALLBACK_DOWNLOADS_FOLDER, job_id)
401
+
402
+
403
def can_skip(self, file_info):
    """Decide whether the download of a file can be skipped.

    Returns False when force is set, when the file is not on disk, when its
    md5 differs from the one in `file_info`, or when the md5 check itself
    fails. Otherwise returns a file_done event dict with preexisting=True,
    which the caller forwards instead of downloading.
    """
    if self.force:
        return False

    filepath = file_info["filepath"]
    if not os.path.exists(filepath):
        return False

    try:
        on_disk = self._generate_base64_md5(filepath)
        same_content = on_disk == file_info.get("md5", "none")
    except Exception:
        logger.exception("Error checking md5 for %s", filepath)
        return False

    if not same_content:
        return False

    return self.generate_file_done_event(file_info, preexisting=True)
426
+
427
def download(self, file_info):
    """
    Do the work of downloading a file.

    If can_skip() returns an event, that event is returned and nothing is
    downloaded. Otherwise the content is streamed into a temp file that
    replaces the target only if the whole download succeeds, so a failed
    download never corrupts an existing file.

    Returns:
        A file_done event dict.

    Raises:
        UserInterrupted: If the interrupt flag is set while streaming.
        requests.HTTPError: If the server answers with an error status.
            Other exceptions raised by requests or by file I/O propagate.
    """
    skip_result = self.can_skip(file_info)
    if skip_result:
        return skip_result

    filepath = os.path.join(file_info["output_dir"], file_info["relative_path"])

    logger.debug("Downloading file: %s", filepath)

    with temp_file(filepath) as safe_filepath:
        response = requests.get(file_info["url"], stream=True, timeout=60)
        try:
            # Check the status BEFORE streaming, so that an error page is
            # never written to disk.
            response.raise_for_status()

            # The reported size comes from the response header, not from
            # file_info.
            size = float(response.headers.get("content-length", 0))
            progress_bytes = 0
            last_poll = time.time()
            with open(safe_filepath, "wb") as file_handle:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    # check if the download has been interrupted
                    if self.interrupt_flag.is_set():
                        raise UserInterrupted("Download interrupted by user.")

                    if not chunk:
                        continue
                    file_handle.write(chunk)

                    progress_bytes += len(chunk)
                    last_poll = self.emit_progress_event(
                        filepath, progress_bytes, size, last_poll
                    )
        finally:
            # A streamed response holds its connection until it is closed.
            response.close()

    return self.generate_file_done_event(file_info)
466
+
467
def dispatch_events(self):
    """
    Consume the event queue and invoke the callbacks registered for each event.

    Blocks on the queue until an event arrives. For a file_done event, and
    only if task_done callbacks are registered, the registry is consulted
    through generate_task_done_event(); when that yields an event, the
    task_done callbacks are invoked with it. Returns after a done event has
    been dispatched.
    """
    finished = False
    while not finished:
        evt = self.event_queue.get()
        kind = evt["type"]

        for handler in self.callbacks[kind]:
            handler(evt)

        # A file_done event may complete its task.
        if kind == EVENT_TYPE_FILE_DONE and len(self.callbacks[EVENT_TYPE_TASK_DONE]) > 0:
            task_evt = self.generate_task_done_event(evt)
            if task_evt:
                for handler in self.callbacks[EVENT_TYPE_TASK_DONE]:
                    handler(task_evt)

        finished = kind == EVENT_TYPE_DONE
493
+
494
def generate_task_done_event(self, evt):
    """
    Build a task_done event from a file_done event and the registry.

    Returns None unless `evt` is an error-free file_done event and, after
    the registry has been updated with it, the task's completed file count
    has reached its file count.
    """
    # A failed file must not update the registry, so its task is never
    # reported as done.
    if evt["type"] != EVENT_TYPE_FILE_DONE or evt.get("error"):
        return None

    # Increment the number of downloaded files for the task.
    task = self.registry.update_task(evt)
    if not task:  # should never happen
        return None

    if task["completed_files"] < task["filecount"]:
        return None

    all_preexisting = task["preexisting_files"] == task["filecount"]
    return {
        "type": EVENT_TYPE_TASK_DONE,
        "job_id": evt["job_id"],
        "task_id": evt["task_id"],
        "download_id": task["download_id"],
        "filecount": task["filecount"],
        "preexisting": all_preexisting,
        "size": task["size"],
    }
525
+
526
+ ############## METHODS TO CONSTRUCT AND EMIT EVENTS #####################
527
def emit_start_task_event(self, task):
    """Queue a start_task event for `task`.

    The event carries the task's ids and size, and `filecount` set to the
    number of entries in task["files"].
    """
    event = {
        "type": EVENT_TYPE_START_TASK,
        "download_id": task["download_id"],
        "filecount": len(task["files"]),
        "task_id": task["task_id"],
        "job_id": task["job_id"],
        "size": task["size"],
    }
    self.event_queue.put(event)
539
+
540
def emit_progress_event(self, filepath, progress_bytes, size, last_poll):
    """Queue a progress event unless one was sent too recently.

    Returns the time of the most recent emission: the current time if an
    event was queued, otherwise `last_poll` unchanged.
    """
    now = time.time()
    if now < last_poll + self.progress_interval:
        # Too soon since the previous progress event.
        return last_poll

    event = {
        "type": EVENT_TYPE_PROGRESS,
        "filepath": filepath,
        "progress_bytes": progress_bytes,
        "size": size,
    }
    self.event_queue.put(event)
    return now
554
+
555
def emit_start_event(self):
    """Queue the start event, which reports num_threads and page_size."""
    event = {
        "type": EVENT_TYPE_START,
        "num_threads": self.num_threads,
        "page_size": self.page_size,
    }
    self.event_queue.put(event)
564
+
565
def emit_end_event(self):
    """Queue the done event.

    The registry is attached so that callbacks can check whether any tasks
    were left incomplete.
    """
    event = {"type": EVENT_TYPE_DONE, "registry": self.registry}
    self.event_queue.put(event)
571
+
572
@staticmethod
def generate_file_done_event(file, **kwargs):
    """Build a file_done event dict for `file`.

    `preexisting` defaults to False and `error` to None. Any keyword
    arguments are merged in last, so they override the defaults and may add
    further keys.
    """
    event = {
        "type": EVENT_TYPE_FILE_DONE,
        "job_id": file["job_id"],
        "task_id": file["task_id"],
        "filepath": file["filepath"],
        "md5": file["md5"],
        "size": file["size"],
        "preexisting": False,
        "error": None,
    }
    event.update(kwargs)
    return event
586
+
587
+ ################################################################
588
+
589
def on(self, event_type, callback):
    """Register a callback for an event type.

    Several callbacks may be registered for the same event type; they are
    kept in registration order.

    Args:
        event_type (str): One of the values in VALID_EVENTS.
        callback (function): Must accept one argument named 'evt'.
    Raises:
        ValueError: If event_type is not in VALID_EVENTS, or if the callback
            is rejected by _validate_callback.

    Examples:
        >>> def my_callback(evt):
        ...     print(evt)

        >>> downloader.on("start", my_callback)

    """
    if event_type in self.VALID_EVENTS:
        self._validate_callback(callback)
        self.callbacks[event_type].append(callback)
        return

    raise ValueError(
        f"Invalid event_type: {event_type}. Allowed values: {self.VALID_EVENTS}"
    )
612
+
613
@staticmethod
def _validate_callback(callback):
    """Check that the callback is callable and takes one argument named 'evt'.

    A leading 'self' argument is tolerated, so a method of another class
    can be used as a callback. Returns True when the callback is
    acceptable.

    Raises:
        ValueError: If the callback is not callable, takes too many
            positional arguments, or its last positional argument is not
            named 'evt'.
    """
    if not callable(callback):
        raise ValueError("Callback must be a callable function.")

    code = callback.__code__
    num_args = code.co_argcount
    arg_names = code.co_varnames[:num_args]

    # At most two positional arguments, and two only when the first is 'self'.
    has_self_first = num_args == 2 and arg_names[0] == "self"
    if num_args > 2 or (num_args == 2 and not has_self_first):
        raise ValueError(f"Too many args. Found {num_args} arguments: {arg_names}")

    if num_args < 1 or arg_names[-1] != "evt":
        raise ValueError("Callback is missing the named argument 'evt'.")
    return True
631
+
632
@staticmethod
def _generate_base64_md5(filename):
    """Return the base64-encoded md5 digest of a file's content, as a str.

    The file is read in 4096-byte pieces. The result is used to decide
    whether a file on disk is the same as a file on the server.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            piece = stream.read(4096)
            if not piece:
                break
            digest.update(piece)
    return base64.b64encode(digest.digest()).decode("utf-8")
@@ -0,0 +1,47 @@
1
+ """
2
+ Base Download runner
3
+
4
+ This module contains the DownloadRunnerBase class.
5
+
6
+ The DownloadRunnerBase is responsible for running one of the downloader classes: JobDownloader or PerpetualDownloader. If there are no jobids, it runs the PerpetualDownloader.
7
+
8
+ It also sets up a Reporter to report task status back to the server.
9
+
10
+ By design, derived classes need only be concerned with registering callbacks. See the LoggingDownloadRunner class for an example.
11
+
12
+ """
13
+
14
+ import logging
15
+ from ciocore.downloader.job_downloader import JobDownloader
16
+ from ciocore.downloader.perpetual_downloader import PerpetualDownloader
17
+ from ciocore.downloader.log import LOGGER_NAME
18
+ from ciocore.downloader.reporter import Reporter
19
+
20
+ logger = logging.getLogger(LOGGER_NAME)
21
+
22
class DownloadRunnerBase(object):
    """Run a JobDownloader or a PerpetualDownloader, optionally wrapped in a Reporter.

    Derived classes need only register callbacks on `self.downloader`.
    """

    CLIENT_NAME = "DownloadRunnerBase"

    def __init__(self, jobids=None, location=None, **kwargs):
        """
        Initialize the downloader.

        Args:
            jobids: When truthy, a JobDownloader is created for these jobs.
                Otherwise a PerpetualDownloader is created for `location`.
            location: Passed to the PerpetualDownloader.
            **kwargs: `disable_reporting` (default False) is consumed here.
                Everything else, including `num_threads`, is forwarded to
                the downloader. `num_threads` (default 1) is also used as
                the Reporter's thread count.
        """
        # Optional flag: default to reporting enabled instead of raising
        # KeyError when the caller does not pass it.
        self.disable_reporting = kwargs.pop("disable_reporting", False)
        self.num_reporter_threads = kwargs.get("num_threads", 1)
        if jobids:
            self.downloader = JobDownloader(jobids, **kwargs)
        else:
            self.downloader = PerpetualDownloader(location, **kwargs)

    def run(self):
        """
        Run the downloader.

        Unless reporting is disabled, the run happens inside a Reporter
        context so that task statuses are reported back to the server.
        """
        if self.disable_reporting:
            self.downloader.run()
            return

        with Reporter(self.downloader, num_threads=self.num_reporter_threads):
            self.downloader.run()
47
+