ciocore 5.1.1__py2.py3-none-any.whl → 10.0.0b3__py2.py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- ciocore/VERSION +1 -1
- ciocore/__init__.py +23 -1
- ciocore/api_client.py +655 -160
- ciocore/auth/__init__.py +5 -3
- ciocore/cli.py +501 -0
- ciocore/common.py +15 -13
- ciocore/conductor_submit.py +77 -60
- ciocore/config.py +127 -13
- ciocore/data.py +162 -77
- ciocore/docsite/404.html +746 -0
- ciocore/docsite/apidoc/api_client/index.html +3605 -0
- ciocore/docsite/apidoc/apidoc/index.html +909 -0
- ciocore/docsite/apidoc/config/index.html +1652 -0
- ciocore/docsite/apidoc/data/index.html +1553 -0
- ciocore/docsite/apidoc/hardware_set/index.html +2460 -0
- ciocore/docsite/apidoc/package_environment/index.html +1507 -0
- ciocore/docsite/apidoc/package_tree/index.html +2386 -0
- ciocore/docsite/assets/_mkdocstrings.css +16 -0
- ciocore/docsite/assets/images/favicon.png +0 -0
- ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js +29 -0
- ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js.map +7 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ar.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.da.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.de.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.du.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.el.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.es.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.fi.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.fr.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.he.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hu.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hy.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.it.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ja.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.jp.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.kn.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ko.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.multi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.nl.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.no.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.pt.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ro.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ru.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.sa.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.sv.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ta.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.te.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.th.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.tr.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.vi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.zh.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/tinyseg.js +206 -0
- ciocore/docsite/assets/javascripts/lunr/wordcut.js +6708 -0
- ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js +42 -0
- ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js.map +7 -0
- ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css +1 -0
- ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css.map +1 -0
- ciocore/docsite/assets/stylesheets/palette.06af60db.min.css +1 -0
- ciocore/docsite/assets/stylesheets/palette.06af60db.min.css.map +1 -0
- ciocore/docsite/cmdline/docs/index.html +871 -0
- ciocore/docsite/cmdline/downloader/index.html +934 -0
- ciocore/docsite/cmdline/packages/index.html +878 -0
- ciocore/docsite/cmdline/uploader/index.html +995 -0
- ciocore/docsite/how-to-guides/index.html +869 -0
- ciocore/docsite/index.html +895 -0
- ciocore/docsite/logo.png +0 -0
- ciocore/docsite/objects.inv +0 -0
- ciocore/docsite/search/search_index.json +1 -0
- ciocore/docsite/sitemap.xml +3 -0
- ciocore/docsite/sitemap.xml.gz +0 -0
- ciocore/docsite/stylesheets/extra.css +26 -0
- ciocore/docsite/stylesheets/tables.css +167 -0
- ciocore/downloader/base_downloader.py +644 -0
- ciocore/downloader/download_runner_base.py +47 -0
- ciocore/downloader/job_downloader.py +119 -0
- ciocore/{downloader.py → downloader/legacy_downloader.py} +12 -9
- ciocore/downloader/log.py +73 -0
- ciocore/downloader/logging_download_runner.py +87 -0
- ciocore/downloader/perpetual_downloader.py +63 -0
- ciocore/downloader/registry.py +97 -0
- ciocore/downloader/reporter.py +135 -0
- ciocore/exceptions.py +8 -2
- ciocore/file_utils.py +51 -50
- ciocore/hardware_set.py +449 -0
- ciocore/loggeria.py +89 -20
- ciocore/package_environment.py +110 -48
- ciocore/package_query.py +182 -0
- ciocore/package_tree.py +319 -258
- ciocore/retry.py +0 -0
- ciocore/uploader/_uploader.py +547 -364
- ciocore/uploader/thread_queue_job.py +176 -0
- ciocore/uploader/upload_stats/__init__.py +3 -4
- ciocore/uploader/upload_stats/stats_formats.py +10 -4
- ciocore/validator.py +34 -2
- ciocore/worker.py +174 -151
- ciocore-10.0.0b3.dist-info/METADATA +928 -0
- ciocore-10.0.0b3.dist-info/RECORD +128 -0
- {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/WHEEL +1 -1
- ciocore-10.0.0b3.dist-info/entry_points.txt +2 -0
- tests/instance_type_fixtures.py +175 -0
- tests/package_fixtures.py +205 -0
- tests/test_api_client.py +297 -12
- tests/test_base_downloader.py +104 -0
- tests/test_cli.py +149 -0
- tests/test_common.py +1 -7
- tests/test_config.py +40 -18
- tests/test_data.py +162 -173
- tests/test_downloader.py +118 -0
- tests/test_hardware_set.py +139 -0
- tests/test_job_downloader.py +213 -0
- tests/test_package_query.py +38 -0
- tests/test_package_tree.py +91 -291
- tests/test_submit.py +44 -18
- tests/test_uploader.py +1 -4
- ciocore/__about__.py +0 -10
- ciocore/cli/conductor.py +0 -191
- ciocore/compat.py +0 -15
- ciocore-5.1.1.data/scripts/conductor +0 -19
- ciocore-5.1.1.data/scripts/conductor.bat +0 -13
- ciocore-5.1.1.dist-info/METADATA +0 -408
- ciocore-5.1.1.dist-info/RECORD +0 -47
- tests/mocks/api_client_mock.py +0 -51
- /ciocore/{cli → downloader}/__init__.py +0 -0
- {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/top_level.txt +0 -0
ciocore/downloader/base_downloader.py

@@ -0,0 +1,644 @@
+"""
+Contains the base class for both the JobDownloader and the PerpetualDownloader.
+
+
+PAGING
+Both the job downloader and the perpetual downloader get their lists of tasks to download in batches. In both cases they implement the get_some_tasks() method. This method is called repeatedly until it is interrupted or until it returns a falsy locator. The locator, if not falsy, is whatever the derived class finds useful. See the documentation for the derived classes for detailed information.
+
+CALLBACKS
+The intention is to keep the downloader simple and flexible. As such, some functionality is intentionally left out. For example, we do not report back to the Conductor API when tasks are complete. We do not format output, other than that provided by standard logging. We do not provide a GUI. Instead, we emit lifecycle events that can be used to do all of these things and more. The LoggingDownloadRunner class demonstrates this.
+
+Callbacks are called with an 'evt' argument, which is a dictionary containing information about the event. The events are listed in the VALID_EVENTS list. To register a callback you use the `on` method - for example: `downloader.on("start", my_callback)`. The callback must be a function that accepts one argument named 'evt'. The callback can be a method of another class. Several callbacks may be registered for the same event type.
+
+Since the downloader is multithreaded, the events are generated in different threads. We use a Queue to pass the events from the downloader threads to the main thread. The dispatch_events method is responsible for reading events from the queue and calling the appropriate callbacks.
+
+Most event types are emitted unchanged as they are received from the queue. However, if one or more callbacks are registered to handle the EVENT_TYPE_TASK_DONE event, then the dispatch_events method will also generate a TASK_DONE event when all files for a task have been downloaded. In order to do this, we make use of a registry of tasks and the number of files downloaded for each task. See the documentation for the Registry class for more information.
+
+RETRIES
+If an error occurs during download, the file will be retried with exponential backoff and jitter. We do not retry when the download is interrupted by the user.
+
+MD5 HASHES
+If force is False and a file already exists on disk, the md5 hash of the file is compared to the md5 hash of the file on the server. If the hashes match, the file is skipped.
+If force is True, then the file is downloaded regardless.
+
+FILTERING
+The regex parameter can be used to filter the files that are downloaded. If the regex parameter is provided, only files whose relative path matches the regex using `re.search` will be downloaded. This means users can give a literal string and the downloader will download all files whose relative path contains that string.
+"""
+
+import base64
+import contextlib
+import hashlib
+import logging
+import os
+import random
+import re
+import stat
+import tempfile
+import threading
+import time
+import traceback
+import signal
+
+from concurrent.futures import ThreadPoolExecutor
+from queue import Queue
+
+import requests
+from ciocore import api_client
+from ciocore.downloader.log import LOGGER_NAME
+from ciocore.downloader.registry import Registry
+from pathlib import Path
+
+logger = logging.getLogger(LOGGER_NAME)
+
+DEFAULT_PAGE_SIZE = 50
+DEFAULT_NUM_THREADS = 4
+DEFAULT_PROGRESS_INTERVAL = 0.5
+CHUNK_SIZE = 1024
+DEFAULT_MAX_ATTEMPTS = 3
+DEFAULT_DELAY = 1
+DEFAULT_JITTER = 0.1
+
+EVENT_TYPE_START = "start"
+EVENT_TYPE_START_TASK = "start_task"
+EVENT_TYPE_FILE_DONE = "file_done"
+EVENT_TYPE_TASK_DONE = "task_done"
+EVENT_TYPE_DONE = "done"
+EVENT_TYPE_PROGRESS = "progress"
+
+FALLBACK_DOWNLOADS_FOLDER = "CONDUCTOR_DOWNLOADS"
+
+class UserInterrupted(Exception):
+    pass
+
+@contextlib.contextmanager
+def temp_file(filepath):
+    """
+    Create a temporary file to use instead of the input filepath.
+
+    The input doesn't have to exist. If it does exist, it will ultimately be overwritten.
+    This context manager yields a path to a temporary file which will replace the original
+    file when the context is exited.
+    """
+    target_path = Path(filepath)
+    target_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Create temporary file in the same directory as the target.
+    # Descriptor is automatically closed on exiting the `with`.
+    with tempfile.NamedTemporaryFile(
+        prefix=target_path.name, dir=str(target_path.parent), delete=False
+    ) as tmp:
+        temp_file_path = Path(tmp.name)  # Save the temporary file path to move it later
+
+    try:
+        yield temp_file_path  # Yield control back to the caller, with the temp file path
+
+        # Move the temporary file to the target location, effectively overwriting it
+        temp_file_path.replace(target_path)
+
+        # Set permissions to 664
+        target_path.chmod(
+            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH
+        )
+    finally:
+        # Clean up temporary file if it still exists
+        if temp_file_path and temp_file_path.exists():
+            temp_file_path.unlink()
+
+class BaseDownloader(object):
+
+    WORK_QUEUE_THROTTLE = 0.1
+    EVENT_DISPATCHER_PAUSE = 0.1
+
+    VALID_EVENTS = [
+        EVENT_TYPE_START,
+        EVENT_TYPE_START_TASK,
+        EVENT_TYPE_PROGRESS,
+        EVENT_TYPE_TASK_DONE,
+        EVENT_TYPE_FILE_DONE,
+        EVENT_TYPE_DONE,
+    ]
+
+    @contextlib.contextmanager
+    def start_end_events(self):
+        """Send start and end events to the event queue."""
+        self.emit_start_event()
+        try:
+            yield
+        finally:
+            self.emit_end_event()
+
+
+    @contextlib.contextmanager
+    def event_queue_context(self):
+        """Create the event queue and run the dispatcher thread for the life of the context."""
+        self.event_queue = Queue()
+        event_dispatcher_thread = threading.Thread(target=self.dispatch_events)
+        event_dispatcher_thread.start()
+        try:
+            yield
+        finally:
+            logger.debug("Waiting for event dispatcher thread to finish")
+            event_dispatcher_thread.join()
+
+
+    def __init__(
+        self,
+        output_path=None,
+        num_threads=DEFAULT_NUM_THREADS,
+        progress_interval=DEFAULT_PROGRESS_INTERVAL,
+        page_size=DEFAULT_PAGE_SIZE,
+        force=False,
+        regex=None,
+        max_attempts=DEFAULT_MAX_ATTEMPTS,
+        delay=DEFAULT_DELAY,
+        jitter=DEFAULT_JITTER,
+        client=api_client.ApiClient(),
+    ):
+        """Initialize the downloader."""
+        self.output_path = output_path
+        self.force = force
+        self.num_threads = num_threads
+        self.max_queue_size = num_threads * 2
+        self.progress_interval = progress_interval / 1000.0
+        self.page_size = page_size if page_size > 1 else None
+        self.client = client
+        self.max_attempts = max_attempts
+        self.delay = delay
+        self.jitter = jitter
+        self.regex = re.compile(regex) if regex else None
+        self.interrupt_flag = threading.Event()
+        self.registry_lock = threading.Lock()
+
+        self.event_queue = None
+
+        self.callbacks = {
+            EVENT_TYPE_START: [],
+            EVENT_TYPE_START_TASK: [],
+            EVENT_TYPE_PROGRESS: [],
+            EVENT_TYPE_TASK_DONE: [],
+            EVENT_TYPE_FILE_DONE: [],
+            EVENT_TYPE_DONE: [],
+        }
+
+        # A registry of tasks that are in progress.
+        self.registry = Registry()
+
+        logger.debug("Output_path: %s", self.output_path)
+        logger.debug("Force download: %s", self.force)
+        logger.debug("Num threads: %s", self.num_threads)
+        logger.debug("Max queue size: %s", self.max_queue_size)
+        logger.debug("Progress interval: %s seconds", self.progress_interval)
+        logger.debug("Page limit: %s", self.page_size)
+        logger.debug("Instantiated client: %s", self.client)
+        logger.debug("Max attempts: %s", self.max_attempts)
+        logger.debug("Delay: %s", self.delay)
+        logger.debug("Jitter: %s", self.jitter)
+        logger.debug("Regex: %s", self.regex)
+
+
+    def filter_task(self, task):
+        """Use a regex to filter out files from a task."""
+        if not self.regex:
+            return task
+
+        filtered_files = [file for file in task['files'] if self.regex.search(file['relative_path'])]
+        new_size = sum(file['size'] for file in filtered_files)
+        new_task = {
+            'download_id': task['download_id'],
+            'files': filtered_files,
+            'job_id': task['job_id'],
+            'output_dir': task['output_dir'],
+            'size': new_size,
+            'task_id': task['task_id']
+        }
+        return new_task
+
+    def filter(self, tasks):
+        """Filter out files from tasks."""
+        return list(map(self.filter_task, tasks))
+
+    def handle_interrupt(self, *args):
+        """
+        Handle the first interrupt signal by setting the interrupt flag.
+        """
+        if not self.interrupt_flag.is_set():
+            logger.warning("INTERRUPTED! CLEANING UP. PLEASE BE PATIENT...")
+            self.interrupt_flag.set()
+            # Ignore further SIGINT signals by setting the handler to a new function that just logs a message
+            signal.signal(signal.SIGINT, self.handle_subsequent_interrupts)
+
+    def handle_subsequent_interrupts(self, *args):
+        """
+        Handle subsequent interrupt signals by logging a less polite message.
+        """
+        logger.warning(
+            " I SAID BE PATIENT. THE DOWNLOAD HAS BEEN CANCELLED BUT I AM STILL CLEANING UP!"
+        )
+
+    def run(self):
+        """Run the downloader.
+
+        For each job, we request pages of tasks, and then download each file from each task in a
+        thread.
+        """
+        logger.debug("Running the downloader")
+        self.interrupt_flag.clear()
+
+        # Set the initial SIGINT (Ctrl+C) handler so that we can clean up if the user interrupts the download.
+        signal.signal(signal.SIGINT, self.handle_interrupt)
+
+        with self.event_queue_context():
+            with self.start_end_events():
+
+                # Run a loop that fetches pages of tasks from the server.
+                # next_locator can be determined by the implementation of get_some_tasks().
+                # It is fed in and returned each loop.
+                # If it is returned as None, the loop will end.
+                try:
+                    with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
+                        next_locator = None
+                        while not self.interrupt_flag.is_set():
+                            tasks, next_locator = self.get_some_tasks(next_locator)
+                            if tasks:
+                                self.download_tasks(tasks, executor)
+                            if not next_locator or self.interrupt_flag.is_set():
+                                break
+                        # To test, we could fake an exception here.
+
+                except Exception:  # Catch all exceptions
+                    # Let the workers know they should stop
+                    self.interrupt_flag.set()
+                finally:
+                    logger.debug("Shutting down...")
+                    executor.shutdown(wait=True)
+
+    def get_some_tasks(self, locator):
+        """Get a page of tasks from the server."""
+        raise NotImplementedError
+
+    def download_tasks(self, tasks, executor):
+        """Run a page of download tasks using a thread pool executor.
+
+        Parameters:
+        - tasks (list): A list of task dictionaries to be processed.
+        - executor (ThreadPoolExecutor): The executor for running download tasks concurrently.
+        """
+        logger.debug("Downloading page:")
+
+        for task_info in tasks:
+            if not self.registry.register_task(task_info):
+                # register_task returns None if the task is already in the registry.
+                continue
+
+            self.emit_start_task_event(task_info)
+            for file_info in task_info["files"]:
+
+                file_info["output_dir"] = self.ensure_writable_output_path(file_info, task_info)
+                file_info["filepath"] = os.path.join(file_info["output_dir"], file_info["relative_path"])
+
+                future = executor.submit(self.attempt_download, file_info)
+                # Upon completion, put the result in the event queue.
+                future.add_done_callback(lambda f: self.event_queue.put(f.result()))
+
+                # pylint: disable=protected-access
+                while executor._work_queue.qsize() > self.max_queue_size:
+                    # Throttle to prevent the queue from growing too large.
+                    time.sleep(self.WORK_QUEUE_THROTTLE)
+
+
+    def attempt_download(self, file_info):
+        """
+        Attempt to download a file with exponential backoff retries.
+
+        Parameters:
+        - file_info (dict): A dictionary containing information about the file to be downloaded.
+
+        Returns:
+        - file_done_event: A dictionary indicating the completion status of the download.
+        """
+        filepath = file_info["filepath"]
+        attempts_remaining = self.max_attempts
+        retry_delay = self.delay
+
+        while True:
+            try:
+                # Try to download the file.
+                file_done_event = self.download(file_info)
+                return file_done_event  # Return the event if download is successful.
+
+            except UserInterrupted as ex:
+                # Handle user interruption.
+                file_done_event = self.generate_file_done_event(
+                    file_info, error=str(ex)
+                )
+                break
+
+            except Exception as ex:
+                # Decrement the remaining attempts.
+                attempts_remaining -= 1
+
+                if attempts_remaining <= 0:
+                    # If no attempts left, log the error and stop trying.
+                    traceback_str = traceback.format_exc()
+                    error_str = f"{ex}\nTraceback:\n{traceback_str}"
+                    file_done_event = self.generate_file_done_event(
+                        file_info, error=error_str
+                    )
+                    logger.exception(
+                        "Failed to download %s after %d attempts.",
+                        filepath,
+                        self.max_attempts
+                    )
+                    break  # Exit the loop if all attempts are exhausted.
+
+                else:
+                    # If there are still attempts left, wait for the retry delay.
+                    time.sleep(retry_delay)
+                    # Calculate the next delay using exponential backoff with jitter.
+                    retry_delay *= 2
+                    retry_delay += random.uniform(0, retry_delay * self.jitter)
+
+                    logger.exception(
+                        "Failed to download %s. Retrying in %f seconds. %d attempts left.",
+                        filepath,
+                        retry_delay,
+                        attempts_remaining
+                    )
+
+        # Return the final file done event after all attempts.
+        return file_done_event
+
+
+    def ensure_writable_output_path(self, file_info, task_info):
+        """
+        Resolve the output directory for the file.
+        If the file's output directory is from a Windows machine, we provisionally use the fallback directory.
+
+        If the output_path is not writable, we try to use a fallback directory. If that fails, we use the temp folder.
+        """
+
+        output_path = file_info["output_dir"]
+        if os.name == "posix":
+            if re.match(r"^[a-zA-Z]:", output_path):
+                self.output_path = os.path.expanduser(os.path.join("~", FALLBACK_DOWNLOADS_FOLDER))
+
+        if self.output_path:
+            output_path = os.path.join(self.output_path, task_info["job_id"])
+
+        try:
+            os.makedirs(output_path, exist_ok=True)
+            return output_path
+        except Exception:
+            logger.exception("Can't use specified output directory %s. Trying fallback", output_path)
+
+        output_path = os.path.expanduser(os.path.join("~", FALLBACK_DOWNLOADS_FOLDER, task_info["job_id"]))
+        try:
+            os.makedirs(output_path, exist_ok=True)
+            return output_path
+        except Exception:
+            logger.exception("Can't use fallback output directory %s. Trying temp folder", output_path)
+            return os.path.join(tempfile.gettempdir(), FALLBACK_DOWNLOADS_FOLDER, task_info["job_id"])
+
+
+    def can_skip(self, file_info):
+        """Determine if a file download should be skipped.
+
+        It can be skipped if it exists already with the same content. In this case we return a file_done event dict with preexisting=True. The event is put in the event queue by the calling function.
+        """
+
+        if self.force:
+            return False
+
+        filepath = file_info["filepath"]
+        if not os.path.exists(filepath):
+            return False
+
+        try:
+            existing_md5 = self._generate_base64_md5(filepath)
+            download_md5 = file_info.get("md5", "none")
+            if existing_md5 != download_md5:
+                return False
+        except Exception:
+            logger.exception("Error checking md5 for %s", filepath)
+            return False
+
+        return self.generate_file_done_event(file_info, preexisting=True)
+
+    def download(self, file_info):
+        """
+        Do the work of downloading a file.
+
+        Use a temp file to avoid corrupting the original file if the download fails.
+        """
+        skip_result = self.can_skip(file_info)
+        if skip_result:
+            return skip_result
+
+        size = file_info["size"]
+        filepath = os.path.join(file_info["output_dir"], file_info["relative_path"])
+
+        logger.debug("Downloading file: %s", filepath)
+
+        with temp_file(filepath) as safe_filepath:
+            response = requests.get(file_info["url"], stream=True, timeout=60)
+            size = float(response.headers.get("content-length", 0))
+            progress_bytes = 0
+            last_poll = time.time()
+            with open(safe_filepath, "wb") as file_handle:
+                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+                    # Check if the download has been interrupted.
+                    if self.interrupt_flag.is_set():
+
+                        raise UserInterrupted("Download interrupted by user.")
+
+                    if not chunk:
+                        continue
+                    file_handle.write(chunk)
+
+                    progress_bytes += len(chunk)
+                    last_poll = self.emit_progress_event(
+                        filepath, progress_bytes, size, last_poll
+                    )
+
+            response.raise_for_status()
+
+        return self.generate_file_done_event(file_info)
+
+    def dispatch_events(self):
+        """
+        Pull events from the event queue as they are ready and call the appropriate callbacks.
+        """
+        while True:
+            # Get the next event from the queue
+            evt = self.event_queue.get()
+            event_type = evt["type"]
+
+            # Call all registered callbacks for the event
+            for callback in self.callbacks[event_type]:
+                callback(evt)
+
+            # If there are any callbacks registered for the task_done event,
+            # then we check if the event is a file_done event and if so, determine
+            # whether the whole task is done.
+            # If the task is done, call its callbacks.
+            if event_type == EVENT_TYPE_FILE_DONE:
+                if len(self.callbacks[EVENT_TYPE_TASK_DONE]) > 0:
+                    task_done_event = self.generate_task_done_event(evt)
+                    if task_done_event:
+                        for callback in self.callbacks[EVENT_TYPE_TASK_DONE]:
+                            callback(task_done_event)
+
+            if event_type == EVENT_TYPE_DONE:
+                break
+
+    def generate_task_done_event(self, evt):
+        """
+        Build a task_done event from a file_done event and the registry.
+
+        Only do this if the file count for the task is complete.
+        """
+        event_type = evt["type"]
+        # We don't want to update the registry if the file_done event is an error.
+        # Ignoring it ensures that it will eventually be reported back to the server as pending.
+        if event_type != EVENT_TYPE_FILE_DONE or evt.get("error"):
+            return None
+
+        # Increment the number of downloaded files for the task
+        updated_task = self.registry.update_task(evt)
+
+        if not updated_task:  # should never happen
+            return None
+
+        if updated_task["completed_files"] >= updated_task["filecount"]:
+            return {
+                "type": EVENT_TYPE_TASK_DONE,
+                "job_id": evt["job_id"],
+                "task_id": evt["task_id"],
+                "download_id": updated_task["download_id"],
+                "filecount": updated_task["filecount"],
+                "preexisting": updated_task["preexisting_files"]
+                == updated_task["filecount"],
+                "size": updated_task["size"],
+            }
+
+        return None
+
+    ############## METHODS TO CONSTRUCT AND EMIT EVENTS #####################
+    def emit_start_task_event(self, task):
+        """Send a start_task event to the event queue."""
+        self.event_queue.put(
+            {
+                "type": EVENT_TYPE_START_TASK,
+                "download_id": task["download_id"],
+                "filecount": len(task["files"]),
+                "task_id": task["task_id"],
+                "job_id": task["job_id"],
+                "size": task["size"],
+            }
+        )
+
+    def emit_progress_event(self, filepath, progress_bytes, size, last_poll):
+        """Send a progress event to the event queue if it's time to do so."""
+        now = time.time()
+        if now >= last_poll + self.progress_interval:
+            last_poll = now
+            self.event_queue.put(
+                {
+                    "type": EVENT_TYPE_PROGRESS,
+                    "filepath": filepath,
+                    "progress_bytes": progress_bytes,
+                    "size": size,
+                }
+            )
+        return last_poll
+
+    def emit_start_event(self):
+        """Send a start event to the event queue."""
+        self.event_queue.put(
+            {
+                "type": EVENT_TYPE_START,
+                "num_threads": self.num_threads,
+                "page_size": self.page_size,
+            }
+        )
+
+    def emit_end_event(self):
+        """Send a done event to the event queue.
+
+        Send along the registry so that any callbacks can check if any tasks were not completed.
+        """
+        self.event_queue.put({"type": EVENT_TYPE_DONE, "registry": self.registry})
+
+    @staticmethod
+    def generate_file_done_event(file, **kwargs):
+        result = {
+            "type": EVENT_TYPE_FILE_DONE,
+            "job_id": file["job_id"],
+            "task_id": file["task_id"],
+            "filepath": file["filepath"],
+            "md5": file["md5"],
+            "size": file["size"],
+            "preexisting": False,
+            "error": None,
+        }
+        # If the preexisting key is in kwargs, merge it into the result dict.
+        return {**result, **kwargs}
+
+    ################################################################
+
+    def on(self, event_type, callback):
+        """Register a callback function.
+
+        Args:
+            event_type (str): The event to listen for. Must be one of the values in VALID_EVENTS.
+            callback (function): The callback function. Must accept one argument named 'evt'.
+        Raises:
+            ValueError: If the event_type is not in VALID_EVENTS.
+
+        Examples:
+            >>> def my_callback(evt):
+            ...     print(evt)
+
+            >>> downloader = BaseDownloader()
+            >>> downloader.on("start", my_callback)
+
+        """
+        if event_type not in self.VALID_EVENTS:
+            raise ValueError(
+                f"Invalid event_type: {event_type}. Allowed values: {self.VALID_EVENTS}"
+            )
+        self._validate_callback(callback)
+        self.callbacks[event_type].append(callback)
+
+    @staticmethod
+    def _validate_callback(callback):
+        """Make sure the callback is a callable function with one argument named 'evt'.
+
+        The callback could be a method of another class, in which case the first argument will be 'self'. We account for this too.
+        """
+        if not callable(callback):
+            raise ValueError("Callback must be a callable function.")
+        num_args = callback.__code__.co_argcount
+
+        arg_names = callback.__code__.co_varnames[:num_args]
+
+        if num_args > 2 or (num_args == 2 and arg_names[0] != "self"):
+            raise ValueError(f"Too many args. Found {num_args} arguments: {arg_names}")
+
+        if num_args < 1 or arg_names[-1] != "evt":
+            raise ValueError("Callback is missing the named argument 'evt'.")
+        return True
+
+    @staticmethod
+    def _generate_base64_md5(filename):
+        """Generate the base64 md5 hash of a file.
+
+        This is used to determine if a file on disk is the same as a file on the server.
+        """
+        with open(filename, "rb") as file:
+            md5_hash = hashlib.md5()
+            for chunk in iter(lambda: file.read(4096), b""):
+                md5_hash.update(chunk)
+        md5_digest = md5_hash.digest()
+        md5_base64 = base64.b64encode(md5_digest)
+        return md5_base64.decode("utf-8")
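Usage note: the module docstring above describes the event API that the rest of the new downloader package builds on. The following is a minimal sketch of registering callbacks, assuming the JobDownloader subclass accepts a list of job IDs as its first argument and forwards keyword arguments to BaseDownloader (consistent with how DownloadRunnerBase constructs it in the next hunk); the job ID and regex are illustrative:

    from ciocore.downloader.job_downloader import JobDownloader

    def on_file_done(evt):
        # Callbacks take a single dict argument that must be named 'evt'.
        # A file_done event carries type, job_id, task_id, filepath, md5,
        # size, preexisting, and error keys.
        if evt["error"]:
            print("failed:", evt["filepath"])
        elif evt["preexisting"]:
            print("skipped, md5 matched:", evt["filepath"])
        else:
            print("downloaded:", evt["filepath"])

    # Illustrative arguments: an assumed job ID list plus BaseDownloader kwargs.
    downloader = JobDownloader(["01234"], num_threads=8, regex=r"\.exr$")
    downloader.on("file_done", on_file_done)
    downloader.on("done", lambda evt: print("all done"))
    downloader.run()

With the default retry settings (max_attempts=3, delay=1, jitter=0.1), a persistently failing file is retried after roughly 1 second and again after roughly 2 to 2.2 seconds before a file_done event carrying the error is emitted.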
ciocore/downloader/download_runner_base.py

@@ -0,0 +1,47 @@
+"""
+Base Download runner
+
+This module contains the DownloadRunnerBase class.
+
+The DownloadRunnerBase is responsible for running one of the downloader classes: JobDownloader or PerpetualDownloader. If there are no jobids, it runs the PerpetualDownloader.
+
+It also sets up a Reporter to report task status back to the server.
+
+By design, derived classes need only be concerned with registering callbacks. See the LoggingDownloadRunner class for an example.
+
+"""
+
+import logging
+from ciocore.downloader.job_downloader import JobDownloader
+from ciocore.downloader.perpetual_downloader import PerpetualDownloader
+from ciocore.downloader.log import LOGGER_NAME
+from ciocore.downloader.reporter import Reporter
+
+logger = logging.getLogger(LOGGER_NAME)
+
+class DownloadRunnerBase(object):
+    CLIENT_NAME = "DownloadRunnerBase"
+
+    def __init__(self, jobids=None, location=None, **kwargs):
+        """
+        Initialize the downloader.
+        """
+        self.disable_reporting = kwargs.pop("disable_reporting")
+        self.num_reporter_threads = kwargs.get("num_threads", 1)
+        if jobids:
+            self.downloader = JobDownloader(jobids, **kwargs)
+        else:
+            self.downloader = PerpetualDownloader(location, **kwargs)
+
+    def run(self):
+        """
+        Run the downloader.
+
+        Optionally wrap the downloader in a reporter to report task statuses back to the server.
+        """
+        if self.disable_reporting:
+            self.downloader.run()
+        else:
+            with Reporter(self.downloader, num_threads=self.num_reporter_threads):
+                self.downloader.run()
+
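Usage note: a sketch of driving the runner directly, assuming remaining keyword arguments are forwarded unchanged to the underlying downloader. Note that disable_reporting is popped from kwargs without a default, so callers must supply it; the job ID and location below are illustrative:

    from ciocore.downloader.download_runner_base import DownloadRunnerBase

    # Download the outputs of specific jobs, reporting task status back to the server.
    runner = DownloadRunnerBase(jobids=["01234"], disable_reporting=False)
    runner.run()

    # With no job IDs, the runner wraps a PerpetualDownloader that polls the
    # given location for new downloads indefinitely.
    daemon = DownloadRunnerBase(location="workstation-01", disable_reporting=True)
    daemon.run()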