ciocore 5.1.1__py2.py3-none-any.whl → 10.0.0b3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ciocore/VERSION +1 -1
- ciocore/__init__.py +23 -1
- ciocore/api_client.py +655 -160
- ciocore/auth/__init__.py +5 -3
- ciocore/cli.py +501 -0
- ciocore/common.py +15 -13
- ciocore/conductor_submit.py +77 -60
- ciocore/config.py +127 -13
- ciocore/data.py +162 -77
- ciocore/docsite/404.html +746 -0
- ciocore/docsite/apidoc/api_client/index.html +3605 -0
- ciocore/docsite/apidoc/apidoc/index.html +909 -0
- ciocore/docsite/apidoc/config/index.html +1652 -0
- ciocore/docsite/apidoc/data/index.html +1553 -0
- ciocore/docsite/apidoc/hardware_set/index.html +2460 -0
- ciocore/docsite/apidoc/package_environment/index.html +1507 -0
- ciocore/docsite/apidoc/package_tree/index.html +2386 -0
- ciocore/docsite/assets/_mkdocstrings.css +16 -0
- ciocore/docsite/assets/images/favicon.png +0 -0
- ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js +29 -0
- ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js.map +7 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ar.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.da.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.de.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.du.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.el.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.es.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.fi.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.fr.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.he.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hu.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.hy.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.it.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ja.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.jp.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.kn.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ko.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.multi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.nl.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.no.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.pt.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ro.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ru.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.sa.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.sv.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.ta.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.te.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.th.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.tr.min.js +18 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.vi.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/min/lunr.zh.min.js +1 -0
- ciocore/docsite/assets/javascripts/lunr/tinyseg.js +206 -0
- ciocore/docsite/assets/javascripts/lunr/wordcut.js +6708 -0
- ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js +42 -0
- ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js.map +7 -0
- ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css +1 -0
- ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css.map +1 -0
- ciocore/docsite/assets/stylesheets/palette.06af60db.min.css +1 -0
- ciocore/docsite/assets/stylesheets/palette.06af60db.min.css.map +1 -0
- ciocore/docsite/cmdline/docs/index.html +871 -0
- ciocore/docsite/cmdline/downloader/index.html +934 -0
- ciocore/docsite/cmdline/packages/index.html +878 -0
- ciocore/docsite/cmdline/uploader/index.html +995 -0
- ciocore/docsite/how-to-guides/index.html +869 -0
- ciocore/docsite/index.html +895 -0
- ciocore/docsite/logo.png +0 -0
- ciocore/docsite/objects.inv +0 -0
- ciocore/docsite/search/search_index.json +1 -0
- ciocore/docsite/sitemap.xml +3 -0
- ciocore/docsite/sitemap.xml.gz +0 -0
- ciocore/docsite/stylesheets/extra.css +26 -0
- ciocore/docsite/stylesheets/tables.css +167 -0
- ciocore/downloader/base_downloader.py +644 -0
- ciocore/downloader/download_runner_base.py +47 -0
- ciocore/downloader/job_downloader.py +119 -0
- ciocore/{downloader.py → downloader/legacy_downloader.py} +12 -9
- ciocore/downloader/log.py +73 -0
- ciocore/downloader/logging_download_runner.py +87 -0
- ciocore/downloader/perpetual_downloader.py +63 -0
- ciocore/downloader/registry.py +97 -0
- ciocore/downloader/reporter.py +135 -0
- ciocore/exceptions.py +8 -2
- ciocore/file_utils.py +51 -50
- ciocore/hardware_set.py +449 -0
- ciocore/loggeria.py +89 -20
- ciocore/package_environment.py +110 -48
- ciocore/package_query.py +182 -0
- ciocore/package_tree.py +319 -258
- ciocore/retry.py +0 -0
- ciocore/uploader/_uploader.py +547 -364
- ciocore/uploader/thread_queue_job.py +176 -0
- ciocore/uploader/upload_stats/__init__.py +3 -4
- ciocore/uploader/upload_stats/stats_formats.py +10 -4
- ciocore/validator.py +34 -2
- ciocore/worker.py +174 -151
- ciocore-10.0.0b3.dist-info/METADATA +928 -0
- ciocore-10.0.0b3.dist-info/RECORD +128 -0
- {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/WHEEL +1 -1
- ciocore-10.0.0b3.dist-info/entry_points.txt +2 -0
- tests/instance_type_fixtures.py +175 -0
- tests/package_fixtures.py +205 -0
- tests/test_api_client.py +297 -12
- tests/test_base_downloader.py +104 -0
- tests/test_cli.py +149 -0
- tests/test_common.py +1 -7
- tests/test_config.py +40 -18
- tests/test_data.py +162 -173
- tests/test_downloader.py +118 -0
- tests/test_hardware_set.py +139 -0
- tests/test_job_downloader.py +213 -0
- tests/test_package_query.py +38 -0
- tests/test_package_tree.py +91 -291
- tests/test_submit.py +44 -18
- tests/test_uploader.py +1 -4
- ciocore/__about__.py +0 -10
- ciocore/cli/conductor.py +0 -191
- ciocore/compat.py +0 -15
- ciocore-5.1.1.data/scripts/conductor +0 -19
- ciocore-5.1.1.data/scripts/conductor.bat +0 -13
- ciocore-5.1.1.dist-info/METADATA +0 -408
- ciocore-5.1.1.dist-info/RECORD +0 -47
- tests/mocks/api_client_mock.py +0 -51
- /ciocore/{cli → downloader}/__init__.py +0 -0
- {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/top_level.txt +0 -0
ciocore/uploader/_uploader.py
CHANGED
@@ -1,32 +1,32 @@
 import datetime
-import importlib
 import json
 import logging
 import os
+import pathlib
+import requests.exceptions
 import sys
 import time
 import threading
 import traceback

-try:
-    import Queue as queue
-except ImportError:
-    import queue
-
 import ciocore
-from ciocore import
-
-
+from ciocore import (
+    api_client,
+    client_db,
+    common,
+    config,
+    file_utils,
+    loggeria,
+    worker,
+    exceptions,
+)

-from .
+from . import thread_queue_job

-
-    "%(asctime)s %(name)s%(levelname)9s %(threadName)s: %(message)s"
-)
-logger = logging.getLogger(CONDUCTOR_LOGGER_NAME)
+from .upload_stats import UploadStats

-
-
+logger = logging.getLogger(
+    "{}.uploader".format(loggeria.CONDUCTOR_LOGGER_NAME))


 class MD5Worker(worker.ThreadWorker):
@@ -43,26 +43,33 @@ class MD5Worker(worker.ThreadWorker):

     def do_work(self, job, thread_int):
         logger.debug("job is %s", job)
-
-
-        current_md5, cache_hit = self.get_md5(filename)
+
+        current_md5, cache_hit = self.get_md5(job.path)

         # if a submission time md5 was provided then check against it
-        if
-            logger.info(
-
-
-
+        if job.file_md5:
+            logger.info(
+                "Enforcing md5 match: %s for: %s", job.file_md5, job.path
+            )
+            if current_md5 != job.file_md5:
+                message = "MD5 of %s has changed since submission\n" % job.path
+                message += "submitted md5: %s\n" % job.file_md5
                 message += "current md5: %s\n" % current_md5
-                message +=
+                message += (
+                    "This is likely due to the file being written to after the user"
+                )
                 message += " submitted the job but before it got uploaded to conductor"
                 logger.error(message)
-                raise
-
-
-
-
-
+                raise exceptions.UploadError(message)
+
+        else:
+            job.file_md5 = current_md5
+
+        self.metric_store.set_dict("file_md5s", job.path, current_md5)
+        self.metric_store.set_dict("file_md5s_cache_hit", job.path, cache_hit)
+        job.file_size = os.path.getsize(job.path)
+
+        return job

     def get_md5(self, filepath):
         """
@@ -71,9 +78,9 @@ class MD5Worker(worker.ThreadWorker):
         Use the sqlite db cache to retrive this (if the cache is valid), otherwise generate the md5
         from scratch
         """
-
+
         cache_hit = True
-
+
         # If md5 caching is disable, then just generate the md5 from scratch
         if not self.md5_caching:
             cache_hit = False
@@ -99,7 +106,9 @@ class MD5Worker(worker.ThreadWorker):
         """
         Store the given file_info into the database
         """
-        client_db.FilesDB.add_file(
+        client_db.FilesDB.add_file(
+            file_info, db_filepath=self.database_filepath, thread_safe=True
+        )


 class MD5OutputWorker(worker.ThreadWorker):
@@ -110,135 +119,111 @@ class MD5OutputWorker(worker.ThreadWorker):

     def __init__(self, *args, **kwargs):
         super(MD5OutputWorker, self).__init__(*args, **kwargs)
-        self.batch_size = 20 # the
+        self.batch_size = 20 # the controls the batch size for http get_signed_urls
         self.wait_time = 2
-        self.batch =
+        self.batch = {}

     def check_for_poison_pill(self, job):
         """we need to make sure we ship the last batch before we terminate"""
         if job == self.PoisonPill():
             logger.debug("md5outputworker got poison pill")
             self.ship_batch()
-            self.
-            _thread.exit()
+            super(MD5OutputWorker, self).check_for_poison_pill(job)

     # helper function to ship batch
     def ship_batch(self):
         if self.batch:
             logger.debug("sending batch: %s", self.batch)
             self.put_job(self.batch)
-            self.batch =
+            self.batch = {}

     @common.dec_catch_exception(raise_=True)
     def target(self, thread_int):
-
         while not common.SIGINT_EXIT:
-
             job = None

             try:
                 logger.debug("Worker querying for job")
                 job = self.in_queue.get(block=True, timeout=self.wait_time)
                 logger.debug("Got job")
-                queue_size = self.in_queue.qsize()
-
+                queue_size = self.in_queue.qsize()
+
             except:
-
                 logger.debug("No jobs available")
-
-                if self._job_counter.value
-
+
+                if self._job_counter.value >= self.task_count:
                     if self.batch:
                         self.ship_batch()
-
-                    logger.debug(
+
+                    logger.debug(
+                        "Worker has completed all of its tasks (%s)", job)
                     self.thread_complete_counter.decrement()
                     break
-
-
+
+                elif self._job_counter.value == 0:
                     logger.debug("Worker waiting for first job")
-                    continue
-
-                logger.debug("Worker got job {}".format(job))
-                self._job_counter.increment()
-                logger.debug("Processing Job '{}' #{} on {}. {} tasks remaining in queue".format( job,
-                    self._job_counter.value,
-                    self,
-                    queue_size))

-
-
-
-
-
-
-
-
-
+                    time.sleep(1)
+                continue
+
+            logger.debug("Worker got job %s", job)
+            self._job_counter.increment()
+            logger.debug(
+                "Processing Job '%s' #%s on %s. %s tasks remaining in queue",
+                job,
+                self._job_counter.value,
+                self,
+                queue_size,
             )

-
-
-
+            try:
+                self.check_for_poison_pill(job)
+
+                # add file info to the batch list
+                self.batch[job.path] = job
+
+                # if the batch is self.batch_size, ship it
+                if len(self.batch) == self.batch_size:
+                    self.ship_batch()
+
+                # mark this task as done
+                self.mark_done()
+
+            except Exception as exception:
+                logger.exception(
+                    'CAUGHT EXCEPTION on job "%s" [%s]:\n', job, self)

-
-
+                # if there is no error queue to dump data into, then simply raise the exception
+                if self.error_queue is None:
+                    raise
+
+                self.error_queue.put(sys.exc_info())
+                # exit the while loop to stop the thread
+                break


 class HttpBatchWorker(worker.ThreadWorker):
     """
-    This worker
-
-
-
-
-
-
-
-
-        {
-            "path": "/linux64/bin/tiff2ps",
-            "hash": " fd27a8f925a72e788ea94997ca9a21ca",
-            "size": 123
-        },
-    ]
-    out_queue: {"multiPartURLs": [
-        {
-            "uploadID": "FqzC8mkGxTsLzAR5CuBv771an9D5WLthLbl_xFKCaqKEdqf",
-            "filePath": "/linux64/bin/animate",
-            "md5": "c986fb5f1c9ccf47eecc645081e4b108",
-            "partSize": 1073741824,
-            "parts": [
-                {
-                    "partNumber": 1,
-                    "url": "https://www.signedurlexample.com/signature1"
-                },
-                {
-                    "partNumber": 2,
-                    "url": "https://www.signedurlexample.com/signature1"
-                }
-            ]
-        }
-    ],
-    "singlePartURLs": [
-        {
-            "filePath": "/linux64/bin/tiff2ps",
-            "fileSize": 123,
-            "preSignedURL": "https://www.signedurlexample.com/signature2"
-        }
-    ]
-    }
+    This worker recieves a list of ThreadQueue Jobs with path, hash, size attributes. It generates
+    the request to the back-end to get signed upload urls for each file in the batch. The result
+    can be a mix of multi-part and single-part upload urls - each one has a unique set of data.
+
+    If a requested file isn't part of the result, it indicates that it already exists on the bucket
+    and hence has been previously uploaded.
+
+    This will will add values for upload_type, presigned_url, part_size, parts, kms_key_name and
+    upload_id (of the file - NOT the Upload entity) to each job.
     """

     def __init__(self, *args, **kwargs):
         super(HttpBatchWorker, self).__init__(*args, **kwargs)
         self.api_client = api_client.ApiClient()
-        self.project = kwargs.get("project")

-    def make_request(self,
+    def make_request(self, jobs):
         uri_path = "/api/v2/files/get_upload_urls"
         headers = {"Content-Type": "application/json"}
-        data = {"upload_files":
+        data = {"upload_files": thread_queue_job.ThreadQueueJob.format_for_upload_request(jobs.values()),
+                "project": list(jobs.values())[0].project}

         response_str, response_code = self.api_client.make_request(
             uri_path=uri_path,
@@ -252,86 +237,83 @@ class HttpBatchWorker(worker.ThreadWorker):
         if response_code == 200:
             url_list = json.loads(response_str)
             return url_list
+
         if response_code == 204:
             return None
-        raise Exception("%s Failed request to: %s\n%s" % (response_code, uri_path, response_str))
-
-    def do_work(self, job, thread_int):
-        logger.debug("getting upload urls for %s", job)
-        result = self.make_request(job)

-
-
-
-
+        raise exceptions.UploadError(
+            "%s Failed request to: %s\n%s" % (
+                response_code, uri_path, response_str)
+        )

+    def do_work(self, jobs, thread_int):
+        logger.debug("Getting upload urls for %s", jobs)
+        result = self.make_request(jobs)
+        logger.debug("Got result: %s", result)
+
+        # Determine which files have already been uploaded by looking at the difference between
+        # the file paths in job and the file paths returned by the request. Only files that need
+        # to be uploaded are returned by the request.
+        # Ideally, the MD5 would be used as the key but because the MD5 isn't returned for single-
+        # part files, we have to use the file path instead.
         if result:
-
-
-            for item in item_type:
-                incoming_file_paths.remove(item['filePath'])
-
-            for path in incoming_file_paths:
-                self.metric_store.increment("already_uploaded", True, path)
-
-            return result
+            for upload_type, items in result.items():
+                for item in items:

-    ""
-    This worker subscribes to a queue of list of file uploads (multipart and singlepart).
+                    job_key = item["filePath"]

-
-    uploaded, and aggregates the total size for all uploads.
+                    logger.debug("Matching %s in request", job_key)

-
-
+                    jobs[job_key].upload_type = upload_type
+                    jobs[job_key].kms_key_name = result.get('kmsKeyName')

-
-
-
-    """
+                    self.metric_store.increment(
+                        "bytes_to_upload", jobs[job_key].file_size, item["filePath"])
+                    self.metric_store.increment("num_files_to_upload")

+                    if upload_type == "multiPartURLs":
+                        jobs[job_key].part_size = item["partSize"]
+                        jobs[job_key].set_parts(item["parts"])
+                        jobs[job_key].file_upload_id = item.get("uploadID")

-
-
-        super(FileStatWorker, self).__init__(*args, **kwargs)
+                    elif upload_type == "singlePartURLs":
+                        jobs[job_key].presigned_url = item["preSignedURL"]

-
-
-
-        FileStatWorker iterates through the list. For each item, it aggregates the filesize in
-        bytes, and passes the upload into the UploadWorker queue.
-        """
+                    else:
+                        raise exceptions.UploadError("Unknown upload_type '{}' for {}".format(upload_type,
+                                                                                              item))

-
-
-
-
-
-
-
-
-        self.metric_store.increment("
-
-
-
-
-
-
-
-
-
-        self.
-
-
+            # If a job has no upload_type, it indicates it wasn't part of the result
+            # above and has already been uploaded.
+            # If it's a multipart job we need to split it into a job per part (to allow
+            # for parallelization of the uploads).
+            for job_count, job in enumerate(jobs.values()):
+
+                if job.upload_type is None:
+                    job.already_uploaded = True
+                    self.metric_store.increment("already_uploaded", True, job.path)
+
+                if job.is_multipart():
+                    logger.debug(
+                        "Job is multipart: %s, splitting parts into separate jobs", job)
+                    for part_job in job.create_multipart_jobs():
+                        self.put_job(part_job)
+
+                else:
+                    logger.debug("Job is singlepart: %s, adding to out_queue", job)
+                    self.put_job(job)
+
+                # The job counter is already incremented in target() once, so skip the first
+                # iteration
+                if job_count > 0:
+                    self._job_counter.increment()

-        # make sure we return None, so no message is automatically added to the out_queue
         return None


 class UploadWorker(worker.ThreadWorker):
     """
-    This worker receives a
-    and performs an upload of the specified file to the provided url.
+    This worker receives a thread_queue_job.ThreadQueueJob and performs the upload.
     """

     def __init__(self, *args, **kwargs):
@@ -339,7 +321,6 @@ class UploadWorker(worker.ThreadWorker):
         self.chunk_size = 1048576 # 1M
         self.report_size = 10485760 # 10M
         self.api_client = api_client.ApiClient()
-        self.project = kwargs.get("project")

     def chunked_reader(self, filename):
         with open(filename, "rb") as fp:
@@ -352,37 +333,45 @@ class UploadWorker(worker.ThreadWorker):
                 yield data

                 # report upload progress
-                self.metric_store.increment(
+                self.metric_store.increment(
+                    "bytes_uploaded", len(data), filename)

     def do_work(self, job, thread_int):
-
-        if job:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if not job:
+            return worker.EMPTY_JOB
+
+        if job.already_uploaded:
+            logger.debug("Job is already uploaded: %s", job.path)
+            return job
+
+        try:
+            if job.is_multipart():
+                return self.do_multipart_upload(job)
+
+            else:
+                return self.do_singlepart_upload(job)
+
+        except Exception as err_msg:
+            real_md5 = common.get_base64_md5(job.path)
+
+            # Gather helpful details from the exception
+            exc_tb = sys.exc_info()[2]
+            exception_line_num = exc_tb.tb_lineno
+            exception_file = pathlib.Path(
+                exc_tb.tb_frame.f_code.co_filename).name
+
+            if isinstance(err_msg, requests.exceptions.HTTPError):
+                error_message = f"Upload of {job.path} failed with a response code {err_msg.response.status_code} ({err_msg.response.reason}) (expected '{job.file_md5}', got '{real_md5}')"
+            else:
+                error_message = (
+                    f"Upload of {job.path} failed. (expected '{job.file_md5}', got '{real_md5}') {str(err_msg)} [{exception_file}-{exception_line_num}]"
+                )
+
+            raise exceptions.UploadError(error_message)

     @common.DecRetry(retry_exceptions=api_client.CONNECTION_EXCEPTIONS, tries=5)
-    def do_singlepart_upload(self,
+    def do_singlepart_upload(self, job):
         """
         Note that for GCS we don't rely on the make_request's own retry mechanism because we need to
         recreate the chunked_reader generator before retrying the request. Instead, we wrap this
@@ -392,19 +381,19 @@ class UploadWorker(worker.ThreadWorker):
         headers that S3 does not accept.
         """

-        if
+        if job.is_vendor_aws() or job.is_vendor_cw():
             # must declare content-length ourselves due to zero byte bug in requests library.
             # api_client.make_prepared_request docstring.
             headers = {
                 "Content-Type": "application/octet-stream",
-                "Content-Length": str(file_size),
+                "Content-Length": str(job.file_size),
             }

-            with open(
+            with open(job.path, "rb") as fh:
                 # TODO: support chunked
                 response = self.api_client.make_prepared_request(
                     verb="PUT",
-                    url=
+                    url=job.presigned_url,
                     headers=headers,
                     params=None,
                     data=fh,
@@ -418,79 +407,59 @@ class UploadWorker(worker.ThreadWorker):
                 response.close()

             # report upload progress
-            self.metric_store.increment(
+            self.metric_store.increment(
+                "bytes_uploaded", job.file_size, job.path)
+
         else:
-            headers = {"Content-MD5":
+            headers = {"Content-MD5": job.file_md5,
+                       "Content-Type": "application/octet-stream"}
+
+            if job.kms_key_name is not None:
+                headers["x-goog-encryption-kms-key-name"] = job.kms_key_name

-
-                conductor_url=
+            response = self.api_client.make_request(
+                conductor_url=job.presigned_url,
                 headers=headers,
-                data=self.chunked_reader(
+                data=self.chunked_reader(job.path),
                 verb="PUT",
                 tries=1,
                 use_api_key=True,
             )

-
+        logger.debug("Response from upload: %s", response)
+
+        return job
+
+    def do_multipart_upload(self, job):
         """
         Files will be split into partSize returned by the FileAPI and hydrated once all parts are
         uploaded. On successful part upload, response headers will contain an ETag. This value must
         be tracked along with the part number in order to complete and hydrate the file.
         """
-        uploads = []
-        complete_payload = {
-            "uploadID": upload["uploadID"],
-            "hash": md5,
-            "completedParts": [],
-            "project": self.project,
-        }

-
-        for part in upload["parts"]:
-            resp_headers = self._do_multipart_upload(
-                upload_url=part["url"],
-                filename=filename,
-                part_number=part["partNumber"],
-                part_size=upload["partSize"],
-            )
+        resp_headers = self._do_multipart_upload(job)

-
-
-            completed_part = {
-                "partNumber": part["partNumber"],
-                "etag": resp_headers["ETag"].strip('"'),
-            }
-            complete_payload["completedParts"].append(completed_part)
+        if resp_headers:
+            job.etag = resp_headers["ETag"].strip('"')

-
-        uri_path = "/api/v2/files/multipart/complete"
-        headers = {"Content-Type": "application/json"}
-        self.api_client.make_request(
-            uri_path=uri_path,
-            verb="POST",
-            headers=headers,
-            data=json.dumps(complete_payload),
-            raise_on_error=True,
-            use_api_key=True,
-        )
-
-        return uploads
+        return job

     @common.DecRetry(retry_exceptions=api_client.CONNECTION_EXCEPTIONS, tries=5)
-    def _do_multipart_upload(self,
-
+    def _do_multipart_upload(self, job):
+
+        with open(job.path, "rb") as fh:
             # seek to the correct part position
-            start = (
+            start = (job.part_index - 1) * job.part_size
             fh.seek(start)

             # read up to part size determined by file-api
-            data = fh.read(part_size)
+            data = fh.read(job.part_size)
             content_length = len(data)

             # upload part
             response = self.api_client.make_prepared_request(
                 verb="PUT",
-                url=
+                url=job.presigned_url,
                 headers={"Content-Type": "application/octet-stream"},
@@ -501,19 +470,127 @@ class UploadWorker(worker.ThreadWorker):
             )

             # report upload progress
-            self.metric_store.increment(
+            self.metric_store.increment(
+                "bytes_uploaded", content_length, job.path)

             # close response object to add back to pool
             # https://requests.readthedocs.io/en/master/user/advanced/#body-content-workflow
             response.close()
+
+            logger.debug("Response from multipart upload: %s", response)
+
             return response.headers


-class
+class MultiPartSiphonWorker(worker.ThreadWorker):
+    """
+    This class is responsible for gathering all the jobs (aka files) and ensuring
+    the necessary steps are taken to have them available to be used by a Conductor Job.
+
+    For single-part files, this simply means passing the job to the out_queue so
+    that the Uploader is aware that the file has been sucesfully uploaded.
+
+    For multi-part files, this means collecting all the parts together and then
+    sending a request to the backend indicating that the file is complete.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(MultiPartSiphonWorker, self).__init__(*args, **kwargs)
+
+        self.api_client = api_client.ApiClient()
+        self.multipart_siphon = {}
+
+    def do_work(self, job, thread_int):
+        """
+        Process files that have already been uploaded.
+
+        If it's a single-part file, add the job to the out queue, so that it can
+        be used to determine if the Upload entity is complete.
+
+        If it's a multi-part upload, collect all the parts together. Once all the
+        parts have been accumulated, mark it as complete and add the file to the
+        out queue.
+        """
+
+        if not job:
+            return None
+
+        if not job.is_multipart():
+            logger.debug("Job is not multipart (%s, %s)",
+                         job.total_parts, job.part_index)
+
+            return job
+
+        if job.file_md5 not in self.multipart_siphon:
+            self.multipart_siphon[job.file_md5] = []
+
+            # Add to the task count for this worker.
+            # -1 because a task has already been added for a single file
+            # but not all its parts.
+            old_task_count = self.task_count
+            self.task_count += job.total_parts - 1
+            logger.debug("Incrementing task count to %s from %s",
+                         self.task_count, old_task_count)
+
+        self.multipart_siphon[job.file_md5].append(job)
+
+        if len(self.multipart_siphon[job.file_md5]) == job.total_parts:

+            complete_payload = {
+                "uploadID": job.file_upload_id,
+                "hash": job.file_md5,
+                "completedParts": thread_queue_job.ThreadQueueJob.aggregate_parts(self.multipart_siphon[job.file_md5]),
+                "project": job.project,
+            }
+
+            # Complete multipart upload in order to hydrate file for availability
+            logger.debug("Complete payload: %s", complete_payload)
+            uri_path = "/api/v2/files/multipart/complete"
+            headers = {"Content-Type": "application/json"}
+            self.api_client.make_request(
+                uri_path=uri_path,
+                verb="POST",
+                headers=headers,
+                data=json.dumps(complete_payload),
+                raise_on_error=True,
+                use_api_key=True,
+            )
+
+            logger.debug("JSON payload: '%s'",
+                         json.dumps(complete_payload))
+
+            for job_part in self.multipart_siphon[job.file_md5]:
+                self.put_job(job_part)
+
+        return None
+
+    def is_complete(self):
+        # Get the number of files already uploaded as they are not passed to the Upload
+        # worker
+        file_store = self.metric_store.get("files")
+
+        if isinstance(file_store, dict):
+            already_completed_uploads = len(
+                [x for x in file_store.values() if x["already_uploaded"]]
+            )
+            queue_size = self.out_queue.qsize()
+            logger.debug(
+                "Is complete? out_queue_size=%s, completed_uploads=%s, task_count=%s",
+                queue_size,
+                already_completed_uploads,
+                self.task_count,
+            )
+
+            return (queue_size) >= self.task_count
+
+        else:
+            logger.debug("Is complete?: Files not initialized yet")
+            return False
+
+
+class Uploader(object):
     sleep_time = 10
-
+
     CLIENT_NAME = "Uploader"

     def __init__(self, args=None):
@@ -528,24 +605,23 @@ class Uploader(object):
         self.cancel = False
         self.error_messages = []
         self.num_files_to_process = 0
-
+
         self.report_status_thread = None
         self.monitor_status_thread = None
-
+
     def emit_progress(self, upload_stats):
-
-        if self.progress_callback:
+        if self.progress_callback:
             self.progress_callback(upload_stats)

     def prepare_workers(self):
         logger.debug("preparing workers...")
-
+
         if isinstance(threading.current_thread(), threading._MainThread):
             common.register_sigint_signal_handler()
+
         self.manager = None

-    def create_manager(self
-
+    def create_manager(self):
         job_description = [
             (
                 MD5Worker,
@@ -556,40 +632,31 @@ class Uploader(object):
                     "md5_caching": self.args["md5_caching"],
                 },
             ),
-            (
-
-            ),
-            (
-                HttpBatchWorker,
-                [],
-                {"thread_count": self.args["thread_count"], "project": project},
-            ),
-            (
-                FileStatWorker, [], {"thread_count": 1}
-            ),
-            (
-                UploadWorker, [], {"thread_count": self.args["thread_count"]}
-            ),
+            (MD5OutputWorker, [], {"thread_count": 1}),
+            (HttpBatchWorker, [], {"thread_count": 1}),
+            (UploadWorker, [], {"thread_count": self.args["thread_count"]}),
+            (MultiPartSiphonWorker, [], {"thread_count": 1})
         ]

         manager = worker.JobManager(job_description)
-        manager.start()
         return manager

     @common.dec_catch_exception(raise_=True)
     def report_status(self):
         logger.debug("started report_status thread")
-        update_interval =
+        update_interval = 15
         while True:
-
             # don't report status if we are doing a local_upload
             if not self.upload_id:
-                logger.debug(
+                logger.debug(
+                    "not updating status as we were not provided an upload_id")
                 return

             if self.working:
-                bytes_to_upload = self.manager.metric_store.get(
-
+                bytes_to_upload = self.manager.metric_store.get(
+                    "bytes_to_upload")
+                bytes_uploaded = self.manager.metric_store.get(
+                    "bytes_uploaded")
                 try:
                     status_dict = {
                         "upload_id": self.upload_id,
@@ -608,7 +675,7 @@ class Uploader(object):
                     logger.error("could not report status:")
                     logger.error(traceback.print_exc())
                     logger.error(traceback.format_exc())
-
+
             else:
                 break

@@ -616,14 +683,16 @@ class Uploader(object):

     def create_report_status_thread(self):
         logger.debug("creating reporter thread")
-        self.report_status_thread = threading.Thread(
+        self.report_status_thread = threading.Thread(
+            name="ReporterThread", target=self.report_status
+        )
         self.report_status_thread.daemon = True
         self.report_status_thread.start()

     @common.dec_catch_exception(raise_=True)
     def monitor_status(self, progress_handler):
         logger.debug("starting monitor_status thread")
-        update_interval =
+        update_interval = 5

         def sleep():
             time.sleep(update_interval)
@@ -631,19 +700,27 @@ class Uploader(object):
         while True:
             if self.working:
                 try:
-                    upload_stats = UploadStats.create
+                    upload_stats = UploadStats.create(
+                        self.manager.metric_store,
+                        self.num_files_to_process,
+                        self.job_start_time,
+                    )
                     progress_handler(upload_stats)
                 except Exception as e:
                     print(e)
                     print(traceback.format_exc())
-
+
             else:
                 break
             sleep()

     def create_monitor_status_thread(self):
         logger.debug("creating console status thread")
-        self.monitor_status_thread = threading.Thread(
+        self.monitor_status_thread = threading.Thread(
+            name="PrintStatusThread",
+            target=self.monitor_status,
+            args=(self.emit_progress,),
+        )

         # make sure threads don't stop the program from exiting
         self.monitor_status_thread.daemon = True
@@ -652,24 +729,69 @@ class Uploader(object):
         self.monitor_status_thread.start()

     def mark_upload_finished(self, upload_id, upload_files):
-
-
+        data = {
+            "upload_id": upload_id,
+            "status": "server_pending",
+            "upload_files": upload_files,
+        }

         self.api_client.make_request(
-            "/uploads/%s/finish" % upload_id,
+            "/uploads/%s/finish" % upload_id,
+            data=json.dumps(data),
+            verb="POST",
+            use_api_key=True,
         )
         return True

     def mark_upload_failed(self, error_message, upload_id):
-        logger.error("
+        logger.error("Upload failed: %s", error_message)

         # report error_message to the app
         self.api_client.make_request(
-            "/uploads/%s/fail" % upload_id,
+            "/uploads/%s/fail" % upload_id,
+            data=error_message,
+            verb="POST",
+            use_api_key=True,
         )

         return True

+    def assets_only(self, *paths):
+        processed_filepaths = file_utils.process_upload_filepaths(paths)
+        file_map = {path: None for path in processed_filepaths}
+        self.handle_upload_response(project=None, upload_files=file_map)
+
+        if common.SIGINT_EXIT or self.cancel:
+            print("\nUpload cancelled\n")
+
+        elif self.error_messages:
+            print("\nUpload of {} file completed with errors\n".format(len(file_map)))
+
+        else:
+            print("\nUpload of {} file completed\n".format(len(file_map)))
+
+        error_messages = []
+
+        for exception in self.error_messages:
+            error_messages.append(str(exception[1]))
+            traceback_message = "".join(
+                traceback.format_exception(None, exception[1], exception[2]))
+            print(traceback_message)
+            logger.error(traceback_message)
+
+        if error_messages:
+
+            log_file = loggeria.LOG_PATH
+            sys.stderr.write("\nError uploading files:\n")
+
+            for err_msg in error_messages:
+                sys.stderr.write("\t{}\n".format(err_msg))
+
+            sys.stderr.write(
+                "\nSee log {} for more details\n\n".format(log_file))
+
+        self.error_messages = []
+
     def handle_upload_response(self, project, upload_files, upload_id=None):
         """
         This is a really confusing method and should probably be split into to clear logic
@@ -679,7 +801,6 @@ class Uploader(object):
         only be fed uploads by the app which have valid projects attached to them.
         """
         try:
-
             logger.info("%s", " NEXT UPLOAD ".center(30, "#"))
             logger.info("project: %s", project)
             logger.info("upload_id is %s", upload_id)
@@ -691,7 +812,7 @@ class Uploader(object):

             # reset counters
             self.num_files_to_process = len(upload_files)
-            logger.debug(
+            logger.debug("Processing %s files", self.num_files_to_process)
             self.job_start_time = datetime.datetime.now()
             self.upload_id = upload_id
             self.job_failed = False
@@ -700,9 +821,17 @@ class Uploader(object):
             self.working = True

             self.prepare_workers()
-
+
+            # Adjust the number of threads
+            if self.num_files_to_process < self.args["thread_count"]:
+                self.args["thread_count"] = min(self.args["thread_count"], self.num_files_to_process)
+                logger.info(
+                    "Adjusting thread count to %s", self.args["thread_count"]
+                )
+
             # create worker pools
-            self.manager = self.create_manager(
+            self.manager = self.create_manager()
+            self.manager.start()

             # create reporters
             logger.debug("creating report status thread...")
@@ -711,65 +840,72 @@ class Uploader(object):
             # load tasks into worker pools
             for path in upload_files:
                 md5 = upload_files[path]
-                self.manager.add_task((path, md5))
+                self.manager.add_task((path, md5, project))

             logger.info("creating console status thread...")
-            self.create_monitor_status_thread()
+            self.create_monitor_status_thread()

-            #wait for work to finish
+            # wait for work to finish
             while not self.manager.is_complete():
-                logger.
-
+                logger.debug(
+                    "Manager is running, cancel requested?: %s", self.cancel)
+
+                if self.cancel or self.manager.error or common.SIGINT_EXIT:
                     self.error_messages = self.manager.stop_work()
                     logger.debug("Manager sucesfully stopped")
                     break
-
-                time.sleep(
-
+
+                time.sleep(5)
+
             # Shutdown the manager once all jobs are done
-            if not self.cancel
-                logger.debug("Waiting for Manager to join")
+            if not (self.cancel or self.manager.error or common.SIGINT_EXIT):
                 self.manager.join()

-            upload_stats = UploadStats.create(
+            upload_stats = UploadStats.create(
+                self.manager.metric_store,
+                self.num_files_to_process,
+                self.job_start_time,
+            )
             logger.info(upload_stats.get_formatted_text())
-            self.emit_progress(upload_stats)
-
-            logger.debug("
+            self.emit_progress(upload_stats)
+
+            logger.debug("Error_message: %s", self.error_messages)

             # signal to the reporter to stop working
             self.working = False
-
-
+
             logger.debug("Waiting for reporter status thread to join")
             self.report_status_thread.join()
-
+
             logger.debug("Waiting for print status thread to join")
-            self.monitor_status_thread.join()
+            self.monitor_status_thread.join()

             # Despite storing lots of data about new uploads, we will only send back the things
             # that have changed, to keep payloads small.
             finished_upload_files = {}
-            if self.upload_id:
+            if self.upload_id and not self.error_messages:
                 md5s = self.return_md5s()
                 for path in md5s:
-                    finished_upload_files[path] =
+                    finished_upload_files[path] = {
+                        "source": path, "md5": md5s[path]}

-            self.mark_upload_finished(
+            self.mark_upload_finished(
+                self.upload_id, finished_upload_files)

         except:
             self.error_messages.append(sys.exc_info())

     def main(self, run_one_loop=False):
-
         def show_ouput(upload_stats):
-
-
+            print(upload_stats.get_formatted_text())
+            logger.info("File Progress: %s", upload_stats.file_progress)
+
         self.progress_callback = show_ouput
-
+
         logger.info("Uploader Started. Checking for uploads...")

+        waiting_for_uploads_flag = False
+
         while not common.SIGINT_EXIT:
             try:
                 # TODO: we should pass args as url params, not http data
@@ -777,16 +913,27 @@ class Uploader(object):
                 data["location"] = self.location
                 logger.debug("Data: %s", data)
                 resp_str, resp_code = self.api_client.make_request(
-                    "/uploads/client/next",
+                    "/uploads/client/next",
+                    data=json.dumps(data),
+                    verb="PUT",
+                    use_api_key=True,
                 )
                 if resp_code == 204:
+                    if not waiting_for_uploads_flag:
+                        sys.stdout.write("\nWaiting for jobs to upload ")
+                        sys.stdout.flush()
+
                     logger.debug("no files to upload")
                     sys.stdout.write(".")
                     sys.stdout.flush()
                     time.sleep(self.sleep_time)
+                    waiting_for_uploads_flag = True
                     continue
+
                 elif resp_code != 201:
-                    logger.error(
+                    logger.error(
+                        "received invalid response code from app %s", resp_code
+                    )
                     logger.error("response is %s", resp_str)
                     time.sleep(self.sleep_time)
                     continue
@@ -796,6 +943,7 @@ class Uploader(object):
                 try:
                     json_data = json.loads(resp_str)
                     upload = json_data.get("data", {})
+
                 except ValueError:
                     logger.error("response was not valid json: %s", resp_str)
                     time.sleep(self.sleep_time)
@@ -806,24 +954,51 @@ class Uploader(object):
                 project = upload["project"]

                 self.handle_upload_response(project, upload_files, upload_id)
+
+                if self.error_messages:
+                    logger.info("Upload of entity %s failed with errors.", upload_id)
+
+                else:
+                    logger.info("Upload of entity %s completed.", upload_id)

-
-
+                upload_stats = UploadStats.create(
+                    self.manager.metric_store,
+                    self.num_files_to_process,
+                    self.job_start_time,
+                )
+                show_ouput(upload_stats)
+                logger.debug(self.manager.worker_queue_status_text())
+
+                error_messages = []
+
                 for exception in self.error_messages:
-
-
-
-
-
-
-
+                    error_messages.append(str(exception[1]))
+
+                if error_messages:
+                    self.mark_upload_failed(
+                        error_message="Uploader ERROR: {}".format(
+                            "\n".join(error_messages)),
+                        upload_id=upload_id
+                    )
+
+                    log_file = loggeria.LOG_PATH
+                    sys.stderr.write("\nError uploading files:\n")
+
+                    for err_msg in error_messages:
+                        sys.stderr.write("\t{}\n".format(err_msg))
+
+                    sys.stderr.write(
+                        "\nSee log {} for more details\n\n".format(log_file))
+
                 self.error_messages = []

+                waiting_for_uploads_flag = False
+
             except KeyboardInterrupt:
                 logger.info("ctrl-c exit")
                 break
-            except:
-                logger.exception("Caught exception:\n")
+            except Exception as err_msg:
+                logger.exception("Caught exception:\n%s", err_msg)
                 time.sleep(self.sleep_time)
                 continue

@@ -836,17 +1011,6 @@ class Uploader(object):
         """
         return self.manager.metric_store.get_dict("file_md5s")

-def set_logging(level=None, log_dirpath=None):
-    log_filepath = None
-    if log_dirpath:
-        log_filepath = os.path.join(log_dirpath, "conductor_ul_log")
-    loggeria.setup_conductor_logging(
-        logger_level=level,
-        console_formatter=LOG_FORMATTER,
-        file_formatter=LOG_FORMATTER,
-        log_filepath=log_filepath,
-    )
-

 def run_uploader(args):
     """
@@ -856,18 +1020,37 @@ def run_uploader(args):
     # convert the Namespace object to a dictionary
     args_dict = vars(args)
     cfg = config.config().config
-
-    api_client.ApiClient.register_client(
+
+    api_client.ApiClient.register_client(
+        client_name=Uploader.CLIENT_NAME, client_version=ciocore.version
+    )

     # Set up logging
     log_level_name = args_dict.get("log_level") or cfg["log_level"]
-
-
-
+
+    loggeria.setup_conductor_logging(
+        logger_level=loggeria.LEVEL_MAP.get(log_level_name),
+        log_dirpath=args_dict.get("log_dir"),
+        log_filename="conductor_uploader.log",
+        disable_console_logging=not args_dict["log_to_console"],
+        use_system_log=False,
+    )
+
+    print("Logging to %s", loggeria.LOG_PATH)
+
     logger.debug("Uploader parsed_args is %s", args_dict)
+
     resolved_args = resolve_args(args_dict)
     uploader = Uploader(resolved_args)
-
+
+    if args.paths:
+        processed_filepaths = file_utils.process_upload_filepaths(
+            args.paths[0])
+        file_map = {path: None for path in processed_filepaths}
+        uploader.handle_upload_response(project=None, upload_files=file_map)
+
+    else:
+        uploader.main()


 def get_file_info(filepath):
@@ -891,7 +1074,7 @@ def resolve_args(args):
     Resolve all arguments, reconciling differences between command line args and config.yml args.
     See resolve_arg function.
     """
-
+
     args["md5_caching"] = resolve_arg("md5_caching", args)
     args["database_filepath"] = resolve_arg("database_filepath", args)
     args["location"] = resolve_arg("location", args)
@@ -899,18 +1082,18 @@ def resolve_args(args):

     return args

+
 def resolve_arg(key, args):
     """
     If the key doesn't exist (or is None), grab it from the config.
     """
-
+
     cfg = config.config().config
-    config_value = cfg.get(key)
-
+    config_value = cfg.get(key)
+
     value = args.get(key, config_value)
-
+
     if value is None:
         value = config_value
-
+
     return value
-