ciocore 5.1.1__py2.py3-none-any.whl → 10.0.0b3__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (126)
  1. ciocore/VERSION +1 -1
  2. ciocore/__init__.py +23 -1
  3. ciocore/api_client.py +655 -160
  4. ciocore/auth/__init__.py +5 -3
  5. ciocore/cli.py +501 -0
  6. ciocore/common.py +15 -13
  7. ciocore/conductor_submit.py +77 -60
  8. ciocore/config.py +127 -13
  9. ciocore/data.py +162 -77
  10. ciocore/docsite/404.html +746 -0
  11. ciocore/docsite/apidoc/api_client/index.html +3605 -0
  12. ciocore/docsite/apidoc/apidoc/index.html +909 -0
  13. ciocore/docsite/apidoc/config/index.html +1652 -0
  14. ciocore/docsite/apidoc/data/index.html +1553 -0
  15. ciocore/docsite/apidoc/hardware_set/index.html +2460 -0
  16. ciocore/docsite/apidoc/package_environment/index.html +1507 -0
  17. ciocore/docsite/apidoc/package_tree/index.html +2386 -0
  18. ciocore/docsite/assets/_mkdocstrings.css +16 -0
  19. ciocore/docsite/assets/images/favicon.png +0 -0
  20. ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js +29 -0
  21. ciocore/docsite/assets/javascripts/bundle.471ce7a9.min.js.map +7 -0
  22. ciocore/docsite/assets/javascripts/lunr/min/lunr.ar.min.js +1 -0
  23. ciocore/docsite/assets/javascripts/lunr/min/lunr.da.min.js +18 -0
  24. ciocore/docsite/assets/javascripts/lunr/min/lunr.de.min.js +18 -0
  25. ciocore/docsite/assets/javascripts/lunr/min/lunr.du.min.js +18 -0
  26. ciocore/docsite/assets/javascripts/lunr/min/lunr.el.min.js +1 -0
  27. ciocore/docsite/assets/javascripts/lunr/min/lunr.es.min.js +18 -0
  28. ciocore/docsite/assets/javascripts/lunr/min/lunr.fi.min.js +18 -0
  29. ciocore/docsite/assets/javascripts/lunr/min/lunr.fr.min.js +18 -0
  30. ciocore/docsite/assets/javascripts/lunr/min/lunr.he.min.js +1 -0
  31. ciocore/docsite/assets/javascripts/lunr/min/lunr.hi.min.js +1 -0
  32. ciocore/docsite/assets/javascripts/lunr/min/lunr.hu.min.js +18 -0
  33. ciocore/docsite/assets/javascripts/lunr/min/lunr.hy.min.js +1 -0
  34. ciocore/docsite/assets/javascripts/lunr/min/lunr.it.min.js +18 -0
  35. ciocore/docsite/assets/javascripts/lunr/min/lunr.ja.min.js +1 -0
  36. ciocore/docsite/assets/javascripts/lunr/min/lunr.jp.min.js +1 -0
  37. ciocore/docsite/assets/javascripts/lunr/min/lunr.kn.min.js +1 -0
  38. ciocore/docsite/assets/javascripts/lunr/min/lunr.ko.min.js +1 -0
  39. ciocore/docsite/assets/javascripts/lunr/min/lunr.multi.min.js +1 -0
  40. ciocore/docsite/assets/javascripts/lunr/min/lunr.nl.min.js +18 -0
  41. ciocore/docsite/assets/javascripts/lunr/min/lunr.no.min.js +18 -0
  42. ciocore/docsite/assets/javascripts/lunr/min/lunr.pt.min.js +18 -0
  43. ciocore/docsite/assets/javascripts/lunr/min/lunr.ro.min.js +18 -0
  44. ciocore/docsite/assets/javascripts/lunr/min/lunr.ru.min.js +18 -0
  45. ciocore/docsite/assets/javascripts/lunr/min/lunr.sa.min.js +1 -0
  46. ciocore/docsite/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +1 -0
  47. ciocore/docsite/assets/javascripts/lunr/min/lunr.sv.min.js +18 -0
  48. ciocore/docsite/assets/javascripts/lunr/min/lunr.ta.min.js +1 -0
  49. ciocore/docsite/assets/javascripts/lunr/min/lunr.te.min.js +1 -0
  50. ciocore/docsite/assets/javascripts/lunr/min/lunr.th.min.js +1 -0
  51. ciocore/docsite/assets/javascripts/lunr/min/lunr.tr.min.js +18 -0
  52. ciocore/docsite/assets/javascripts/lunr/min/lunr.vi.min.js +1 -0
  53. ciocore/docsite/assets/javascripts/lunr/min/lunr.zh.min.js +1 -0
  54. ciocore/docsite/assets/javascripts/lunr/tinyseg.js +206 -0
  55. ciocore/docsite/assets/javascripts/lunr/wordcut.js +6708 -0
  56. ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js +42 -0
  57. ciocore/docsite/assets/javascripts/workers/search.b8dbb3d2.min.js.map +7 -0
  58. ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css +1 -0
  59. ciocore/docsite/assets/stylesheets/main.3cba04c6.min.css.map +1 -0
  60. ciocore/docsite/assets/stylesheets/palette.06af60db.min.css +1 -0
  61. ciocore/docsite/assets/stylesheets/palette.06af60db.min.css.map +1 -0
  62. ciocore/docsite/cmdline/docs/index.html +871 -0
  63. ciocore/docsite/cmdline/downloader/index.html +934 -0
  64. ciocore/docsite/cmdline/packages/index.html +878 -0
  65. ciocore/docsite/cmdline/uploader/index.html +995 -0
  66. ciocore/docsite/how-to-guides/index.html +869 -0
  67. ciocore/docsite/index.html +895 -0
  68. ciocore/docsite/logo.png +0 -0
  69. ciocore/docsite/objects.inv +0 -0
  70. ciocore/docsite/search/search_index.json +1 -0
  71. ciocore/docsite/sitemap.xml +3 -0
  72. ciocore/docsite/sitemap.xml.gz +0 -0
  73. ciocore/docsite/stylesheets/extra.css +26 -0
  74. ciocore/docsite/stylesheets/tables.css +167 -0
  75. ciocore/downloader/base_downloader.py +644 -0
  76. ciocore/downloader/download_runner_base.py +47 -0
  77. ciocore/downloader/job_downloader.py +119 -0
  78. ciocore/{downloader.py → downloader/legacy_downloader.py} +12 -9
  79. ciocore/downloader/log.py +73 -0
  80. ciocore/downloader/logging_download_runner.py +87 -0
  81. ciocore/downloader/perpetual_downloader.py +63 -0
  82. ciocore/downloader/registry.py +97 -0
  83. ciocore/downloader/reporter.py +135 -0
  84. ciocore/exceptions.py +8 -2
  85. ciocore/file_utils.py +51 -50
  86. ciocore/hardware_set.py +449 -0
  87. ciocore/loggeria.py +89 -20
  88. ciocore/package_environment.py +110 -48
  89. ciocore/package_query.py +182 -0
  90. ciocore/package_tree.py +319 -258
  91. ciocore/retry.py +0 -0
  92. ciocore/uploader/_uploader.py +547 -364
  93. ciocore/uploader/thread_queue_job.py +176 -0
  94. ciocore/uploader/upload_stats/__init__.py +3 -4
  95. ciocore/uploader/upload_stats/stats_formats.py +10 -4
  96. ciocore/validator.py +34 -2
  97. ciocore/worker.py +174 -151
  98. ciocore-10.0.0b3.dist-info/METADATA +928 -0
  99. ciocore-10.0.0b3.dist-info/RECORD +128 -0
  100. {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/WHEEL +1 -1
  101. ciocore-10.0.0b3.dist-info/entry_points.txt +2 -0
  102. tests/instance_type_fixtures.py +175 -0
  103. tests/package_fixtures.py +205 -0
  104. tests/test_api_client.py +297 -12
  105. tests/test_base_downloader.py +104 -0
  106. tests/test_cli.py +149 -0
  107. tests/test_common.py +1 -7
  108. tests/test_config.py +40 -18
  109. tests/test_data.py +162 -173
  110. tests/test_downloader.py +118 -0
  111. tests/test_hardware_set.py +139 -0
  112. tests/test_job_downloader.py +213 -0
  113. tests/test_package_query.py +38 -0
  114. tests/test_package_tree.py +91 -291
  115. tests/test_submit.py +44 -18
  116. tests/test_uploader.py +1 -4
  117. ciocore/__about__.py +0 -10
  118. ciocore/cli/conductor.py +0 -191
  119. ciocore/compat.py +0 -15
  120. ciocore-5.1.1.data/scripts/conductor +0 -19
  121. ciocore-5.1.1.data/scripts/conductor.bat +0 -13
  122. ciocore-5.1.1.dist-info/METADATA +0 -408
  123. ciocore-5.1.1.dist-info/RECORD +0 -47
  124. tests/mocks/api_client_mock.py +0 -51
  125. /ciocore/{cli → downloader}/__init__.py +0 -0
  126. {ciocore-5.1.1.dist-info → ciocore-10.0.0b3.dist-info}/top_level.txt +0 -0
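
The hunks below all come from ciocore/uploader/_uploader.py (entry 92 above), which is rewritten around a thread_queue_job.ThreadQueueJob object instead of bare tuples. For orientation, here is a minimal sketch of the worker chain that the new Uploader.create_manager() declares; the class names, tuple shape, and JobManager call are taken from the diff that follows, while the import paths and argument values are illustrative assumptions rather than a verbatim excerpt:

    # Sketch only: 10.0.0b3 chains five workers; 5.1.1's FileStatWorker stage is gone
    # and multipart completion now happens in MultiPartSiphonWorker.
    from ciocore import worker
    from ciocore.uploader import _uploader as up  # module path assumed from the file listing

    job_description = [
        (up.MD5Worker, [], {"thread_count": 4, "database_filepath": None, "md5_caching": True}),
        (up.MD5OutputWorker, [], {"thread_count": 1}),
        (up.HttpBatchWorker, [], {"thread_count": 1}),
        (up.UploadWorker, [], {"thread_count": 4}),
        (up.MultiPartSiphonWorker, [], {"thread_count": 1}),
    ]
    manager = worker.JobManager(job_description)  # started later via manager.start()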
@@ -1,32 +1,32 @@
  import datetime
- import importlib
  import json
  import logging
  import os
+ import pathlib
+ import requests.exceptions
  import sys
  import time
  import threading
  import traceback

- try:
- import Queue as queue
- except ImportError:
- import queue
-
  import ciocore
- from ciocore import config
- from ciocore import api_client, common, worker, client_db, loggeria
- from ciocore.common import CONDUCTOR_LOGGER_NAME
+ from ciocore import (
+ api_client,
+ client_db,
+ common,
+ config,
+ file_utils,
+ loggeria,
+ worker,
+ exceptions,
+ )

- from .upload_stats import UploadStats
+ from . import thread_queue_job

- LOG_FORMATTER = logging.Formatter(
- "%(asctime)s %(name)s%(levelname)9s %(threadName)s: %(message)s"
- )
- logger = logging.getLogger(CONDUCTOR_LOGGER_NAME)
+ from .upload_stats import UploadStats

- SINGLEPART = "singlepart"
- MULTIPART = "multipart"
+ logger = logging.getLogger(
+ "{}.uploader".format(loggeria.CONDUCTOR_LOGGER_NAME))


  class MD5Worker(worker.ThreadWorker):
@@ -43,26 +43,33 @@ class MD5Worker(worker.ThreadWorker):

  def do_work(self, job, thread_int):
  logger.debug("job is %s", job)
- filename, submission_time_md5 = job
- filename = str(filename)
- current_md5, cache_hit = self.get_md5(filename)
+
+ current_md5, cache_hit = self.get_md5(job.path)

  # if a submission time md5 was provided then check against it
- if submission_time_md5:
- logger.info("Enforcing md5 match: %s for: %s", submission_time_md5, filename)
- if current_md5 != submission_time_md5:
- message = "MD5 of %s has changed since submission\n" % filename
- message += "submitted md5: %s\n" % submission_time_md5
+ if job.file_md5:
+ logger.info(
+ "Enforcing md5 match: %s for: %s", job.file_md5, job.path
+ )
+ if current_md5 != job.file_md5:
+ message = "MD5 of %s has changed since submission\n" % job.path
+ message += "submitted md5: %s\n" % job.file_md5
  message += "current md5: %s\n" % current_md5
- message += "This is likely due to the file being written to after the user"
+ message += (
+ "This is likely due to the file being written to after the user"
+ )
  message += " submitted the job but before it got uploaded to conductor"
  logger.error(message)
- raise Exception(message)
- self.metric_store.set_dict("file_md5s", filename, current_md5)
- self.metric_store.set_dict("file_md5s_cache_hit", filename, cache_hit)
- size_bytes = os.path.getsize(filename)
-
- return (filename, current_md5, size_bytes)
+ raise exceptions.UploadError(message)
+
+ else:
+ job.file_md5 = current_md5
+
+ self.metric_store.set_dict("file_md5s", job.path, current_md5)
+ self.metric_store.set_dict("file_md5s_cache_hit", job.path, cache_hit)
+ job.file_size = os.path.getsize(job.path)
+
+ return job

  def get_md5(self, filepath):
  """
@@ -71,9 +78,9 @@ class MD5Worker(worker.ThreadWorker):
  Use the sqlite db cache to retrive this (if the cache is valid), otherwise generate the md5
  from scratch
  """
-
+
  cache_hit = True
-
+
  # If md5 caching is disable, then just generate the md5 from scratch
  if not self.md5_caching:
  cache_hit = False
@@ -99,7 +106,9 @@ class MD5Worker(worker.ThreadWorker):
  """
  Store the given file_info into the database
  """
- client_db.FilesDB.add_file(file_info, db_filepath=self.database_filepath, thread_safe=True)
+ client_db.FilesDB.add_file(
+ file_info, db_filepath=self.database_filepath, thread_safe=True
+ )


  class MD5OutputWorker(worker.ThreadWorker):
@@ -110,135 +119,111 @@ class MD5OutputWorker(worker.ThreadWorker):

  def __init__(self, *args, **kwargs):
  super(MD5OutputWorker, self).__init__(*args, **kwargs)
- self.batch_size = 20 # the controlls the batch size for http get_signed_urls
+ self.batch_size = 20 # the controls the batch size for http get_signed_urls
  self.wait_time = 2
- self.batch = []
+ self.batch = {}

  def check_for_poison_pill(self, job):
  """we need to make sure we ship the last batch before we terminate"""
  if job == self.PoisonPill():
  logger.debug("md5outputworker got poison pill")
  self.ship_batch()
- self.mark_done()
- _thread.exit()
+ super(MD5OutputWorker, self).check_for_poison_pill(job)

  # helper function to ship batch
  def ship_batch(self):
  if self.batch:
  logger.debug("sending batch: %s", self.batch)
  self.put_job(self.batch)
- self.batch = []
+ self.batch = {}

  @common.dec_catch_exception(raise_=True)
  def target(self, thread_int):
-
  while not common.SIGINT_EXIT:
-
  job = None

  try:
  logger.debug("Worker querying for job")
  job = self.in_queue.get(block=True, timeout=self.wait_time)
  logger.debug("Got job")
- queue_size = self.in_queue.qsize()
-
+ queue_size = self.in_queue.qsize()
+
  except:
-
  logger.debug("No jobs available")
-
- if self._job_counter.value > 0:
-
+
+ if self._job_counter.value >= self.task_count:
  if self.batch:
  self.ship_batch()
-
- logger.debug("Worker has completed all of its tasks ({})".format(job))
+
+ logger.debug(
+ "Worker has completed all of its tasks (%s)", job)
  self.thread_complete_counter.decrement()
  break
-
- else:
+
+ elif self._job_counter.value == 0:
  logger.debug("Worker waiting for first job")
- continue
-
- logger.debug("Worker got job {}".format(job))
- self._job_counter.increment()
- logger.debug("Processing Job '{}' #{} on {}. {} tasks remaining in queue".format( job,
- self._job_counter.value,
- self,
- queue_size))

- self.check_for_poison_pill(job)
-
- # add file info to the batch list
- self.batch.append(
- {
- "path": job[0],
- "hash": job[1],
- "size": job[2],
- }
+ time.sleep(1)
+ continue
+
+ logger.debug("Worker got job %s", job)
+ self._job_counter.increment()
+ logger.debug(
+ "Processing Job '%s' #%s on %s. %s tasks remaining in queue",
+ job,
+ self._job_counter.value,
+ self,
+ queue_size,
  )

- # if the batch is self.batch_size, ship it
- if len(self.batch) == self.batch_size:
- self.ship_batch()
+ try:
+ self.check_for_poison_pill(job)
+
+ # add file info to the batch list
+ self.batch[job.path] = job
+
+ # if the batch is self.batch_size, ship it
+ if len(self.batch) == self.batch_size:
+ self.ship_batch()
+
+ # mark this task as done
+ self.mark_done()
+
+ except Exception as exception:
+ logger.exception(
+ 'CAUGHT EXCEPTION on job "%s" [%s]:\n', job, self)

- # mark this task as done
- self.mark_done()
+ # if there is no error queue to dump data into, then simply raise the exception
+ if self.error_queue is None:
+ raise
+
+ self.error_queue.put(sys.exc_info())
+ # exit the while loop to stop the thread
+ break


  class HttpBatchWorker(worker.ThreadWorker):
  """
- This worker receives a batched list of files (path, hash, size) and makes an batched http api
- call which returns a mixture of multiPartURLs (if any) and singlePartURLs (if any).
-
- in_queue: [
- {
- "path": "/linux64/bin/animate",
- "hash": "c986fb5f1c9ccf47eecc645081e4b108",
- "size": 2147483648
- },
- {
- "path": "/linux64/bin/tiff2ps",
- "hash": " fd27a8f925a72e788ea94997ca9a21ca",
- "size": 123
- },
- ]
- out_queue: {"multiPartURLs": [
- {
- "uploadID": "FqzC8mkGxTsLzAR5CuBv771an9D5WLthLbl_xFKCaqKEdqf",
- "filePath": "/linux64/bin/animate",
- "md5": "c986fb5f1c9ccf47eecc645081e4b108",
- "partSize": 1073741824,
- "parts": [
- {
- "partNumber": 1,
- "url": "https://www.signedurlexample.com/signature1"
- },
- {
- "partNumber": 2,
- "url": "https://www.signedurlexample.com/signature1"
- }
- ]
- }
- ],
- "singlePartURLs": [
- {
- "filePath": "/linux64/bin/tiff2ps",
- "fileSize": 123,
- "preSignedURL": "https://www.signedurlexample.com/signature2"
- }
- ]
- }
+ This worker recieves a list of ThreadQueue Jobs with path, hash, size attributes. It generates
+ the request to the back-end to get signed upload urls for each file in the batch. The result
+ can be a mix of multi-part and single-part upload urls - each one has a unique set of data.
+
+ If a requested file isn't part of the result, it indicates that it already exists on the bucket
+ and hence has been previously uploaded.
+
+ This will will add values for upload_type, presigned_url, part_size, parts, kms_key_name and
+ upload_id (of the file - NOT the Upload entity) to each job.
  """

  def __init__(self, *args, **kwargs):
  super(HttpBatchWorker, self).__init__(*args, **kwargs)
  self.api_client = api_client.ApiClient()
- self.project = kwargs.get("project")

- def make_request(self, job):
+ def make_request(self, jobs):
  uri_path = "/api/v2/files/get_upload_urls"
  headers = {"Content-Type": "application/json"}
- data = {"upload_files": job, "project": self.project}
+ data = {"upload_files": thread_queue_job.ThreadQueueJob.format_for_upload_request(jobs.values()),
+ "project": list(jobs.values())[0].project}

  response_str, response_code = self.api_client.make_request(
  uri_path=uri_path,
@@ -252,86 +237,83 @@ class HttpBatchWorker(worker.ThreadWorker):
  if response_code == 200:
  url_list = json.loads(response_str)
  return url_list
+
  if response_code == 204:
  return None
- raise Exception("%s Failed request to: %s\n%s" % (response_code, uri_path, response_str))
-
- def do_work(self, job, thread_int):
- logger.debug("getting upload urls for %s", job)
- result = self.make_request(job)

- # Determine which files have already been uploaded by looking at the difference betweeen
- # the file paths in job and the file paths returned by the request. Only files that need
- # to be uploaded are returned by the request
- incoming_file_paths = set([ item['path'] for item in job] )
+ raise exceptions.UploadError(
+ "%s Failed request to: %s\n%s" % (
+ response_code, uri_path, response_str)
+ )

+ def do_work(self, jobs, thread_int):
+ logger.debug("Getting upload urls for %s", jobs)
+ result = self.make_request(jobs)
+ logger.debug("Got result: %s", result)
+
+ # Determine which files have already been uploaded by looking at the difference between
+ # the file paths in job and the file paths returned by the request. Only files that need
+ # to be uploaded are returned by the request.
+ # Ideally, the MD5 would be used as the key but because the MD5 isn't returned for single-
+ # part files, we have to use the file path instead.
  if result:
-
- for item_type in result.values():
- for item in item_type:
- incoming_file_paths.remove(item['filePath'])
-
- for path in incoming_file_paths:
- self.metric_store.increment("already_uploaded", True, path)
-
- return result
+ for upload_type, items in result.items():
+ for item in items:

- """
- This worker subscribes to a queue of list of file uploads (multipart and singlepart).
+ job_key = item["filePath"]

- For each item on the queue, it uses the HttpBatchWorker response payload fileSize (bytes) to be
- uploaded, and aggregates the total size for all uploads.
+ logger.debug("Matching %s in request", job_key)

- It then places a tuple of (filepath, file_size, upload, type of upload(multipart or singlepart))
- onto the out_queue
+ jobs[job_key].upload_type = upload_type
+ jobs[job_key].kms_key_name = result.get('kmsKeyName')

- The bytes_to_upload arg is used to hold the aggregated size of all files that need to be uploaded.
- Note: This is stored as an [int] in order to pass it by reference, as it needs to be accessed and
- reset by the caller.
- """
+ self.metric_store.increment(
+ "bytes_to_upload", jobs[job_key].file_size, item["filePath"])
+ self.metric_store.increment("num_files_to_upload")

+ if upload_type == "multiPartURLs":
+ jobs[job_key].part_size = item["partSize"]
+ jobs[job_key].set_parts(item["parts"])
+ jobs[job_key].file_upload_id = item.get("uploadID")

- class FileStatWorker(worker.ThreadWorker):
- def __init__(self, *args, **kwargs):
- super(FileStatWorker, self).__init__(*args, **kwargs)
+ elif upload_type == "singlePartURLs":
+ jobs[job_key].presigned_url = item["preSignedURL"]

- def do_work(self, job, thread_int):
- """
- Job is a list of file uploads (multipart and singlepart) returned from File API. The
- FileStatWorker iterates through the list. For each item, it aggregates the filesize in
- bytes, and passes the upload into the UploadWorker queue.
- """
+ else:
+ raise exceptions.UploadError("Unknown upload_type '{}' for {}".format(upload_type,
+ item))

- if job:
- # iterate through singlepart urls
- for singlepart_upload in job.get("singlePartURLs", []):
- path = singlepart_upload["filePath"]
- file_size = singlepart_upload["fileSize"]
- upload_url = singlepart_upload["preSignedURL"]
-
- self.metric_store.increment("bytes_to_upload", file_size, path)
- self.metric_store.increment("num_files_to_upload")
-
- self.put_job((path, file_size, upload_url, SINGLEPART))
-
- # iterate through multipart
- for multipart_upload in job.get("multiPartURLs", []):
- path = multipart_upload["filePath"]
- file_size = multipart_upload["fileSize"]
-
- self.metric_store.increment("bytes_to_upload", file_size, path)
- self.metric_store.increment("num_files_to_upload")
-
- self.put_job((path, file_size, multipart_upload, MULTIPART))
+ # If a job has no upload_type, it indicates it wasn't part of the result
+ # above and has already been uploaded.
+ # If it's a multipart job we need to split it into a job per part (to allow
+ # for parallelization of the uploads).
+ for job_count, job in enumerate(jobs.values()):
+
+ if job.upload_type is None:
+ job.already_uploaded = True
+ self.metric_store.increment("already_uploaded", True, job.path)
+
+ if job.is_multipart():
+ logger.debug(
+ "Job is multipart: %s, splitting parts into separate jobs", job)
+ for part_job in job.create_multipart_jobs():
+ self.put_job(part_job)
+
+ else:
+ logger.debug("Job is singlepart: %s, adding to out_queue", job)
+ self.put_job(job)
+
+ # The job counter is already incremented in target() once, so skip the first
+ # iteration
+ if job_count > 0:
+ self._job_counter.increment()

- # make sure we return None, so no message is automatically added to the out_queue
  return None


  class UploadWorker(worker.ThreadWorker):
  """
- This worker receives a either (filepath: signed_upload_url) pair or (filepath: multipart (dict))
- and performs an upload of the specified file to the provided url.
+ This worker receives a thread_queue_job.ThreadQueueJob and performs the upload.
  """

  def __init__(self, *args, **kwargs):
@@ -339,7 +321,6 @@ class UploadWorker(worker.ThreadWorker):
  self.chunk_size = 1048576 # 1M
  self.report_size = 10485760 # 10M
  self.api_client = api_client.ApiClient()
- self.project = kwargs.get("project")

  def chunked_reader(self, filename):
  with open(filename, "rb") as fp:
@@ -352,37 +333,45 @@ class UploadWorker(worker.ThreadWorker):
  yield data

  # report upload progress
- self.metric_store.increment("bytes_uploaded", len(data), filename)
+ self.metric_store.increment(
+ "bytes_uploaded", len(data), filename)

  def do_work(self, job, thread_int):
-
- if job:
- filename = job[0]
- file_size = job[1]
- upload = job[2]
- upload_type = job[3]
-
- md5 = self.metric_store.get_dict("file_md5s", filename)
-
- try:
- if upload_type == SINGLEPART:
- return self.do_singlepart_upload(upload, filename, file_size, md5)
- elif upload_type == MULTIPART:
- return self.do_multipart_upload(upload, filename, md5)
-
- raise Exception("upload_type neither %s or %s" % (SINGLEPART, MULTIPART))
- except:
- logger.exception("Failed to upload file: %s because of:\n", filename)
- real_md5 = common.get_base64_md5(filename)
- error_message = "ALERT! File %s retried and still failed!\n" % filename
- error_message += "expected md5 is %s, real md5 is %s" % (md5, real_md5)
- logger.error(error_message)
- raise
-
- return None
+
+ if not job:
+ return worker.EMPTY_JOB
+
+ if job.already_uploaded:
+ logger.debug("Job is already uploaded: %s", job.path)
+ return job
+
+ try:
+ if job.is_multipart():
+ return self.do_multipart_upload(job)
+
+ else:
+ return self.do_singlepart_upload(job)
+
+ except Exception as err_msg:
+ real_md5 = common.get_base64_md5(job.path)
+
+ # Gather helpful details from the exception
+ exc_tb = sys.exc_info()[2]
+ exception_line_num = exc_tb.tb_lineno
+ exception_file = pathlib.Path(
+ exc_tb.tb_frame.f_code.co_filename).name
+
+ if isinstance(err_msg, requests.exceptions.HTTPError):
+ error_message = f"Upload of {job.path} failed with a response code {err_msg.response.status_code} ({err_msg.response.reason}) (expected '{job.file_md5}', got '{real_md5}')"
+ else:
+ error_message = (
+ f"Upload of {job.path} failed. (expected '{job.file_md5}', got '{real_md5}') {str(err_msg)} [{exception_file}-{exception_line_num}]"
+ )
+
+ raise exceptions.UploadError(error_message)

  @common.DecRetry(retry_exceptions=api_client.CONNECTION_EXCEPTIONS, tries=5)
- def do_singlepart_upload(self, upload_url, filename, file_size, md5):
+ def do_singlepart_upload(self, job):
  """
  Note that for GCS we don't rely on the make_request's own retry mechanism because we need to
  recreate the chunked_reader generator before retrying the request. Instead, we wrap this
@@ -392,19 +381,19 @@ class UploadWorker(worker.ThreadWorker):
  headers that S3 does not accept.
  """

- if "amazonaws" in upload_url:
+ if job.is_vendor_aws() or job.is_vendor_cw():
  # must declare content-length ourselves due to zero byte bug in requests library.
  # api_client.make_prepared_request docstring.
  headers = {
  "Content-Type": "application/octet-stream",
- "Content-Length": str(file_size),
+ "Content-Length": str(job.file_size),
  }

- with open(filename, "rb") as fh:
+ with open(job.path, "rb") as fh:
  # TODO: support chunked
  response = self.api_client.make_prepared_request(
  verb="PUT",
- url=upload_url,
+ url=job.presigned_url,
  headers=headers,
  params=None,
  data=fh,
@@ -418,79 +407,59 @@ class UploadWorker(worker.ThreadWorker):
  response.close()

  # report upload progress
- self.metric_store.increment("bytes_uploaded", file_size, filename)
+ self.metric_store.increment(
+ "bytes_uploaded", job.file_size, job.path)
+
  else:
- headers = {"Content-MD5": md5, "Content-Type": "application/octet-stream"}
+ headers = {"Content-MD5": job.file_md5,
+ "Content-Type": "application/octet-stream"}
+
+ if job.kms_key_name is not None:
+ headers["x-goog-encryption-kms-key-name"] = job.kms_key_name

- return self.api_client.make_request(
- conductor_url=upload_url,
+ response = self.api_client.make_request(
+ conductor_url=job.presigned_url,
  headers=headers,
- data=self.chunked_reader(filename),
+ data=self.chunked_reader(job.path),
  verb="PUT",
  tries=1,
  use_api_key=True,
  )

- def do_multipart_upload(self, upload, filename, md5):
+ logger.debug("Response from upload: %s", response)
+
+ return job
+
+ def do_multipart_upload(self, job):
  """
  Files will be split into partSize returned by the FileAPI and hydrated once all parts are
  uploaded. On successful part upload, response headers will contain an ETag. This value must
  be tracked along with the part number in order to complete and hydrate the file.
  """
- uploads = []
- complete_payload = {
- "uploadID": upload["uploadID"],
- "hash": md5,
- "completedParts": [],
- "project": self.project,
- }

- # iterate over parts and upload
- for part in upload["parts"]:
- resp_headers = self._do_multipart_upload(
- upload_url=part["url"],
- filename=filename,
- part_number=part["partNumber"],
- part_size=upload["partSize"],
- )
+ resp_headers = self._do_multipart_upload(job)

- if resp_headers:
- uploads.append(upload["uploadID"])
- completed_part = {
- "partNumber": part["partNumber"],
- "etag": resp_headers["ETag"].strip('"'),
- }
- complete_payload["completedParts"].append(completed_part)
+ if resp_headers:
+ job.etag = resp_headers["ETag"].strip('"')

- # Complete multipart upload in order to hydrate file for availability
- uri_path = "/api/v2/files/multipart/complete"
- headers = {"Content-Type": "application/json"}
- self.api_client.make_request(
- uri_path=uri_path,
- verb="POST",
- headers=headers,
- data=json.dumps(complete_payload),
- raise_on_error=True,
- use_api_key=True,
- )
-
- return uploads
+ return job

  @common.DecRetry(retry_exceptions=api_client.CONNECTION_EXCEPTIONS, tries=5)
- def _do_multipart_upload(self, upload_url, filename, part_number, part_size):
- with open(filename, "rb") as fh:
+ def _do_multipart_upload(self, job):
+
+ with open(job.path, "rb") as fh:
  # seek to the correct part position
- start = (part_number - 1) * part_size
+ start = (job.part_index - 1) * job.part_size
  fh.seek(start)

  # read up to part size determined by file-api
- data = fh.read(part_size)
+ data = fh.read(job.part_size)
  content_length = len(data)

  # upload part
  response = self.api_client.make_prepared_request(
  verb="PUT",
- url=upload_url,
+ url=job.presigned_url,
  headers={"Content-Type": "application/octet-stream"},
  params=None,
  data=data,
@@ -501,19 +470,127 @@ class UploadWorker(worker.ThreadWorker):
  )

  # report upload progress
- self.metric_store.increment("bytes_uploaded", content_length, filename)
+ self.metric_store.increment(
+ "bytes_uploaded", content_length, job.path)

  # close response object to add back to pool
  # https://requests.readthedocs.io/en/master/user/advanced/#body-content-workflow
  response.close()

+ logger.debug("Response from multipart upload: %s", response)
+
  return response.headers


- class Uploader(object):
+ class MultiPartSiphonWorker(worker.ThreadWorker):
+ """
+ This class is responsible for gathering all the jobs (aka files) and ensuring
+ the necessary steps are taken to have them available to be used by a Conductor Job.
+
+ For single-part files, this simply means passing the job to the out_queue so
+ that the Uploader is aware that the file has been sucesfully uploaded.
+
+ For multi-part files, this means collecting all the parts together and then
+ sending a request to the backend indicating that the file is complete.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super(MultiPartSiphonWorker, self).__init__(*args, **kwargs)
+
+ self.api_client = api_client.ApiClient()
+ self.multipart_siphon = {}
+
+ def do_work(self, job, thread_int):
+ """
+ Process files that have already been uploaded.
+
+ If it's a single-part file, add the job to the out queue, so that it can
+ be used to determine if the Upload entity is complete.
+
+ If it's a multi-part upload, collect all the parts together. Once all the
+ parts have been accumulated, mark it as complete and add the file to the
+ out queue.
+ """
+
+ if not job:
+ return None
+
+ if not job.is_multipart():
+ logger.debug("Job is not multipart (%s, %s)",
+ job.total_parts, job.part_index)
+
+ return job
+
+ if job.file_md5 not in self.multipart_siphon:
+ self.multipart_siphon[job.file_md5] = []
+
+ # Add to the task count for this worker.
+ # -1 because a task has already been added for a single file
+ # but not all its parts.
+ old_task_count = self.task_count
+ self.task_count += job.total_parts - 1
+ logger.debug("Incrementing task count to %s from %s",
+ self.task_count, old_task_count)
+
+ self.multipart_siphon[job.file_md5].append(job)
+
+ if len(self.multipart_siphon[job.file_md5]) == job.total_parts:

+ complete_payload = {
+ "uploadID": job.file_upload_id,
+ "hash": job.file_md5,
+ "completedParts": thread_queue_job.ThreadQueueJob.aggregate_parts(self.multipart_siphon[job.file_md5]),
+ "project": job.project,
+ }
+
+ # Complete multipart upload in order to hydrate file for availability
+ logger.debug("Complete payload: %s", complete_payload)
+ uri_path = "/api/v2/files/multipart/complete"
+ headers = {"Content-Type": "application/json"}
+ self.api_client.make_request(
+ uri_path=uri_path,
+ verb="POST",
+ headers=headers,
+ data=json.dumps(complete_payload),
+ raise_on_error=True,
+ use_api_key=True,
+ )
+
+ logger.debug("JSON payload: '%s'",
+ json.dumps(complete_payload))
+
+ for job_part in self.multipart_siphon[job.file_md5]:
+ self.put_job(job_part)
+
+ return None
+
+ def is_complete(self):
+ # Get the number of files already uploaded as they are not passed to the Upload
+ # worker
+ file_store = self.metric_store.get("files")
+
+ if isinstance(file_store, dict):
+ already_completed_uploads = len(
+ [x for x in file_store.values() if x["already_uploaded"]]
+ )
+ queue_size = self.out_queue.qsize()
+ logger.debug(
+ "Is complete? out_queue_size=%s, completed_uploads=%s, task_count=%s",
+ queue_size,
+ already_completed_uploads,
+ self.task_count,
+ )
+
+ return (queue_size) >= self.task_count
+
+ else:
+ logger.debug("Is complete?: Files not initialized yet")
+ return False
+
+
+ class Uploader(object):
  sleep_time = 10
-
+
  CLIENT_NAME = "Uploader"

  def __init__(self, args=None):
@@ -528,24 +605,23 @@ class Uploader(object):
  self.cancel = False
  self.error_messages = []
  self.num_files_to_process = 0
-
+
  self.report_status_thread = None
  self.monitor_status_thread = None
-
+
  def emit_progress(self, upload_stats):
-
- if self.progress_callback:
+ if self.progress_callback:
  self.progress_callback(upload_stats)

  def prepare_workers(self):
  logger.debug("preparing workers...")
-
+
  if isinstance(threading.current_thread(), threading._MainThread):
  common.register_sigint_signal_handler()
+
  self.manager = None

- def create_manager(self, project):
-
+ def create_manager(self):
  job_description = [
  (
  MD5Worker,
@@ -556,40 +632,31 @@ class Uploader(object):
  "md5_caching": self.args["md5_caching"],
  },
  ),
- (
- MD5OutputWorker, [], {"thread_count": 1}
- ),
- (
- HttpBatchWorker,
- [],
- {"thread_count": self.args["thread_count"], "project": project},
- ),
- (
- FileStatWorker, [], {"thread_count": 1}
- ),
- (
- UploadWorker, [], {"thread_count": self.args["thread_count"]}
- ),
+ (MD5OutputWorker, [], {"thread_count": 1}),
+ (HttpBatchWorker, [], {"thread_count": 1}),
+ (UploadWorker, [], {"thread_count": self.args["thread_count"]}),
+ (MultiPartSiphonWorker, [], {"thread_count": 1})
  ]

  manager = worker.JobManager(job_description)
- manager.start()
  return manager

  @common.dec_catch_exception(raise_=True)
  def report_status(self):
  logger.debug("started report_status thread")
- update_interval = 5
+ update_interval = 15
  while True:
-
  # don't report status if we are doing a local_upload
  if not self.upload_id:
- logger.debug("not updating status as we were not provided an upload_id")
+ logger.debug(
+ "not updating status as we were not provided an upload_id")
  return

  if self.working:
- bytes_to_upload = self.manager.metric_store.get("bytes_to_upload")
- bytes_uploaded = self.manager.metric_store.get("bytes_uploaded")
+ bytes_to_upload = self.manager.metric_store.get(
+ "bytes_to_upload")
+ bytes_uploaded = self.manager.metric_store.get(
+ "bytes_uploaded")
  try:
  status_dict = {
  "upload_id": self.upload_id,
@@ -608,7 +675,7 @@ class Uploader(object):
  logger.error("could not report status:")
  logger.error(traceback.print_exc())
  logger.error(traceback.format_exc())
-
+
  else:
  break

@@ -616,14 +683,16 @@ class Uploader(object):

  def create_report_status_thread(self):
  logger.debug("creating reporter thread")
- self.report_status_thread = threading.Thread(name="ReporterThread", target=self.report_status)
+ self.report_status_thread = threading.Thread(
+ name="ReporterThread", target=self.report_status
+ )
  self.report_status_thread.daemon = True
  self.report_status_thread.start()

  @common.dec_catch_exception(raise_=True)
  def monitor_status(self, progress_handler):
  logger.debug("starting monitor_status thread")
- update_interval = 0.5
+ update_interval = 5

  def sleep():
  time.sleep(update_interval)
@@ -631,19 +700,27 @@ class Uploader(object):
  while True:
  if self.working:
  try:
- upload_stats = UploadStats.create (self.manager.metric_store, self.num_files_to_process, self.job_start_time)
+ upload_stats = UploadStats.create(
+ self.manager.metric_store,
+ self.num_files_to_process,
+ self.job_start_time,
+ )
  progress_handler(upload_stats)
  except Exception as e:
  print(e)
  print(traceback.format_exc())
-
+
  else:
  break
  sleep()

  def create_monitor_status_thread(self):
  logger.debug("creating console status thread")
- self.monitor_status_thread = threading.Thread(name="PrintStatusThread", target=self.monitor_status, args=(self.emit_progress,))
+ self.monitor_status_thread = threading.Thread(
+ name="PrintStatusThread",
+ target=self.monitor_status,
+ args=(self.emit_progress,),
+ )

  # make sure threads don't stop the program from exiting
  self.monitor_status_thread.daemon = True
@@ -652,24 +729,69 @@ class Uploader(object):
  self.monitor_status_thread.start()

  def mark_upload_finished(self, upload_id, upload_files):
-
- data = {"upload_id": upload_id, "status": "server_pending", "upload_files": upload_files}
+ data = {
+ "upload_id": upload_id,
+ "status": "server_pending",
+ "upload_files": upload_files,
+ }

  self.api_client.make_request(
- "/uploads/%s/finish" % upload_id, data=json.dumps(data), verb="POST", use_api_key=True
+ "/uploads/%s/finish" % upload_id,
+ data=json.dumps(data),
+ verb="POST",
+ use_api_key=True,
  )
  return True

  def mark_upload_failed(self, error_message, upload_id):
- logger.error("failing upload due to: \n%s" % error_message)
+ logger.error("Upload failed: %s", error_message)

  # report error_message to the app
  self.api_client.make_request(
- "/uploads/%s/fail" % upload_id, data=error_message, verb="POST", use_api_key=True
+ "/uploads/%s/fail" % upload_id,
+ data=error_message,
+ verb="POST",
+ use_api_key=True,
  )

  return True

+ def assets_only(self, *paths):
+ processed_filepaths = file_utils.process_upload_filepaths(paths)
+ file_map = {path: None for path in processed_filepaths}
+ self.handle_upload_response(project=None, upload_files=file_map)
+
+ if common.SIGINT_EXIT or self.cancel:
+ print("\nUpload cancelled\n")
+
+ elif self.error_messages:
+ print("\nUpload of {} file completed with errors\n".format(len(file_map)))
+
+ else:
+ print("\nUpload of {} file completed\n".format(len(file_map)))
+
+ error_messages = []
+
+ for exception in self.error_messages:
+ error_messages.append(str(exception[1]))
+ traceback_message = "".join(
+ traceback.format_exception(None, exception[1], exception[2]))
+ print(traceback_message)
+ logger.error(traceback_message)
+
+ if error_messages:
+
+ log_file = loggeria.LOG_PATH
+ sys.stderr.write("\nError uploading files:\n")
+
+ for err_msg in error_messages:
+ sys.stderr.write("\t{}\n".format(err_msg))
+
+ sys.stderr.write(
+ "\nSee log {} for more details\n\n".format(log_file))
+
+ self.error_messages = []
+
  def handle_upload_response(self, project, upload_files, upload_id=None):
  """
  This is a really confusing method and should probably be split into to clear logic
@@ -679,7 +801,6 @@ class Uploader(object):
  only be fed uploads by the app which have valid projects attached to them.
  """
  try:
-
  logger.info("%s", " NEXT UPLOAD ".center(30, "#"))
  logger.info("project: %s", project)
  logger.info("upload_id is %s", upload_id)
@@ -691,7 +812,7 @@ class Uploader(object):

  # reset counters
  self.num_files_to_process = len(upload_files)
- logger.debug( "Processing {} files".format(self.num_files_to_process))
+ logger.debug("Processing %s files", self.num_files_to_process)
  self.job_start_time = datetime.datetime.now()
  self.upload_id = upload_id
  self.job_failed = False
@@ -700,9 +821,17 @@ class Uploader(object):
  self.working = True

  self.prepare_workers()
-
+
+ # Adjust the number of threads
+ if self.num_files_to_process < self.args["thread_count"]:
+ self.args["thread_count"] = min(self.args["thread_count"], self.num_files_to_process)
+ logger.info(
+ "Adjusting thread count to %s", self.args["thread_count"]
+ )
+
  # create worker pools
- self.manager = self.create_manager(project)
+ self.manager = self.create_manager()
+ self.manager.start()

  # create reporters
  logger.debug("creating report status thread...")
@@ -711,65 +840,72 @@ class Uploader(object):
  # load tasks into worker pools
  for path in upload_files:
  md5 = upload_files[path]
- self.manager.add_task((path, md5))
+ self.manager.add_task((path, md5, project))

  logger.info("creating console status thread...")
- self.create_monitor_status_thread()
+ self.create_monitor_status_thread()

- #wait for work to finish
+ # wait for work to finish
  while not self.manager.is_complete():
- logger.info("Manager is running, cancel requested?: {}".format(self.cancel))
-
- if self.cancel or self.manager.error:
+ logger.debug(
+ "Manager is running, cancel requested?: %s", self.cancel)
+
+ if self.cancel or self.manager.error or common.SIGINT_EXIT:
  self.error_messages = self.manager.stop_work()
  logger.debug("Manager sucesfully stopped")
  break
-
- time.sleep(1)
-
+
+ time.sleep(5)
+
  # Shutdown the manager once all jobs are done
- if not self.cancel and not self.manager.error:
- logger.debug("Waiting for Manager to join")
+ if not (self.cancel or self.manager.error or common.SIGINT_EXIT):
  self.manager.join()

- upload_stats = UploadStats.create(self.manager.metric_store, self.num_files_to_process, self.job_start_time)
+ upload_stats = UploadStats.create(
+ self.manager.metric_store,
+ self.num_files_to_process,
+ self.job_start_time,
+ )
  logger.info(upload_stats.get_formatted_text())
- self.emit_progress(upload_stats)
-
- logger.debug("error_message: %s", self.error_messages)
+ self.emit_progress(upload_stats)
+
+ logger.debug("Error_message: %s", self.error_messages)

  # signal to the reporter to stop working
  self.working = False
- logger.info("done uploading files")
-
+
  logger.debug("Waiting for reporter status thread to join")
  self.report_status_thread.join()
-
+
  logger.debug("Waiting for print status thread to join")
- self.monitor_status_thread.join()
+ self.monitor_status_thread.join()

  # Despite storing lots of data about new uploads, we will only send back the things
  # that have changed, to keep payloads small.
  finished_upload_files = {}
- if self.upload_id:
+ if self.upload_id and not self.error_messages:
  md5s = self.return_md5s()
  for path in md5s:
- finished_upload_files[path] = {"source": path, "md5": md5s[path]}
+ finished_upload_files[path] = {
+ "source": path, "md5": md5s[path]}

- self.mark_upload_finished(self.upload_id, finished_upload_files)
+ self.mark_upload_finished(
+ self.upload_id, finished_upload_files)

  except:
  self.error_messages.append(sys.exc_info())

  def main(self, run_one_loop=False):
-
  def show_ouput(upload_stats):
- logger.info(upload_stats.get_formatted_text())
-
+ print(upload_stats.get_formatted_text())
+ logger.info("File Progress: %s", upload_stats.file_progress)
+
  self.progress_callback = show_ouput
-
+
  logger.info("Uploader Started. Checking for uploads...")

+ waiting_for_uploads_flag = False
+
  while not common.SIGINT_EXIT:
  try:
  # TODO: we should pass args as url params, not http data
@@ -777,16 +913,27 @@ class Uploader(object):
  data["location"] = self.location
  logger.debug("Data: %s", data)
  resp_str, resp_code = self.api_client.make_request(
- "/uploads/client/next", data=json.dumps(data), verb="PUT", use_api_key=True
+ "/uploads/client/next",
+ data=json.dumps(data),
+ verb="PUT",
+ use_api_key=True,
  )
  if resp_code == 204:
+ if not waiting_for_uploads_flag:
+ sys.stdout.write("\nWaiting for jobs to upload ")
+ sys.stdout.flush()
+
  logger.debug("no files to upload")
  sys.stdout.write(".")
  sys.stdout.flush()
  time.sleep(self.sleep_time)
+ waiting_for_uploads_flag = True
  continue
+
  elif resp_code != 201:
- logger.error("received invalid response code from app %s", resp_code)
+ logger.error(
+ "received invalid response code from app %s", resp_code
+ )
  logger.error("response is %s", resp_str)
  time.sleep(self.sleep_time)
  continue
@@ -796,6 +943,7 @@ class Uploader(object):
  try:
  json_data = json.loads(resp_str)
  upload = json_data.get("data", {})
+
  except ValueError:
  logger.error("response was not valid json: %s", resp_str)
  time.sleep(self.sleep_time)
@@ -806,24 +954,51 @@ class Uploader(object):
  project = upload["project"]

  self.handle_upload_response(project, upload_files, upload_id)
+
+ if self.error_messages:
+ logger.info("Upload of entity %s failed with errors.", upload_id)
+
+ else:
+ logger.info("Upload of entity %s completed.", upload_id)

- error_message = []
-
+ upload_stats = UploadStats.create(
+ self.manager.metric_store,
+ self.num_files_to_process,
+ self.job_start_time,
+ )
+ show_ouput(upload_stats)
+ logger.debug(self.manager.worker_queue_status_text())
+
+ error_messages = []
+
  for exception in self.error_messages:
-
- error_message.append("{}: {}".format(exception[1],
- traceback.format_tb(exception[2])))
-
- if error_message:
- self.mark_upload_failed("\n".join(error_message, upload_id))
-
+ error_messages.append(str(exception[1]))
+
+ if error_messages:
+ self.mark_upload_failed(
+ error_message="Uploader ERROR: {}".format(
+ "\n".join(error_messages)),
+ upload_id=upload_id
+ )
+
+ log_file = loggeria.LOG_PATH
+ sys.stderr.write("\nError uploading files:\n")
+
+ for err_msg in error_messages:
+ sys.stderr.write("\t{}\n".format(err_msg))
+
+ sys.stderr.write(
+ "\nSee log {} for more details\n\n".format(log_file))
+
  self.error_messages = []

+ waiting_for_uploads_flag = False
+
  except KeyboardInterrupt:
  logger.info("ctrl-c exit")
  break
- except:
- logger.exception("Caught exception:\n")
+ except Exception as err_msg:
+ logger.exception("Caught exception:\n%s", err_msg)
  time.sleep(self.sleep_time)
  continue

@@ -836,17 +1011,6 @@ class Uploader(object):
  """
  return self.manager.metric_store.get_dict("file_md5s")

- def set_logging(level=None, log_dirpath=None):
- log_filepath = None
- if log_dirpath:
- log_filepath = os.path.join(log_dirpath, "conductor_ul_log")
- loggeria.setup_conductor_logging(
- logger_level=level,
- console_formatter=LOG_FORMATTER,
- file_formatter=LOG_FORMATTER,
- log_filepath=log_filepath,
- )
-


  def run_uploader(args):
@@ -856,18 +1020,37 @@ def run_uploader(args):
  # convert the Namespace object to a dictionary
  args_dict = vars(args)
  cfg = config.config().config
-
- api_client.ApiClient.register_client(client_name = Uploader.CLIENT_NAME, client_version=ciocore.__version__)
+
+ api_client.ApiClient.register_client(
+ client_name=Uploader.CLIENT_NAME, client_version=ciocore.version
+ )

  # Set up logging
  log_level_name = args_dict.get("log_level") or cfg["log_level"]
- log_level = loggeria.LEVEL_MAP.get(log_level_name)
- log_dirpath = args_dict.get("log_dir")
- set_logging(log_level, log_dirpath)
+
+ loggeria.setup_conductor_logging(
+ logger_level=loggeria.LEVEL_MAP.get(log_level_name),
+ log_dirpath=args_dict.get("log_dir"),
+ log_filename="conductor_uploader.log",
+ disable_console_logging=not args_dict["log_to_console"],
+ use_system_log=False,
+ )
+
+ print("Logging to %s", loggeria.LOG_PATH)
+
  logger.debug("Uploader parsed_args is %s", args_dict)
+
  resolved_args = resolve_args(args_dict)
  uploader = Uploader(resolved_args)
- uploader.main()
+
+ if args.paths:
+ processed_filepaths = file_utils.process_upload_filepaths(
+ args.paths[0])
+ file_map = {path: None for path in processed_filepaths}
+ uploader.handle_upload_response(project=None, upload_files=file_map)
+
+ else:
+ uploader.main()


  def get_file_info(filepath):
@@ -891,7 +1074,7 @@ def resolve_args(args):
  Resolve all arguments, reconciling differences between command line args and config.yml args.
  See resolve_arg function.
  """
-
+
  args["md5_caching"] = resolve_arg("md5_caching", args)
  args["database_filepath"] = resolve_arg("database_filepath", args)
  args["location"] = resolve_arg("location", args)
@@ -899,18 +1082,18 @@ def resolve_args(args):
  return args


+
  def resolve_arg(key, args):
  """
  If the key doesn't exist (or is None), grab it from the config.
  """
-
+
  cfg = config.config().config
- config_value = cfg.get(key)
-
+ config_value = cfg.get(key)
+
  value = args.get(key, config_value)
-
+
  if value is None:
  value = config_value
-
+
  return value
-
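
For reference, the part slicing that the new per-part jobs and UploadWorker._do_multipart_upload rely on is plain byte arithmetic: parts are 1-indexed, each part seeks to (part_index - 1) * part_size and reads at most part_size bytes, and the collected ETags are posted to /api/v2/files/multipart/complete once every part has landed. A self-contained sketch of that slicing (illustrative, not package code):

    # Reproduces the seek/read arithmetic used by _do_multipart_upload above.
    # Parts are 1-indexed; the final part may be shorter than part_size.
    import math

    def part_ranges(file_size, part_size):
        """Yield (part_index, start_offset, length) for each part of a multipart upload."""
        total_parts = max(1, math.ceil(file_size / part_size))
        for part_index in range(1, total_parts + 1):
            start = (part_index - 1) * part_size        # same offset passed to fh.seek()
            length = min(part_size, file_size - start)  # fh.read(part_size) truncates at EOF
            yield part_index, start, length

    # Example: a 2.5 GiB file with the 1 GiB partSize shown in the old docstring above
    # -> (1, 0, 1073741824), (2, 1073741824, 1073741824), (3, 2147483648, 536870912)
    for part in part_ranges(file_size=2684354560, part_size=1073741824):
        print(part)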