toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/jobStores/aws/utils.py
CHANGED
|
@@ -17,25 +17,25 @@ import logging
|
|
|
17
17
|
import os
|
|
18
18
|
import types
|
|
19
19
|
from ssl import SSLError
|
|
20
|
-
from typing import Optional, cast
|
|
20
|
+
from typing import Optional, cast, TYPE_CHECKING, Dict, List, Tuple
|
|
21
21
|
|
|
22
22
|
from boto3.s3.transfer import TransferConfig
|
|
23
|
-
from boto.exception import SDBResponseError
|
|
24
23
|
from botocore.client import Config
|
|
25
24
|
from botocore.exceptions import ClientError
|
|
26
|
-
from
|
|
25
|
+
from mypy_boto3_sdb.type_defs import ItemTypeDef, AttributeTypeDef
|
|
27
26
|
|
|
28
|
-
from toil.lib.aws import session
|
|
29
|
-
from toil.lib.aws.utils import
|
|
27
|
+
from toil.lib.aws import session, AWSServerErrors
|
|
28
|
+
from toil.lib.aws.utils import connection_error, get_bucket_region
|
|
30
29
|
from toil.lib.compatibility import compat_bytes
|
|
31
30
|
from toil.lib.retry import (DEFAULT_DELAYS,
|
|
32
31
|
DEFAULT_TIMEOUT,
|
|
33
|
-
ErrorCondition,
|
|
34
32
|
get_error_code,
|
|
35
33
|
get_error_message,
|
|
36
34
|
get_error_status,
|
|
37
35
|
old_retry,
|
|
38
36
|
retry)
|
|
37
|
+
if TYPE_CHECKING:
|
|
38
|
+
from mypy_boto3_s3 import S3ServiceResource
|
|
39
39
|
|
|
40
40
|
logger = logging.getLogger(__name__)
|
|
41
41
|
|
|
@@ -125,11 +125,11 @@ class SDBHelper:
|
|
|
125
125
|
return cls._maxChunks() * cls.maxValueSize
|
|
126
126
|
|
|
127
127
|
@classmethod
|
|
128
|
-
def binaryToAttributes(cls, binary):
|
|
128
|
+
def binaryToAttributes(cls, binary) -> Dict[str, str]:
|
|
129
129
|
"""
|
|
130
130
|
Turn a bytestring, or None, into SimpleDB attributes.
|
|
131
131
|
"""
|
|
132
|
-
if binary is None: return {'numChunks': 0}
|
|
132
|
+
if binary is None: return {'numChunks': '0'}
|
|
133
133
|
assert isinstance(binary, bytes)
|
|
134
134
|
assert len(binary) <= cls.maxBinarySize()
|
|
135
135
|
# The use of compression is just an optimization. We can't include it in the maxValueSize
|
|
@@ -143,10 +143,41 @@ class SDBHelper:
|
|
|
143
143
|
assert len(encoded) <= cls._maxEncodedSize()
|
|
144
144
|
n = cls.maxValueSize
|
|
145
145
|
chunks = (encoded[i:i + n] for i in range(0, len(encoded), n))
|
|
146
|
-
attributes = {cls._chunkName(i): chunk for i, chunk in enumerate(chunks)}
|
|
147
|
-
attributes.update({'numChunks': len(attributes)})
|
|
146
|
+
attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)}
|
|
147
|
+
attributes.update({'numChunks': str(len(attributes))})
|
|
148
148
|
return attributes
|
|
149
149
|
|
|
150
|
+
@classmethod
|
|
151
|
+
def attributeDictToList(cls, attributes: Dict[str, str]) -> List[AttributeTypeDef]:
|
|
152
|
+
"""
|
|
153
|
+
Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts
|
|
154
|
+
to be compatible with boto3 argument syntax
|
|
155
|
+
:param attributes: Dict[str, str], attribute in object form
|
|
156
|
+
:return: List[AttributeTypeDef], list of attributes in typed dict form
|
|
157
|
+
"""
|
|
158
|
+
return [{"Name": name, "Value": value} for name, value in attributes.items()]
|
|
159
|
+
|
|
160
|
+
@classmethod
|
|
161
|
+
def attributeListToDict(cls, attributes: List[AttributeTypeDef]) -> Dict[str, str]:
|
|
162
|
+
"""
|
|
163
|
+
Convert the boto3 representation of attributes (a list of attribute typed dicts)
|
|
164
|
+
back to a dictionary with name, value pairs
|
|
165
|
+
:param attributes: List[AttributeTypeDef], attributes in typed dict form
|
|
166
|
+
:return: Dict[str, str], attribute in dict form
|
|
167
|
+
"""
|
|
168
|
+
return {attribute["Name"]: attribute["Value"] for attribute in attributes}
|
|
169
|
+
|
|
170
|
+
@classmethod
|
|
171
|
+
def get_attributes_from_item(cls, item: ItemTypeDef, keys: List[str]) -> List[Optional[str]]:
|
|
172
|
+
return_values: List[Optional[str]] = [None for _ in keys]
|
|
173
|
+
mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)}
|
|
174
|
+
for attribute in item["Attributes"]:
|
|
175
|
+
name = attribute["Name"]
|
|
176
|
+
value = attribute["Value"]
|
|
177
|
+
if name in mapped_indices:
|
|
178
|
+
return_values[mapped_indices[name]] = value
|
|
179
|
+
return return_values
|
|
180
|
+
|
|
150
181
|
@classmethod
|
|
151
182
|
def _chunkName(cls, i):
|
|
152
183
|
return str(i).zfill(3)
|
|
@@ -165,14 +196,21 @@ class SDBHelper:
|
|
|
165
196
|
return 'numChunks'
|
|
166
197
|
|
|
167
198
|
@classmethod
|
|
168
|
-
def attributesToBinary(cls, attributes):
|
|
199
|
+
def attributesToBinary(cls, attributes: List[AttributeTypeDef]) -> Tuple[bytes, int]:
|
|
169
200
|
"""
|
|
170
201
|
:rtype: (str|None,int)
|
|
171
202
|
:return: the binary data and the number of chunks it was composed from
|
|
172
203
|
"""
|
|
173
|
-
chunks = [
|
|
204
|
+
chunks = []
|
|
205
|
+
numChunks: int = 0
|
|
206
|
+
for attribute in attributes:
|
|
207
|
+
name = attribute["Name"]
|
|
208
|
+
value = attribute["Value"]
|
|
209
|
+
if cls._isValidChunkName(name):
|
|
210
|
+
chunks.append((int(name), value))
|
|
211
|
+
if name == "numChunks":
|
|
212
|
+
numChunks = int(value)
|
|
174
213
|
chunks.sort()
|
|
175
|
-
numChunks = int(attributes['numChunks'])
|
|
176
214
|
if numChunks:
|
|
177
215
|
serializedJob = b''.join(v.encode() for k, v in chunks)
|
|
178
216
|
compressed = base64.b64decode(serializedJob)
|
|
@@ -192,10 +230,7 @@ def fileSizeAndTime(localFilePath):
|
|
|
192
230
|
return file_stat.st_size, file_stat.st_mtime
|
|
193
231
|
|
|
194
232
|
|
|
195
|
-
@retry(errors=[
|
|
196
|
-
error=ClientError,
|
|
197
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
198
|
-
)])
|
|
233
|
+
@retry(errors=[AWSServerErrors])
|
|
199
234
|
def uploadFromPath(localFilePath: str,
|
|
200
235
|
resource,
|
|
201
236
|
bucketName: str,
|
|
@@ -231,10 +266,7 @@ def uploadFromPath(localFilePath: str,
|
|
|
231
266
|
return version
|
|
232
267
|
|
|
233
268
|
|
|
234
|
-
@retry(errors=[
|
|
235
|
-
error=ClientError,
|
|
236
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
237
|
-
)])
|
|
269
|
+
@retry(errors=[AWSServerErrors])
|
|
238
270
|
def uploadFile(readable,
|
|
239
271
|
resource,
|
|
240
272
|
bucketName: str,
|
|
@@ -286,11 +318,8 @@ class ServerSideCopyProhibitedError(RuntimeError):
|
|
|
286
318
|
insists that you pay to download and upload the data yourself instead.
|
|
287
319
|
"""
|
|
288
320
|
|
|
289
|
-
@retry(errors=[
|
|
290
|
-
|
|
291
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
292
|
-
)])
|
|
293
|
-
def copyKeyMultipart(resource: S3ServiceResource,
|
|
321
|
+
@retry(errors=[AWSServerErrors])
|
|
322
|
+
def copyKeyMultipart(resource: "S3ServiceResource",
|
|
294
323
|
srcBucketName: str,
|
|
295
324
|
srcKeyName: str,
|
|
296
325
|
srcKeyVersion: str,
|
|
@@ -346,7 +375,7 @@ def copyKeyMultipart(resource: S3ServiceResource,
|
|
|
346
375
|
# not wherever the bucket virtual hostnames go.
|
|
347
376
|
source_region = get_bucket_region(srcBucketName)
|
|
348
377
|
source_client = cast(
|
|
349
|
-
S3Client,
|
|
378
|
+
"S3Client",
|
|
350
379
|
session.client(
|
|
351
380
|
's3',
|
|
352
381
|
region_name=source_region,
|
|
@@ -438,9 +467,9 @@ def sdb_unavailable(e):
|
|
|
438
467
|
|
|
439
468
|
|
|
440
469
|
def no_such_sdb_domain(e):
|
|
441
|
-
return (isinstance(e,
|
|
442
|
-
and e
|
|
443
|
-
and e.
|
|
470
|
+
return (isinstance(e, ClientError)
|
|
471
|
+
and get_error_code(e)
|
|
472
|
+
and get_error_code(e).endswith('NoSuchDomain'))
|
|
444
473
|
|
|
445
474
|
|
|
446
475
|
def retryable_ssl_error(e):
|
|
@@ -451,7 +480,7 @@ def retryable_ssl_error(e):
|
|
|
451
480
|
def retryable_sdb_errors(e):
|
|
452
481
|
return (sdb_unavailable(e)
|
|
453
482
|
or no_such_sdb_domain(e)
|
|
454
|
-
or
|
|
483
|
+
or connection_error(e)
|
|
455
484
|
or retryable_ssl_error(e))
|
|
456
485
|
|
|
457
486
|
|
toil/jobStores/conftest.py
CHANGED
toil/jobStores/fileJobStore.py
CHANGED
|
@@ -113,7 +113,7 @@ class FileJobStore(AbstractJobStore):
|
|
|
113
113
|
os.mkdir(self.jobStoreDir)
|
|
114
114
|
except OSError as e:
|
|
115
115
|
if e.errno == errno.EEXIST:
|
|
116
|
-
raise JobStoreExistsException(self.jobStoreDir)
|
|
116
|
+
raise JobStoreExistsException(self.jobStoreDir, "file")
|
|
117
117
|
else:
|
|
118
118
|
raise
|
|
119
119
|
os.makedirs(self.jobsDir, exist_ok=True)
|
|
@@ -127,7 +127,7 @@ class FileJobStore(AbstractJobStore):
|
|
|
127
127
|
|
|
128
128
|
def resume(self):
|
|
129
129
|
if not os.path.isdir(self.jobStoreDir):
|
|
130
|
-
raise NoSuchJobStoreException(self.jobStoreDir)
|
|
130
|
+
raise NoSuchJobStoreException(self.jobStoreDir, "file")
|
|
131
131
|
super().resume()
|
|
132
132
|
|
|
133
133
|
def destroy(self):
|
|
@@ -920,7 +920,7 @@ class FileJobStore(AbstractJobStore):
|
|
|
920
920
|
:raise NoSuchFileException: if the file with ID jobStoreFileID does
|
|
921
921
|
not exist or is not a file
|
|
922
922
|
"""
|
|
923
|
-
if not self.file_exists(
|
|
923
|
+
if not self.file_exists(jobStoreFileID):
|
|
924
924
|
raise NoSuchFileException(jobStoreFileID)
|
|
925
925
|
|
|
926
926
|
def _get_arbitrary_jobs_dir_for_name(self, jobNameSlug):
|
toil/jobStores/googleJobStore.py
CHANGED
|
@@ -164,7 +164,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
164
164
|
try:
|
|
165
165
|
self.bucket = self.storageClient.create_bucket(self.bucketName)
|
|
166
166
|
except exceptions.Conflict:
|
|
167
|
-
raise JobStoreExistsException(self.locator)
|
|
167
|
+
raise JobStoreExistsException(self.locator, "google")
|
|
168
168
|
super().initialize(config)
|
|
169
169
|
|
|
170
170
|
# set up server-side encryption after we set up config in super
|
|
@@ -178,7 +178,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
178
178
|
try:
|
|
179
179
|
self.bucket = self.storageClient.get_bucket(self.bucketName)
|
|
180
180
|
except exceptions.NotFound:
|
|
181
|
-
raise NoSuchJobStoreException(self.locator)
|
|
181
|
+
raise NoSuchJobStoreException(self.locator, "google")
|
|
182
182
|
super().resume()
|
|
183
183
|
|
|
184
184
|
@google_retry
|
|
@@ -209,8 +209,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
209
209
|
|
|
210
210
|
def assign_job_id(self, job_description):
|
|
211
211
|
jobStoreID = self._new_job_id()
|
|
212
|
-
log.debug("Assigning ID to job %s
|
|
213
|
-
jobStoreID, '<no command>' if job_description.command is None else job_description.command)
|
|
212
|
+
log.debug("Assigning ID to job %s", jobStoreID)
|
|
214
213
|
job_description.jobStoreID = jobStoreID
|
|
215
214
|
|
|
216
215
|
@contextmanager
|
toil/leader.py
CHANGED
|
@@ -28,14 +28,16 @@ import enlighten
|
|
|
28
28
|
from toil import resolveEntryPoint
|
|
29
29
|
from toil.batchSystems import DeadlockException
|
|
30
30
|
from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
|
|
31
|
-
BatchJobExitReason
|
|
31
|
+
BatchJobExitReason,
|
|
32
|
+
EXIT_STATUS_UNAVAILABLE_VALUE)
|
|
32
33
|
from toil.bus import (JobCompletedMessage,
|
|
33
34
|
JobFailedMessage,
|
|
34
35
|
JobIssuedMessage,
|
|
35
36
|
JobMissingMessage,
|
|
36
37
|
JobUpdatedMessage,
|
|
37
38
|
QueueSizeMessage,
|
|
38
|
-
gen_message_bus_path
|
|
39
|
+
gen_message_bus_path,
|
|
40
|
+
get_job_kind)
|
|
39
41
|
from toil.common import Config, ToilMetrics
|
|
40
42
|
from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
41
43
|
from toil.exceptions import FailedJobsException
|
|
@@ -117,7 +119,12 @@ class Leader:
|
|
|
117
119
|
if self.config.write_messages is None:
|
|
118
120
|
# The user hasn't specified a place for the message bus so we
|
|
119
121
|
# should make one.
|
|
120
|
-
|
|
122
|
+
# pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
|
|
123
|
+
# from cwltool and we change the coordination_dir when detected. we don't want
|
|
124
|
+
# to make another config attribute so put the message bus in the already prefixed dir
|
|
125
|
+
# if a coordination_dir is provided normally, we can still put the bus in there
|
|
126
|
+
# as the coordination dir should serve a similar purpose to the tmp directory
|
|
127
|
+
self.config.write_messages = gen_message_bus_path(config.coordination_dir)
|
|
121
128
|
|
|
122
129
|
# Message bus messages need to go to the given file.
|
|
123
130
|
# Keep a reference to the return value so the listener stays alive.
|
|
@@ -287,7 +294,11 @@ class Leader:
|
|
|
287
294
|
for job_id in self.toilState.totalFailedJobs:
|
|
288
295
|
# Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
|
|
289
296
|
self.toilState.reset_job(job_id)
|
|
290
|
-
|
|
297
|
+
try:
|
|
298
|
+
failed_jobs.append(self.toilState.get_job(job_id))
|
|
299
|
+
except NoSuchJobException:
|
|
300
|
+
# Job actually finished and was removed
|
|
301
|
+
pass
|
|
291
302
|
|
|
292
303
|
logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
|
|
293
304
|
raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
|
|
@@ -520,10 +531,10 @@ class Leader:
|
|
|
520
531
|
"manager: %s", readyJob.jobStoreID)
|
|
521
532
|
elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
|
|
522
533
|
self._processFailedSuccessors(job_id)
|
|
523
|
-
elif readyJob.
|
|
524
|
-
# The job has a
|
|
534
|
+
elif readyJob.has_body() or result_status != 0:
|
|
535
|
+
# The job has a body; it must be run before any successors.
|
|
525
536
|
# Similarly, if the job previously failed we rerun it, even if it doesn't have a
|
|
526
|
-
#
|
|
537
|
+
# body to run, to eliminate any parts of the stack now completed.
|
|
527
538
|
isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client
|
|
528
539
|
|
|
529
540
|
# We want to run the job, and expend one of its "tries" (possibly
|
|
@@ -549,6 +560,7 @@ class Leader:
|
|
|
549
560
|
for serviceID in serviceJobList:
|
|
550
561
|
if serviceID in self.toilState.service_to_client:
|
|
551
562
|
raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
|
|
563
|
+
# TODO: Why do we refresh here?
|
|
552
564
|
self.toilState.reset_job(serviceID)
|
|
553
565
|
serviceHost = self.toilState.get_job(serviceID)
|
|
554
566
|
self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
|
|
@@ -705,8 +717,9 @@ class Leader:
|
|
|
705
717
|
if exitStatus == 0:
|
|
706
718
|
logger.debug('Job ended: %s', updatedJob)
|
|
707
719
|
else:
|
|
708
|
-
|
|
709
|
-
|
|
720
|
+
status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
|
|
721
|
+
logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
|
|
722
|
+
f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
|
|
710
723
|
if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
|
|
711
724
|
# This is a CWL job informing us that the workflow is
|
|
712
725
|
# asking things of us that Toil can't do. When we raise an
|
|
@@ -715,7 +728,7 @@ class Leader:
|
|
|
715
728
|
logger.warning("This indicates an unsupported CWL requirement!")
|
|
716
729
|
self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
717
730
|
# Tell everyone it stopped running.
|
|
718
|
-
self._messages.publish(JobCompletedMessage(updatedJob.
|
|
731
|
+
self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
|
|
719
732
|
self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
|
|
720
733
|
|
|
721
734
|
def _processLostJobs(self):
|
|
@@ -893,11 +906,6 @@ class Leader:
|
|
|
893
906
|
workerCommand.append('--context')
|
|
894
907
|
workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
|
|
895
908
|
|
|
896
|
-
# We locally override the command. This shouldn't get persisted back to
|
|
897
|
-
# the job store, or we will detach the job body from the job
|
|
898
|
-
# description. TODO: Don't do it this way! It's weird!
|
|
899
|
-
jobNode.command = ' '.join(workerCommand)
|
|
900
|
-
|
|
901
909
|
omp_threads = os.environ.get('OMP_NUM_THREADS') \
|
|
902
910
|
or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
|
|
903
911
|
|
|
@@ -907,7 +915,7 @@ class Leader:
|
|
|
907
915
|
}
|
|
908
916
|
|
|
909
917
|
# jobBatchSystemID is an int for each job
|
|
910
|
-
jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
|
|
918
|
+
jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
|
|
911
919
|
# Record the job by the ID the batch system will use to talk about it with us
|
|
912
920
|
self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
|
|
913
921
|
# Record that this job is issued right now and shouldn't e.g. be issued again.
|
|
@@ -921,7 +929,7 @@ class Leader:
|
|
|
921
929
|
"%s and %s",
|
|
922
930
|
jobNode, str(jobBatchSystemID), jobNode.requirements_string())
|
|
923
931
|
# Tell everyone it is issued and the queue size changed
|
|
924
|
-
self._messages.publish(JobIssuedMessage(jobNode.
|
|
932
|
+
self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
|
|
925
933
|
self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
|
|
926
934
|
# Tell the user there's another job to do
|
|
927
935
|
self.progress_overall.total += 1
|
|
@@ -1045,7 +1053,7 @@ class Leader:
|
|
|
1045
1053
|
jobs = [job for job in jobs if job.preemptible == preemptible]
|
|
1046
1054
|
return jobs
|
|
1047
1055
|
|
|
1048
|
-
def killJobs(self, jobsToKill):
|
|
1056
|
+
def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
|
|
1049
1057
|
"""
|
|
1050
1058
|
Kills the given set of jobs and then sends them for processing.
|
|
1051
1059
|
|
|
@@ -1059,7 +1067,7 @@ class Leader:
|
|
|
1059
1067
|
self.batchSystem.killBatchJobs(jobsToKill)
|
|
1060
1068
|
for jobBatchSystemID in jobsToKill:
|
|
1061
1069
|
# Reissue immediately, noting that we killed the job
|
|
1062
|
-
willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=
|
|
1070
|
+
willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)
|
|
1063
1071
|
|
|
1064
1072
|
if willRerun:
|
|
1065
1073
|
# Compose a list of all the jobs that will run again
|
|
@@ -1089,7 +1097,7 @@ class Leader:
|
|
|
1089
1097
|
str(runningJobs[jobBatchSystemID]),
|
|
1090
1098
|
str(maxJobDuration))
|
|
1091
1099
|
jobsToKill.append(jobBatchSystemID)
|
|
1092
|
-
reissued = self.killJobs(jobsToKill)
|
|
1100
|
+
reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
|
|
1093
1101
|
if len(jobsToKill) > 0:
|
|
1094
1102
|
# Summarize our actions
|
|
1095
1103
|
logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
|
|
@@ -1127,7 +1135,7 @@ class Leader:
|
|
|
1127
1135
|
if timesMissing == killAfterNTimesMissing:
|
|
1128
1136
|
self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
|
|
1129
1137
|
jobsToKill.append(jobBatchSystemID)
|
|
1130
|
-
self.killJobs(jobsToKill)
|
|
1138
|
+
self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
|
|
1131
1139
|
return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
|
|
1132
1140
|
#if there are missing jobs
|
|
1133
1141
|
|
|
@@ -1157,7 +1165,7 @@ class Leader:
|
|
|
1157
1165
|
self.progress_overall.update(incr=-1)
|
|
1158
1166
|
self.progress_failed.update(incr=1)
|
|
1159
1167
|
|
|
1160
|
-
# Delegate to the
|
|
1168
|
+
# Delegate to the version that uses a JobDescription
|
|
1161
1169
|
return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
|
|
1162
1170
|
|
|
1163
1171
|
def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
|
|
@@ -1165,7 +1173,7 @@ class Leader:
|
|
|
1165
1173
|
exit_reason: Optional[BatchJobExitReason] = None,
|
|
1166
1174
|
batch_system_id: Optional[int] = None) -> bool:
|
|
1167
1175
|
"""
|
|
1168
|
-
Process a finished JobDescription based upon its
|
|
1176
|
+
Process a finished JobDescription based upon its success or failure.
|
|
1169
1177
|
|
|
1170
1178
|
If wall-clock time is available, informs the cluster scaler about the
|
|
1171
1179
|
job finishing.
|
|
@@ -1188,19 +1196,62 @@ class Leader:
|
|
|
1188
1196
|
logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
|
|
1189
1197
|
try:
|
|
1190
1198
|
# Reload the job as modified by the worker
|
|
1191
|
-
|
|
1192
|
-
|
|
1199
|
+
if finished_job.has_body():
|
|
1200
|
+
# The worker was expected to do some work. We expect the
|
|
1201
|
+
# worker to have updated the job description.
|
|
1202
|
+
|
|
1203
|
+
# If the job succeeded, we wait around to see the update
|
|
1204
|
+
# and fail the job if we don't see it.
|
|
1205
|
+
if result_status == 0:
|
|
1206
|
+
timeout = self.config.job_store_timeout
|
|
1207
|
+
complaint = (
|
|
1208
|
+
f"has no new version available after {timeout} "
|
|
1209
|
+
"seconds. Either worker updates to "
|
|
1210
|
+
"the job store are delayed longer than your "
|
|
1211
|
+
"--jobStoreTimeout, or the worker trying to run the "
|
|
1212
|
+
"job was killed (or never started)."
|
|
1213
|
+
)
|
|
1214
|
+
else:
|
|
1215
|
+
timeout = 0
|
|
1216
|
+
complaint = (
|
|
1217
|
+
"has no new version available immediately. The "
|
|
1218
|
+
"batch system may have killed (or never started) "
|
|
1219
|
+
"the Toil worker."
|
|
1220
|
+
)
|
|
1221
|
+
change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
|
|
1222
|
+
replacement_job = self.toilState.get_job(job_store_id)
|
|
1223
|
+
|
|
1224
|
+
if not change_detected:
|
|
1225
|
+
logger.warning(
|
|
1226
|
+
'Job %s %s',
|
|
1227
|
+
replacement_job,
|
|
1228
|
+
complaint
|
|
1229
|
+
)
|
|
1230
|
+
if result_status == 0:
|
|
1231
|
+
# Make the job fail because we ran it and it finished
|
|
1232
|
+
# and we never heard back.
|
|
1233
|
+
logger.error(
|
|
1234
|
+
'Marking ostensibly successful job %s that did '
|
|
1235
|
+
'not report in to the job store before '
|
|
1236
|
+
'--jobStoreTimeout as having been partitioned '
|
|
1237
|
+
'from us.',
|
|
1238
|
+
replacement_job
|
|
1239
|
+
)
|
|
1240
|
+
result_status = EXIT_STATUS_UNAVAILABLE_VALUE
|
|
1241
|
+
exit_reason = BatchJobExitReason.PARTITION
|
|
1242
|
+
else:
|
|
1243
|
+
# If there was no body sent, the worker won't commit any
|
|
1244
|
+
# changes to the job description. So don't wait around for
|
|
1245
|
+
# any and don't complain if we don't see them.
|
|
1246
|
+
self.toilState.reset_job(job_store_id)
|
|
1247
|
+
replacement_job = self.toilState.get_job(job_store_id)
|
|
1248
|
+
|
|
1193
1249
|
except NoSuchJobException:
|
|
1194
1250
|
# We have a ghost job - the job has been deleted but a stale
|
|
1195
1251
|
# read from e.g. a non-POSIX-compliant filesystem gave us a
|
|
1196
1252
|
# false positive when we checked for its existence. Process the
|
|
1197
1253
|
# job from here as any other job removed from the job store.
|
|
1198
|
-
|
|
1199
|
-
# have a strongly-consistent communications channel. See
|
|
1200
|
-
# https://github.com/BD2KGenomics/toil/issues/1091
|
|
1201
|
-
logger.warning('Got a stale read for job %s; caught its '
|
|
1202
|
-
'completion in time, but other jobs may try to run twice! Fix '
|
|
1203
|
-
'the consistency of your job store storage!', finished_job)
|
|
1254
|
+
logger.debug("Job %s is actually complete upon closer inspection", finished_job)
|
|
1204
1255
|
self.processRemovedJob(finished_job, result_status)
|
|
1205
1256
|
return False
|
|
1206
1257
|
if replacement_job.logJobStoreFileID is not None:
|
|
@@ -1208,11 +1259,12 @@ class Leader:
|
|
|
1208
1259
|
# more memory efficient than read().splitlines() while leaving off the
|
|
1209
1260
|
# trailing \n left when using readlines()
|
|
1210
1261
|
# http://stackoverflow.com/a/15233739
|
|
1211
|
-
StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
|
|
1262
|
+
StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
|
|
1212
1263
|
message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
|
|
1213
1264
|
if self.config.writeLogs or self.config.writeLogsGzip:
|
|
1214
1265
|
with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
|
|
1215
|
-
|
|
1266
|
+
# Send log data from the job store to each per-job log file involved.
|
|
1267
|
+
StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
|
|
1216
1268
|
if result_status != 0:
|
|
1217
1269
|
# If the batch system returned a non-zero exit code then the worker
|
|
1218
1270
|
# is assumed not to have captured the failure of the job, so we
|
|
@@ -1236,13 +1288,12 @@ class Leader:
|
|
|
1236
1288
|
else:
|
|
1237
1289
|
with log_stream:
|
|
1238
1290
|
if os.path.getsize(log_file) > 0:
|
|
1239
|
-
StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
|
|
1291
|
+
StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
|
|
1240
1292
|
message='The batch system left a non-empty file %s:' % log_file)
|
|
1241
1293
|
if self.config.writeLogs or self.config.writeLogsGzip:
|
|
1242
1294
|
file_root, _ = os.path.splitext(os.path.basename(log_file))
|
|
1243
|
-
job_names = replacement_job.
|
|
1244
|
-
|
|
1245
|
-
job_names = [str(replacement_job)]
|
|
1295
|
+
job_names = [names.stats_name for names in replacement_job.get_chain()]
|
|
1296
|
+
# Tack the batch system log file name onto each job's name
|
|
1246
1297
|
job_names = [j + '_' + file_root for j in job_names]
|
|
1247
1298
|
log_stream.seek(0)
|
|
1248
1299
|
StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
|
|
@@ -1309,7 +1360,7 @@ class Leader:
|
|
|
1309
1360
|
|
|
1310
1361
|
# Tell everyone it failed
|
|
1311
1362
|
|
|
1312
|
-
self._messages.publish(JobFailedMessage(job_desc.
|
|
1363
|
+
self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
|
|
1313
1364
|
|
|
1314
1365
|
if job_id in self.toilState.service_to_client:
|
|
1315
1366
|
# Is a service job
|
toil/lib/aws/__init__.py
CHANGED
|
@@ -16,11 +16,25 @@ import logging
|
|
|
16
16
|
import os
|
|
17
17
|
import re
|
|
18
18
|
import socket
|
|
19
|
+
import toil.lib.retry
|
|
19
20
|
from http.client import HTTPException
|
|
20
|
-
from typing import Dict, MutableMapping, Optional
|
|
21
|
+
from typing import Dict, MutableMapping, Optional, Union, Literal
|
|
21
22
|
from urllib.error import URLError
|
|
22
23
|
from urllib.request import urlopen
|
|
23
24
|
|
|
25
|
+
from botocore.exceptions import ClientError
|
|
26
|
+
|
|
27
|
+
from mypy_boto3_s3.literals import BucketLocationConstraintType
|
|
28
|
+
|
|
29
|
+
AWSRegionName = Union[BucketLocationConstraintType, Literal["us-east-1"]]
|
|
30
|
+
|
|
31
|
+
# These are errors where we think something randomly
|
|
32
|
+
# went wrong on the AWS side and we ought to retry.
|
|
33
|
+
AWSServerErrors = toil.lib.retry.ErrorCondition(
|
|
34
|
+
error=ClientError,
|
|
35
|
+
error_codes=[404, 500, 502, 503, 504]
|
|
36
|
+
)
|
|
37
|
+
|
|
24
38
|
logger = logging.getLogger(__name__)
|
|
25
39
|
|
|
26
40
|
# This file isn't allowed to import anything that depends on Boto or Boto3,
|
|
@@ -67,11 +81,10 @@ def get_aws_zone_from_metadata() -> Optional[str]:
|
|
|
67
81
|
# metadata.
|
|
68
82
|
try:
|
|
69
83
|
# Use the EC2 metadata service
|
|
70
|
-
import
|
|
71
|
-
|
|
72
|
-
from boto.utils import get_instance_metadata
|
|
84
|
+
from ec2_metadata import ec2_metadata
|
|
85
|
+
|
|
73
86
|
logger.debug("Fetch AZ from EC2 metadata")
|
|
74
|
-
return
|
|
87
|
+
return ec2_metadata.availability_zone
|
|
75
88
|
except ImportError:
|
|
76
89
|
# This is expected to happen a lot
|
|
77
90
|
logger.debug("No boto to fetch ECS metadata")
|
|
@@ -82,12 +95,15 @@ def get_aws_zone_from_metadata() -> Optional[str]:
|
|
|
82
95
|
|
|
83
96
|
def get_aws_zone_from_boto() -> Optional[str]:
|
|
84
97
|
"""
|
|
85
|
-
Get the AWS zone from the
|
|
86
|
-
|
|
98
|
+
Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the
|
|
99
|
+
boto3 module is available.
|
|
87
100
|
"""
|
|
88
101
|
try:
|
|
89
|
-
import
|
|
90
|
-
|
|
102
|
+
import boto3
|
|
103
|
+
from session import client
|
|
104
|
+
boto3_session = boto3.session.Session()
|
|
105
|
+
# this should check AWS_DEFAULT_REGION and ~/.aws/config
|
|
106
|
+
zone = boto3_session.region_name
|
|
91
107
|
if zone is not None:
|
|
92
108
|
zone += 'a' # derive an availability zone in the region
|
|
93
109
|
return zone
|
|
@@ -128,7 +144,7 @@ def get_current_aws_zone() -> Optional[str]:
|
|
|
128
144
|
get_aws_zone_from_environment_region() or \
|
|
129
145
|
get_aws_zone_from_boto()
|
|
130
146
|
|
|
131
|
-
def zone_to_region(zone: str) ->
|
|
147
|
+
def zone_to_region(zone: str) -> AWSRegionName:
|
|
132
148
|
"""Get a region (e.g. us-west-2) from a zone (e.g. us-west-2c)."""
|
|
133
149
|
# re.compile() caches the regex internally so we don't have to
|
|
134
150
|
availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')
|
toil/lib/aws/iam.py
CHANGED
|
@@ -257,8 +257,8 @@ def get_policy_permissions(region: str) -> AllowedActionCollection:
|
|
|
257
257
|
:param region: AWS region to connect to
|
|
258
258
|
"""
|
|
259
259
|
|
|
260
|
-
iam: IAMClient =
|
|
261
|
-
sts: STSClient =
|
|
260
|
+
iam: IAMClient = get_client('iam', region)
|
|
261
|
+
sts: STSClient = get_client('sts', region)
|
|
262
262
|
#TODO Consider effect: deny at some point
|
|
263
263
|
allowed_actions: AllowedActionCollection = defaultdict(lambda: {'Action': [], 'NotAction': []})
|
|
264
264
|
try:
|