toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/jobStores/aws/utils.py
CHANGED
|
@@ -17,26 +17,25 @@ import logging
|
|
|
17
17
|
import os
|
|
18
18
|
import types
|
|
19
19
|
from ssl import SSLError
|
|
20
|
-
from typing import Optional, cast, TYPE_CHECKING
|
|
20
|
+
from typing import Optional, cast, TYPE_CHECKING, Dict, List, Tuple
|
|
21
21
|
|
|
22
22
|
from boto3.s3.transfer import TransferConfig
|
|
23
|
-
from boto.exception import SDBResponseError
|
|
24
23
|
from botocore.client import Config
|
|
25
24
|
from botocore.exceptions import ClientError
|
|
25
|
+
from mypy_boto3_sdb.type_defs import ItemTypeDef, AttributeTypeDef
|
|
26
26
|
|
|
27
|
-
from toil.lib.aws import session
|
|
28
|
-
from toil.lib.aws.utils import
|
|
27
|
+
from toil.lib.aws import session, AWSServerErrors
|
|
28
|
+
from toil.lib.aws.utils import connection_error, get_bucket_region
|
|
29
29
|
from toil.lib.compatibility import compat_bytes
|
|
30
30
|
from toil.lib.retry import (DEFAULT_DELAYS,
|
|
31
31
|
DEFAULT_TIMEOUT,
|
|
32
|
-
ErrorCondition,
|
|
33
32
|
get_error_code,
|
|
34
33
|
get_error_message,
|
|
35
34
|
get_error_status,
|
|
36
35
|
old_retry,
|
|
37
36
|
retry)
|
|
38
37
|
if TYPE_CHECKING:
|
|
39
|
-
from mypy_boto3_s3 import
|
|
38
|
+
from mypy_boto3_s3 import S3ServiceResource
|
|
40
39
|
|
|
41
40
|
logger = logging.getLogger(__name__)
|
|
42
41
|
|
|
@@ -126,11 +125,11 @@ class SDBHelper:
|
|
|
126
125
|
return cls._maxChunks() * cls.maxValueSize
|
|
127
126
|
|
|
128
127
|
@classmethod
|
|
129
|
-
def binaryToAttributes(cls, binary):
|
|
128
|
+
def binaryToAttributes(cls, binary) -> Dict[str, str]:
|
|
130
129
|
"""
|
|
131
130
|
Turn a bytestring, or None, into SimpleDB attributes.
|
|
132
131
|
"""
|
|
133
|
-
if binary is None: return {'numChunks': 0}
|
|
132
|
+
if binary is None: return {'numChunks': '0'}
|
|
134
133
|
assert isinstance(binary, bytes)
|
|
135
134
|
assert len(binary) <= cls.maxBinarySize()
|
|
136
135
|
# The use of compression is just an optimization. We can't include it in the maxValueSize
|
|
@@ -144,10 +143,41 @@ class SDBHelper:
|
|
|
144
143
|
assert len(encoded) <= cls._maxEncodedSize()
|
|
145
144
|
n = cls.maxValueSize
|
|
146
145
|
chunks = (encoded[i:i + n] for i in range(0, len(encoded), n))
|
|
147
|
-
attributes = {cls._chunkName(i): chunk for i, chunk in enumerate(chunks)}
|
|
148
|
-
attributes.update({'numChunks': len(attributes)})
|
|
146
|
+
attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)}
|
|
147
|
+
attributes.update({'numChunks': str(len(attributes))})
|
|
149
148
|
return attributes
|
|
150
149
|
|
|
150
|
+
@classmethod
|
|
151
|
+
def attributeDictToList(cls, attributes: Dict[str, str]) -> List[AttributeTypeDef]:
|
|
152
|
+
"""
|
|
153
|
+
Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts
|
|
154
|
+
to be compatible with boto3 argument syntax
|
|
155
|
+
:param attributes: Dict[str, str], attribute in object form
|
|
156
|
+
:return: List[AttributeTypeDef], list of attributes in typed dict form
|
|
157
|
+
"""
|
|
158
|
+
return [{"Name": name, "Value": value} for name, value in attributes.items()]
|
|
159
|
+
|
|
160
|
+
@classmethod
|
|
161
|
+
def attributeListToDict(cls, attributes: List[AttributeTypeDef]) -> Dict[str, str]:
|
|
162
|
+
"""
|
|
163
|
+
Convert the attribute boto3 representation of list of attribute typed dicts
|
|
164
|
+
back to a dictionary with name, value pairs
|
|
165
|
+
:param attribute: List[AttributeTypeDef, attribute in typed dict form
|
|
166
|
+
:return: Dict[str, str], attribute in dict form
|
|
167
|
+
"""
|
|
168
|
+
return {attribute["Name"]: attribute["Value"] for attribute in attributes}
|
|
169
|
+
|
|
170
|
+
@classmethod
|
|
171
|
+
def get_attributes_from_item(cls, item: ItemTypeDef, keys: List[str]) -> List[Optional[str]]:
|
|
172
|
+
return_values: List[Optional[str]] = [None for _ in keys]
|
|
173
|
+
mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)}
|
|
174
|
+
for attribute in item["Attributes"]:
|
|
175
|
+
name = attribute["Name"]
|
|
176
|
+
value = attribute["Value"]
|
|
177
|
+
if name in mapped_indices:
|
|
178
|
+
return_values[mapped_indices[name]] = value
|
|
179
|
+
return return_values
|
|
180
|
+
|
|
151
181
|
@classmethod
|
|
152
182
|
def _chunkName(cls, i):
|
|
153
183
|
return str(i).zfill(3)
|
|
@@ -166,14 +196,21 @@ class SDBHelper:
|
|
|
166
196
|
return 'numChunks'
|
|
167
197
|
|
|
168
198
|
@classmethod
|
|
169
|
-
def attributesToBinary(cls, attributes):
|
|
199
|
+
def attributesToBinary(cls, attributes: List[AttributeTypeDef]) -> Tuple[bytes, int]:
|
|
170
200
|
"""
|
|
171
201
|
:rtype: (str|None,int)
|
|
172
202
|
:return: the binary data and the number of chunks it was composed from
|
|
173
203
|
"""
|
|
174
|
-
chunks = [
|
|
204
|
+
chunks = []
|
|
205
|
+
numChunks: int = 0
|
|
206
|
+
for attribute in attributes:
|
|
207
|
+
name = attribute["Name"]
|
|
208
|
+
value = attribute["Value"]
|
|
209
|
+
if cls._isValidChunkName(name):
|
|
210
|
+
chunks.append((int(name), value))
|
|
211
|
+
if name == "numChunks":
|
|
212
|
+
numChunks = int(value)
|
|
175
213
|
chunks.sort()
|
|
176
|
-
numChunks = int(attributes['numChunks'])
|
|
177
214
|
if numChunks:
|
|
178
215
|
serializedJob = b''.join(v.encode() for k, v in chunks)
|
|
179
216
|
compressed = base64.b64decode(serializedJob)
|
|
@@ -193,10 +230,7 @@ def fileSizeAndTime(localFilePath):
|
|
|
193
230
|
return file_stat.st_size, file_stat.st_mtime
|
|
194
231
|
|
|
195
232
|
|
|
196
|
-
@retry(errors=[
|
|
197
|
-
error=ClientError,
|
|
198
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
199
|
-
)])
|
|
233
|
+
@retry(errors=[AWSServerErrors])
|
|
200
234
|
def uploadFromPath(localFilePath: str,
|
|
201
235
|
resource,
|
|
202
236
|
bucketName: str,
|
|
@@ -232,10 +266,7 @@ def uploadFromPath(localFilePath: str,
|
|
|
232
266
|
return version
|
|
233
267
|
|
|
234
268
|
|
|
235
|
-
@retry(errors=[
|
|
236
|
-
error=ClientError,
|
|
237
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
238
|
-
)])
|
|
269
|
+
@retry(errors=[AWSServerErrors])
|
|
239
270
|
def uploadFile(readable,
|
|
240
271
|
resource,
|
|
241
272
|
bucketName: str,
|
|
@@ -287,10 +318,7 @@ class ServerSideCopyProhibitedError(RuntimeError):
|
|
|
287
318
|
insists that you pay to download and upload the data yourself instead.
|
|
288
319
|
"""
|
|
289
320
|
|
|
290
|
-
@retry(errors=[
|
|
291
|
-
error=ClientError,
|
|
292
|
-
error_codes=[404, 500, 502, 503, 504]
|
|
293
|
-
)])
|
|
321
|
+
@retry(errors=[AWSServerErrors])
|
|
294
322
|
def copyKeyMultipart(resource: "S3ServiceResource",
|
|
295
323
|
srcBucketName: str,
|
|
296
324
|
srcKeyName: str,
|
|
@@ -439,9 +467,9 @@ def sdb_unavailable(e):
|
|
|
439
467
|
|
|
440
468
|
|
|
441
469
|
def no_such_sdb_domain(e):
|
|
442
|
-
return (isinstance(e,
|
|
443
|
-
and e
|
|
444
|
-
and e.
|
|
470
|
+
return (isinstance(e, ClientError)
|
|
471
|
+
and get_error_code(e)
|
|
472
|
+
and get_error_code(e).endswith('NoSuchDomain'))
|
|
445
473
|
|
|
446
474
|
|
|
447
475
|
def retryable_ssl_error(e):
|
|
@@ -452,7 +480,7 @@ def retryable_ssl_error(e):
|
|
|
452
480
|
def retryable_sdb_errors(e):
|
|
453
481
|
return (sdb_unavailable(e)
|
|
454
482
|
or no_such_sdb_domain(e)
|
|
455
|
-
or
|
|
483
|
+
or connection_error(e)
|
|
456
484
|
or retryable_ssl_error(e))
|
|
457
485
|
|
|
458
486
|
|
toil/jobStores/conftest.py
CHANGED
toil/jobStores/fileJobStore.py
CHANGED
|
@@ -113,7 +113,7 @@ class FileJobStore(AbstractJobStore):
|
|
|
113
113
|
os.mkdir(self.jobStoreDir)
|
|
114
114
|
except OSError as e:
|
|
115
115
|
if e.errno == errno.EEXIST:
|
|
116
|
-
raise JobStoreExistsException(self.jobStoreDir)
|
|
116
|
+
raise JobStoreExistsException(self.jobStoreDir, "file")
|
|
117
117
|
else:
|
|
118
118
|
raise
|
|
119
119
|
os.makedirs(self.jobsDir, exist_ok=True)
|
|
@@ -127,7 +127,7 @@ class FileJobStore(AbstractJobStore):
|
|
|
127
127
|
|
|
128
128
|
def resume(self):
|
|
129
129
|
if not os.path.isdir(self.jobStoreDir):
|
|
130
|
-
raise NoSuchJobStoreException(self.jobStoreDir)
|
|
130
|
+
raise NoSuchJobStoreException(self.jobStoreDir, "file")
|
|
131
131
|
super().resume()
|
|
132
132
|
|
|
133
133
|
def destroy(self):
|
toil/jobStores/googleJobStore.py
CHANGED
|
@@ -164,7 +164,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
164
164
|
try:
|
|
165
165
|
self.bucket = self.storageClient.create_bucket(self.bucketName)
|
|
166
166
|
except exceptions.Conflict:
|
|
167
|
-
raise JobStoreExistsException(self.locator)
|
|
167
|
+
raise JobStoreExistsException(self.locator, "google")
|
|
168
168
|
super().initialize(config)
|
|
169
169
|
|
|
170
170
|
# set up sever side encryption after we set up config in super
|
|
@@ -178,7 +178,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
178
178
|
try:
|
|
179
179
|
self.bucket = self.storageClient.get_bucket(self.bucketName)
|
|
180
180
|
except exceptions.NotFound:
|
|
181
|
-
raise NoSuchJobStoreException(self.locator)
|
|
181
|
+
raise NoSuchJobStoreException(self.locator, "google")
|
|
182
182
|
super().resume()
|
|
183
183
|
|
|
184
184
|
@google_retry
|
|
@@ -209,8 +209,7 @@ class GoogleJobStore(AbstractJobStore):
|
|
|
209
209
|
|
|
210
210
|
def assign_job_id(self, job_description):
|
|
211
211
|
jobStoreID = self._new_job_id()
|
|
212
|
-
log.debug("Assigning ID to job %s
|
|
213
|
-
jobStoreID, '<no command>' if job_description.command is None else job_description.command)
|
|
212
|
+
log.debug("Assigning ID to job %s", jobStoreID)
|
|
214
213
|
job_description.jobStoreID = jobStoreID
|
|
215
214
|
|
|
216
215
|
@contextmanager
|
toil/leader.py
CHANGED
|
@@ -119,7 +119,12 @@ class Leader:
|
|
|
119
119
|
if self.config.write_messages is None:
|
|
120
120
|
# The user hasn't specified a place for the message bus so we
|
|
121
121
|
# should make one.
|
|
122
|
-
|
|
122
|
+
# pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
|
|
123
|
+
# from cwltool and we change the coordination_dir when detected. we don't want
|
|
124
|
+
# to make another config attribute so put the message bus in the already prefixed dir
|
|
125
|
+
# if a coordination_dir is provided normally, we can still put the bus in there
|
|
126
|
+
# as the coordination dir should serve a similar purpose to the tmp directory
|
|
127
|
+
self.config.write_messages = gen_message_bus_path(config.coordination_dir)
|
|
123
128
|
|
|
124
129
|
# Message bus messages need to go to the given file.
|
|
125
130
|
# Keep a reference to the return value so the listener stays alive.
|
|
@@ -289,7 +294,11 @@ class Leader:
|
|
|
289
294
|
for job_id in self.toilState.totalFailedJobs:
|
|
290
295
|
# Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
|
|
291
296
|
self.toilState.reset_job(job_id)
|
|
292
|
-
|
|
297
|
+
try:
|
|
298
|
+
failed_jobs.append(self.toilState.get_job(job_id))
|
|
299
|
+
except NoSuchJobException:
|
|
300
|
+
# Job actually finished and was removed
|
|
301
|
+
pass
|
|
293
302
|
|
|
294
303
|
logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
|
|
295
304
|
raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
|
|
@@ -522,10 +531,10 @@ class Leader:
|
|
|
522
531
|
"manager: %s", readyJob.jobStoreID)
|
|
523
532
|
elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
|
|
524
533
|
self._processFailedSuccessors(job_id)
|
|
525
|
-
elif readyJob.
|
|
526
|
-
# The job has a
|
|
534
|
+
elif readyJob.has_body() or result_status != 0:
|
|
535
|
+
# The job has a body it must be run before any successors.
|
|
527
536
|
# Similarly, if the job previously failed we rerun it, even if it doesn't have a
|
|
528
|
-
#
|
|
537
|
+
# body to run, to eliminate any parts of the stack now completed.
|
|
529
538
|
isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client
|
|
530
539
|
|
|
531
540
|
# We want to run the job, and expend one of its "tries" (possibly
|
|
@@ -551,6 +560,7 @@ class Leader:
|
|
|
551
560
|
for serviceID in serviceJobList:
|
|
552
561
|
if serviceID in self.toilState.service_to_client:
|
|
553
562
|
raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
|
|
563
|
+
# TODO: Why do we refresh here?
|
|
554
564
|
self.toilState.reset_job(serviceID)
|
|
555
565
|
serviceHost = self.toilState.get_job(serviceID)
|
|
556
566
|
self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
|
|
@@ -896,11 +906,6 @@ class Leader:
|
|
|
896
906
|
workerCommand.append('--context')
|
|
897
907
|
workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
|
|
898
908
|
|
|
899
|
-
# We locally override the command. This shouldn't get persisted back to
|
|
900
|
-
# the job store, or we will detach the job body from the job
|
|
901
|
-
# description. TODO: Don't do it this way! It's weird!
|
|
902
|
-
jobNode.command = ' '.join(workerCommand)
|
|
903
|
-
|
|
904
909
|
omp_threads = os.environ.get('OMP_NUM_THREADS') \
|
|
905
910
|
or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
|
|
906
911
|
|
|
@@ -910,7 +915,7 @@ class Leader:
|
|
|
910
915
|
}
|
|
911
916
|
|
|
912
917
|
# jobBatchSystemID is an int for each job
|
|
913
|
-
jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
|
|
918
|
+
jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
|
|
914
919
|
# Record the job by the ID the batch system will use to talk about it with us
|
|
915
920
|
self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
|
|
916
921
|
# Record that this job is issued right now and shouldn't e.g. be issued again.
|
|
@@ -1048,7 +1053,7 @@ class Leader:
|
|
|
1048
1053
|
jobs = [job for job in jobs if job.preemptible == preemptible]
|
|
1049
1054
|
return jobs
|
|
1050
1055
|
|
|
1051
|
-
def killJobs(self, jobsToKill):
|
|
1056
|
+
def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
|
|
1052
1057
|
"""
|
|
1053
1058
|
Kills the given set of jobs and then sends them for processing.
|
|
1054
1059
|
|
|
@@ -1062,7 +1067,7 @@ class Leader:
|
|
|
1062
1067
|
self.batchSystem.killBatchJobs(jobsToKill)
|
|
1063
1068
|
for jobBatchSystemID in jobsToKill:
|
|
1064
1069
|
# Reissue immediately, noting that we killed the job
|
|
1065
|
-
willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=
|
|
1070
|
+
willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)
|
|
1066
1071
|
|
|
1067
1072
|
if willRerun:
|
|
1068
1073
|
# Compose a list of all the jobs that will run again
|
|
@@ -1092,7 +1097,7 @@ class Leader:
|
|
|
1092
1097
|
str(runningJobs[jobBatchSystemID]),
|
|
1093
1098
|
str(maxJobDuration))
|
|
1094
1099
|
jobsToKill.append(jobBatchSystemID)
|
|
1095
|
-
reissued = self.killJobs(jobsToKill)
|
|
1100
|
+
reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
|
|
1096
1101
|
if len(jobsToKill) > 0:
|
|
1097
1102
|
# Summarize our actions
|
|
1098
1103
|
logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
|
|
@@ -1130,7 +1135,7 @@ class Leader:
|
|
|
1130
1135
|
if timesMissing == killAfterNTimesMissing:
|
|
1131
1136
|
self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
|
|
1132
1137
|
jobsToKill.append(jobBatchSystemID)
|
|
1133
|
-
self.killJobs(jobsToKill)
|
|
1138
|
+
self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
|
|
1134
1139
|
return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
|
|
1135
1140
|
#if there are missing jobs
|
|
1136
1141
|
|
|
@@ -1168,7 +1173,7 @@ class Leader:
|
|
|
1168
1173
|
exit_reason: Optional[BatchJobExitReason] = None,
|
|
1169
1174
|
batch_system_id: Optional[int] = None) -> bool:
|
|
1170
1175
|
"""
|
|
1171
|
-
Process a finished JobDescription based upon its
|
|
1176
|
+
Process a finished JobDescription based upon its success or failure.
|
|
1172
1177
|
|
|
1173
1178
|
If wall-clock time is available, informs the cluster scaler about the
|
|
1174
1179
|
job finishing.
|
|
@@ -1191,19 +1196,62 @@ class Leader:
|
|
|
1191
1196
|
logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
|
|
1192
1197
|
try:
|
|
1193
1198
|
# Reload the job as modified by the worker
|
|
1194
|
-
|
|
1195
|
-
|
|
1199
|
+
if finished_job.has_body():
|
|
1200
|
+
# The worker was expected to do some work. We expect the
|
|
1201
|
+
# worker to have updated the job description.
|
|
1202
|
+
|
|
1203
|
+
# If the job succeeded, we wait around to see the update
|
|
1204
|
+
# and fail the job if we don't see it.
|
|
1205
|
+
if result_status == 0:
|
|
1206
|
+
timeout = self.config.job_store_timeout
|
|
1207
|
+
complaint = (
|
|
1208
|
+
f"has no new version available after {timeout} "
|
|
1209
|
+
"seconds. Either worker updates to "
|
|
1210
|
+
"the job store are delayed longer than your "
|
|
1211
|
+
"--jobStoreTimeout, or the worker trying to run the "
|
|
1212
|
+
"job was killed (or never started)."
|
|
1213
|
+
)
|
|
1214
|
+
else:
|
|
1215
|
+
timeout = 0
|
|
1216
|
+
complaint = (
|
|
1217
|
+
"has no new version available immediately. The "
|
|
1218
|
+
"batch system may have killed (or never started) "
|
|
1219
|
+
"the Toil worker."
|
|
1220
|
+
)
|
|
1221
|
+
change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
|
|
1222
|
+
replacement_job = self.toilState.get_job(job_store_id)
|
|
1223
|
+
|
|
1224
|
+
if not change_detected:
|
|
1225
|
+
logger.warning(
|
|
1226
|
+
'Job %s %s',
|
|
1227
|
+
replacement_job,
|
|
1228
|
+
complaint
|
|
1229
|
+
)
|
|
1230
|
+
if result_status == 0:
|
|
1231
|
+
# Make the job fail because we ran it and it finished
|
|
1232
|
+
# and we never heard back.
|
|
1233
|
+
logger.error(
|
|
1234
|
+
'Marking ostensibly successful job %s that did '
|
|
1235
|
+
'not report in to the job store before '
|
|
1236
|
+
'--jobStoreTimeout as having been partitioned '
|
|
1237
|
+
'from us.',
|
|
1238
|
+
replacement_job
|
|
1239
|
+
)
|
|
1240
|
+
result_status = EXIT_STATUS_UNAVAILABLE_VALUE
|
|
1241
|
+
exit_reason = BatchJobExitReason.PARTITION
|
|
1242
|
+
else:
|
|
1243
|
+
# If there was no body sent, the worker won't commit any
|
|
1244
|
+
# changes to the job description. So don't wait around for
|
|
1245
|
+
# any and don't complain if we don't see them.
|
|
1246
|
+
self.toilState.reset_job(job_store_id)
|
|
1247
|
+
replacement_job = self.toilState.get_job(job_store_id)
|
|
1248
|
+
|
|
1196
1249
|
except NoSuchJobException:
|
|
1197
1250
|
# We have a ghost job - the job has been deleted but a stale
|
|
1198
1251
|
# read from e.g. a non-POSIX-compliant filesystem gave us a
|
|
1199
1252
|
# false positive when we checked for its existence. Process the
|
|
1200
1253
|
# job from here as any other job removed from the job store.
|
|
1201
|
-
|
|
1202
|
-
# have a strongly-consistent communications channel. See
|
|
1203
|
-
# https://github.com/BD2KGenomics/toil/issues/1091
|
|
1204
|
-
logger.warning('Got a stale read for job %s; caught its '
|
|
1205
|
-
'completion in time, but other jobs may try to run twice! Fix '
|
|
1206
|
-
'the consistency of your job store storage!', finished_job)
|
|
1254
|
+
logger.debug("Job %s is actually complete upon closer inspection", finished_job)
|
|
1207
1255
|
self.processRemovedJob(finished_job, result_status)
|
|
1208
1256
|
return False
|
|
1209
1257
|
if replacement_job.logJobStoreFileID is not None:
|
toil/lib/aws/__init__.py
CHANGED
|
@@ -16,11 +16,25 @@ import logging
|
|
|
16
16
|
import os
|
|
17
17
|
import re
|
|
18
18
|
import socket
|
|
19
|
+
import toil.lib.retry
|
|
19
20
|
from http.client import HTTPException
|
|
20
|
-
from typing import Dict, MutableMapping, Optional
|
|
21
|
+
from typing import Dict, MutableMapping, Optional, Union, Literal
|
|
21
22
|
from urllib.error import URLError
|
|
22
23
|
from urllib.request import urlopen
|
|
23
24
|
|
|
25
|
+
from botocore.exceptions import ClientError
|
|
26
|
+
|
|
27
|
+
from mypy_boto3_s3.literals import BucketLocationConstraintType
|
|
28
|
+
|
|
29
|
+
AWSRegionName = Union[BucketLocationConstraintType, Literal["us-east-1"]]
|
|
30
|
+
|
|
31
|
+
# These are errors where we think something randomly
|
|
32
|
+
# went wrong on the AWS side and we ought to retry.
|
|
33
|
+
AWSServerErrors = toil.lib.retry.ErrorCondition(
|
|
34
|
+
error=ClientError,
|
|
35
|
+
error_codes=[404, 500, 502, 503, 504]
|
|
36
|
+
)
|
|
37
|
+
|
|
24
38
|
logger = logging.getLogger(__name__)
|
|
25
39
|
|
|
26
40
|
# This file isn't allowed to import anything that depends on Boto or Boto3,
|
|
@@ -67,11 +81,10 @@ def get_aws_zone_from_metadata() -> Optional[str]:
|
|
|
67
81
|
# metadata.
|
|
68
82
|
try:
|
|
69
83
|
# Use the EC2 metadata service
|
|
70
|
-
import
|
|
71
|
-
|
|
72
|
-
from boto.utils import get_instance_metadata
|
|
84
|
+
from ec2_metadata import ec2_metadata
|
|
85
|
+
|
|
73
86
|
logger.debug("Fetch AZ from EC2 metadata")
|
|
74
|
-
return
|
|
87
|
+
return ec2_metadata.availability_zone
|
|
75
88
|
except ImportError:
|
|
76
89
|
# This is expected to happen a lot
|
|
77
90
|
logger.debug("No boto to fetch ECS metadata")
|
|
@@ -82,12 +95,15 @@ def get_aws_zone_from_metadata() -> Optional[str]:
|
|
|
82
95
|
|
|
83
96
|
def get_aws_zone_from_boto() -> Optional[str]:
|
|
84
97
|
"""
|
|
85
|
-
Get the AWS zone from the
|
|
86
|
-
|
|
98
|
+
Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the
|
|
99
|
+
boto3 module is available.
|
|
87
100
|
"""
|
|
88
101
|
try:
|
|
89
|
-
import
|
|
90
|
-
|
|
102
|
+
import boto3
|
|
103
|
+
from session import client
|
|
104
|
+
boto3_session = boto3.session.Session()
|
|
105
|
+
# this should check AWS_DEFAULT_REGION and ~/.aws/config
|
|
106
|
+
zone = boto3_session.region_name
|
|
91
107
|
if zone is not None:
|
|
92
108
|
zone += 'a' # derive an availability zone in the region
|
|
93
109
|
return zone
|
|
@@ -128,7 +144,7 @@ def get_current_aws_zone() -> Optional[str]:
|
|
|
128
144
|
get_aws_zone_from_environment_region() or \
|
|
129
145
|
get_aws_zone_from_boto()
|
|
130
146
|
|
|
131
|
-
def zone_to_region(zone: str) ->
|
|
147
|
+
def zone_to_region(zone: str) -> AWSRegionName:
|
|
132
148
|
"""Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c)."""
|
|
133
149
|
# re.compile() caches the regex internally so we don't have to
|
|
134
150
|
availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')
|
toil/lib/aws/iam.py
CHANGED
|
@@ -257,8 +257,8 @@ def get_policy_permissions(region: str) -> AllowedActionCollection:
|
|
|
257
257
|
:param zone: AWS zone to connect to
|
|
258
258
|
"""
|
|
259
259
|
|
|
260
|
-
iam: IAMClient =
|
|
261
|
-
sts: STSClient =
|
|
260
|
+
iam: IAMClient = get_client('iam', region)
|
|
261
|
+
sts: STSClient = get_client('sts', region)
|
|
262
262
|
#TODO Condider effect: deny at some point
|
|
263
263
|
allowed_actions: AllowedActionCollection = defaultdict(lambda: {'Action': [], 'NotAction': []})
|
|
264
264
|
try:
|
toil/lib/aws/session.py
CHANGED
|
@@ -15,16 +15,21 @@ import collections
|
|
|
15
15
|
import logging
|
|
16
16
|
import os
|
|
17
17
|
import threading
|
|
18
|
-
from typing import Dict, Optional, Tuple, cast
|
|
18
|
+
from typing import Dict, Optional, Tuple, cast, Union, Literal, overload, TypeVar
|
|
19
19
|
|
|
20
20
|
import boto3
|
|
21
21
|
import boto3.resources.base
|
|
22
|
-
import boto.connection
|
|
23
22
|
import botocore
|
|
24
23
|
from boto3 import Session
|
|
25
24
|
from botocore.client import Config
|
|
26
25
|
from botocore.session import get_session
|
|
27
26
|
from botocore.utils import JSONFileCache
|
|
27
|
+
from mypy_boto3_autoscaling import AutoScalingClient
|
|
28
|
+
from mypy_boto3_ec2 import EC2Client, EC2ServiceResource
|
|
29
|
+
from mypy_boto3_iam import IAMClient, IAMServiceResource
|
|
30
|
+
from mypy_boto3_s3 import S3Client, S3ServiceResource
|
|
31
|
+
from mypy_boto3_sdb import SimpleDBClient
|
|
32
|
+
from mypy_boto3_sts import STSClient
|
|
28
33
|
|
|
29
34
|
logger = logging.getLogger(__name__)
|
|
30
35
|
|
|
@@ -120,6 +125,13 @@ class AWSConnectionManager:
|
|
|
120
125
|
storage.item = _new_boto3_session(region_name=region)
|
|
121
126
|
return cast(boto3.session.Session, storage.item)
|
|
122
127
|
|
|
128
|
+
@overload
|
|
129
|
+
def resource(self, region: Optional[str], service_name: Literal["s3"], endpoint_url: Optional[str] = None) -> S3ServiceResource: ...
|
|
130
|
+
@overload
|
|
131
|
+
def resource(self, region: Optional[str], service_name: Literal["iam"], endpoint_url: Optional[str] = None) -> IAMServiceResource: ...
|
|
132
|
+
@overload
|
|
133
|
+
def resource(self, region: Optional[str], service_name: Literal["ec2"], endpoint_url: Optional[str] = None) -> EC2ServiceResource: ...
|
|
134
|
+
|
|
123
135
|
def resource(self, region: Optional[str], service_name: str, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource:
|
|
124
136
|
"""
|
|
125
137
|
Get the Boto3 Resource to use with the given service (like 'ec2') in the given region.
|
|
@@ -146,7 +158,28 @@ class AWSConnectionManager:
|
|
|
146
158
|
|
|
147
159
|
return cast(boto3.resources.base.ServiceResource, storage.item)
|
|
148
160
|
|
|
149
|
-
|
|
161
|
+
@overload
|
|
162
|
+
def client(self, region: Optional[str], service_name: Literal["ec2"], endpoint_url: Optional[str] = None,
|
|
163
|
+
config: Optional[Config] = None) -> EC2Client: ...
|
|
164
|
+
@overload
|
|
165
|
+
def client(self, region: Optional[str], service_name: Literal["iam"], endpoint_url: Optional[str] = None,
|
|
166
|
+
config: Optional[Config] = None) -> IAMClient: ...
|
|
167
|
+
@overload
|
|
168
|
+
def client(self, region: Optional[str], service_name: Literal["s3"], endpoint_url: Optional[str] = None,
|
|
169
|
+
config: Optional[Config] = None) -> S3Client: ...
|
|
170
|
+
@overload
|
|
171
|
+
def client(self, region: Optional[str], service_name: Literal["sts"], endpoint_url: Optional[str] = None,
|
|
172
|
+
config: Optional[Config] = None) -> STSClient: ...
|
|
173
|
+
@overload
|
|
174
|
+
def client(self, region: Optional[str], service_name: Literal["sdb"], endpoint_url: Optional[str] = None,
|
|
175
|
+
config: Optional[Config] = None) -> SimpleDBClient: ...
|
|
176
|
+
@overload
|
|
177
|
+
def client(self, region: Optional[str], service_name: Literal["autoscaling"], endpoint_url: Optional[str] = None,
|
|
178
|
+
config: Optional[Config] = None) -> AutoScalingClient: ...
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def client(self, region: Optional[str], service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], endpoint_url: Optional[str] = None,
|
|
182
|
+
config: Optional[Config] = None) -> botocore.client.BaseClient:
|
|
150
183
|
"""
|
|
151
184
|
Get the Boto3 Client to use with the given service (like 'ec2') in the given region.
|
|
152
185
|
|
|
@@ -159,9 +192,9 @@ class AWSConnectionManager:
|
|
|
159
192
|
# Don't try and memoize if a custom config is used
|
|
160
193
|
with _init_lock:
|
|
161
194
|
if endpoint_url is not None:
|
|
162
|
-
return self.session(region).client(service_name, endpoint_url=endpoint_url, config=config)
|
|
195
|
+
return self.session(region).client(service_name, endpoint_url=endpoint_url, config=config)
|
|
163
196
|
else:
|
|
164
|
-
return self.session(region).client(service_name, config=config)
|
|
197
|
+
return self.session(region).client(service_name, config=config)
|
|
165
198
|
|
|
166
199
|
key = (region, service_name, endpoint_url)
|
|
167
200
|
storage = self.client_cache[key]
|
|
@@ -172,25 +205,12 @@ class AWSConnectionManager:
|
|
|
172
205
|
if endpoint_url is not None:
|
|
173
206
|
# The Boto3 stubs are probably missing an overload here too. See:
|
|
174
207
|
# <https://github.com/vemel/mypy_boto3_builder/issues/121#issuecomment-1011322636>
|
|
175
|
-
storage.item = self.session(region).client(service_name, endpoint_url=endpoint_url)
|
|
208
|
+
storage.item = self.session(region).client(service_name, endpoint_url=endpoint_url)
|
|
176
209
|
else:
|
|
177
210
|
# We might not be able to pass None to Boto3 and have it be the same as no argument.
|
|
178
|
-
storage.item = self.session(region).client(service_name)
|
|
211
|
+
storage.item = self.session(region).client(service_name)
|
|
179
212
|
return cast(botocore.client.BaseClient , storage.item)
|
|
180
213
|
|
|
181
|
-
def boto2(self, region: Optional[str], service_name: str) -> boto.connection.AWSAuthConnection:
|
|
182
|
-
"""
|
|
183
|
-
Get the connected boto2 connection for the given region and service.
|
|
184
|
-
"""
|
|
185
|
-
if service_name == 'iam':
|
|
186
|
-
# IAM connections are regionless
|
|
187
|
-
region = 'universal'
|
|
188
|
-
key = (region, service_name)
|
|
189
|
-
storage = self.boto2_cache[key]
|
|
190
|
-
if not hasattr(storage, 'item'):
|
|
191
|
-
with _init_lock:
|
|
192
|
-
storage.item = getattr(boto, service_name).connect_to_region(region, profile_name=os.environ.get("TOIL_AWS_PROFILE", None))
|
|
193
|
-
return cast(boto.connection.AWSAuthConnection, storage.item)
|
|
194
214
|
|
|
195
215
|
# If you don't want your own AWSConnectionManager, we have a global one and some global functions
|
|
196
216
|
_global_manager = AWSConnectionManager()
|
|
@@ -205,7 +225,20 @@ def establish_boto3_session(region_name: Optional[str] = None) -> Session:
|
|
|
205
225
|
# Just use a global version of the manager. Note that we change the argument order!
|
|
206
226
|
return _global_manager.session(region_name)
|
|
207
227
|
|
|
208
|
-
|
|
228
|
+
@overload
|
|
229
|
+
def client(service_name: Literal["ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> EC2Client: ...
|
|
230
|
+
@overload
|
|
231
|
+
def client(service_name: Literal["iam"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> IAMClient: ...
|
|
232
|
+
@overload
|
|
233
|
+
def client(service_name: Literal["s3"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> S3Client: ...
|
|
234
|
+
@overload
|
|
235
|
+
def client(service_name: Literal["sts"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> STSClient: ...
|
|
236
|
+
@overload
|
|
237
|
+
def client(service_name: Literal["sdb"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> SimpleDBClient: ...
|
|
238
|
+
@overload
|
|
239
|
+
def client(service_name: Literal["autoscaling"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> AutoScalingClient: ...
|
|
240
|
+
|
|
241
|
+
def client(service_name: Literal["ec2", "iam", "s3", "sts", "sdb", "autoscaling"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None, config: Optional[Config] = None) -> botocore.client.BaseClient:
|
|
209
242
|
"""
|
|
210
243
|
Get a Boto 3 client for a particular AWS service, usable by the current thread.
|
|
211
244
|
|
|
@@ -215,7 +248,14 @@ def client(service_name: str, region_name: Optional[str] = None, endpoint_url: O
|
|
|
215
248
|
# Just use a global version of the manager. Note that we change the argument order!
|
|
216
249
|
return _global_manager.client(region_name, service_name, endpoint_url=endpoint_url, config=config)
|
|
217
250
|
|
|
218
|
-
|
|
251
|
+
@overload
|
|
252
|
+
def resource(service_name: Literal["s3"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> S3ServiceResource: ...
|
|
253
|
+
@overload
|
|
254
|
+
def resource(service_name: Literal["iam"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> IAMServiceResource: ...
|
|
255
|
+
@overload
|
|
256
|
+
def resource(service_name: Literal["ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> EC2ServiceResource: ...
|
|
257
|
+
|
|
258
|
+
def resource(service_name: Literal["s3", "iam", "ec2"], region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.resources.base.ServiceResource:
|
|
219
259
|
"""
|
|
220
260
|
Get a Boto 3 resource for a particular AWS service, usable by the current thread.
|
|
221
261
|
|