toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/__init__.py
CHANGED
|
@@ -22,7 +22,6 @@ from datetime import datetime
|
|
|
22
22
|
from typing import TYPE_CHECKING, Optional, Tuple
|
|
23
23
|
|
|
24
24
|
import requests
|
|
25
|
-
from pytz import timezone
|
|
26
25
|
|
|
27
26
|
from docker.errors import ImageNotFound
|
|
28
27
|
from toil.lib.memoize import memoize
|
|
@@ -210,7 +209,7 @@ def customDockerInitCmd() -> str:
|
|
|
210
209
|
private docker registries). Any single quotes are escaped and the command cannot contain a
|
|
211
210
|
set of blacklisted chars (newline or tab).
|
|
212
211
|
|
|
213
|
-
:returns: The custom
|
|
212
|
+
:returns: The custom command, or an empty string is returned if the environment variable is not set.
|
|
214
213
|
"""
|
|
215
214
|
command = lookupEnvVar(name='user-defined custom docker init command',
|
|
216
215
|
envName='TOIL_CUSTOM_DOCKER_INIT_COMMAND',
|
|
@@ -440,7 +439,6 @@ def logProcessContext(config: "Config") -> None:
|
|
|
440
439
|
|
|
441
440
|
|
|
442
441
|
try:
|
|
443
|
-
from boto import provider
|
|
444
442
|
from botocore.credentials import (JSONFileCache,
|
|
445
443
|
RefreshableCredentials,
|
|
446
444
|
create_credential_resolver)
|
|
@@ -477,234 +475,5 @@ try:
|
|
|
477
475
|
"""
|
|
478
476
|
return datetime.strptime(s, datetime_format)
|
|
479
477
|
|
|
480
|
-
|
|
481
|
-
class BotoCredentialAdapter(provider.Provider):
|
|
482
|
-
"""
|
|
483
|
-
Boto 2 Adapter to use AWS credentials obtained via Boto 3's credential finding logic.
|
|
484
|
-
|
|
485
|
-
This allows for automatic role assumption
|
|
486
|
-
respecting the Boto 3 config files, even when parts of the app still use
|
|
487
|
-
Boto 2.
|
|
488
|
-
|
|
489
|
-
This class also handles caching credentials in multi-process environments
|
|
490
|
-
to avoid loads of processes swamping the EC2 metadata service.
|
|
491
|
-
"""
|
|
492
|
-
|
|
493
|
-
# TODO: We take kwargs because new boto2 versions have an 'anon'
|
|
494
|
-
# argument and we want to be future proof
|
|
495
|
-
|
|
496
|
-
def __init__(self, name, access_key=None, secret_key=None,
|
|
497
|
-
security_token=None, profile_name=None, **kwargs):
|
|
498
|
-
"""Create a new BotoCredentialAdapter."""
|
|
499
|
-
# TODO: We take kwargs because new boto2 versions have an 'anon'
|
|
500
|
-
# argument and we want to be future proof
|
|
501
|
-
|
|
502
|
-
if (name == 'aws' or name is None) and access_key is None and not kwargs.get('anon', False):
|
|
503
|
-
# We are on AWS and we don't have credentials passed along and we aren't anonymous.
|
|
504
|
-
# We will backend into a boto3 resolver for getting credentials.
|
|
505
|
-
# Make sure to enable boto3's own caching, so we can share that
|
|
506
|
-
# cache with pure boto3 code elsewhere in Toil.
|
|
507
|
-
# Keep synced with toil.lib.aws.session.establish_boto3_session
|
|
508
|
-
self._boto3_resolver = create_credential_resolver(Session(profile=profile_name), cache=JSONFileCache())
|
|
509
|
-
else:
|
|
510
|
-
# We will use the normal flow
|
|
511
|
-
self._boto3_resolver = None
|
|
512
|
-
|
|
513
|
-
# Pass along all the arguments
|
|
514
|
-
super().__init__(name, access_key=access_key,
|
|
515
|
-
secret_key=secret_key, security_token=security_token,
|
|
516
|
-
profile_name=profile_name, **kwargs)
|
|
517
|
-
|
|
518
|
-
def get_credentials(self, access_key=None, secret_key=None, security_token=None, profile_name=None):
|
|
519
|
-
"""
|
|
520
|
-
Make sure our credential fields are populated.
|
|
521
|
-
|
|
522
|
-
Called by the base class constructor.
|
|
523
|
-
"""
|
|
524
|
-
if self._boto3_resolver is not None:
|
|
525
|
-
# Go get the credentials from the cache, or from boto3 if not cached.
|
|
526
|
-
# We need to be eager here; having the default None
|
|
527
|
-
# _credential_expiry_time makes the accessors never try to refresh.
|
|
528
|
-
self._obtain_credentials_from_cache_or_boto3()
|
|
529
|
-
else:
|
|
530
|
-
# We're not on AWS, or they passed a key, or we're anonymous.
|
|
531
|
-
# Use the normal route; our credentials shouldn't expire.
|
|
532
|
-
super().get_credentials(access_key=access_key,
|
|
533
|
-
secret_key=secret_key, security_token=security_token,
|
|
534
|
-
profile_name=profile_name)
|
|
535
|
-
|
|
536
|
-
def _populate_keys_from_metadata_server(self):
|
|
537
|
-
"""
|
|
538
|
-
Hack to catch _credential_expiry_time being too soon and refresh the credentials.
|
|
539
|
-
|
|
540
|
-
This override is misnamed; it's actually the only hook we have to catch
|
|
541
|
-
_credential_expiry_time being too soon and refresh the credentials. We
|
|
542
|
-
actually just go back and poke the cache to see if it feels like
|
|
543
|
-
getting us new credentials.
|
|
544
|
-
|
|
545
|
-
Boto 2 hardcodes a refresh within 5 minutes of expiry:
|
|
546
|
-
https://github.com/boto/boto/blob/591911db1029f2fbb8ba1842bfcc514159b37b32/boto/provider.py#L247
|
|
547
|
-
|
|
548
|
-
Boto 3 wants to refresh 15 or 10 minutes before expiry:
|
|
549
|
-
https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L279
|
|
550
|
-
|
|
551
|
-
So if we ever want to refresh, Boto 3 wants to refresh too.
|
|
552
|
-
"""
|
|
553
|
-
# This should only happen if we have expiring credentials, which we should only get from boto3
|
|
554
|
-
if self._boto3_resolver is None:
|
|
555
|
-
raise RuntimeError("The Boto3 resolver should not be None.")
|
|
556
|
-
|
|
557
|
-
self._obtain_credentials_from_cache_or_boto3()
|
|
558
|
-
|
|
559
|
-
@retry()
|
|
560
|
-
def _obtain_credentials_from_boto3(self):
|
|
561
|
-
"""
|
|
562
|
-
Fill our credential fields from Boto 3.
|
|
563
|
-
|
|
564
|
-
We know the current cached credentials are not good, and that we
|
|
565
|
-
need to get them from Boto 3. Fill in our credential fields
|
|
566
|
-
(_access_key, _secret_key, _security_token,
|
|
567
|
-
_credential_expiry_time) from Boto 3.
|
|
568
|
-
"""
|
|
569
|
-
# We get a Credentials object
|
|
570
|
-
# <https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L227>
|
|
571
|
-
# or a RefreshableCredentials, or None on failure.
|
|
572
|
-
creds = self._boto3_resolver.load_credentials()
|
|
573
|
-
|
|
574
|
-
if creds is None:
|
|
575
|
-
try:
|
|
576
|
-
resolvers = str(self._boto3_resolver.providers)
|
|
577
|
-
except:
|
|
578
|
-
resolvers = "(Resolvers unavailable)"
|
|
579
|
-
raise RuntimeError("Could not obtain AWS credentials from Boto3. Resolvers tried: " + resolvers)
|
|
580
|
-
|
|
581
|
-
# Make sure the credentials actually has some credentials if it is lazy
|
|
582
|
-
creds.get_frozen_credentials()
|
|
583
|
-
|
|
584
|
-
# Get when the credentials will expire, if ever
|
|
585
|
-
if isinstance(creds, RefreshableCredentials):
|
|
586
|
-
# Credentials may expire.
|
|
587
|
-
# Get a naive UTC datetime like boto 2 uses from the boto 3 time.
|
|
588
|
-
self._credential_expiry_time = creds._expiry_time.astimezone(timezone('UTC')).replace(tzinfo=None)
|
|
589
|
-
else:
|
|
590
|
-
# Credentials never expire
|
|
591
|
-
self._credential_expiry_time = None
|
|
592
|
-
|
|
593
|
-
# Then, atomically get all the credentials bits. They may be newer than we think they are, but never older.
|
|
594
|
-
frozen = creds.get_frozen_credentials()
|
|
595
|
-
|
|
596
|
-
# Copy them into us
|
|
597
|
-
self._access_key = frozen.access_key
|
|
598
|
-
self._secret_key = frozen.secret_key
|
|
599
|
-
self._security_token = frozen.token
|
|
600
|
-
|
|
601
|
-
def _obtain_credentials_from_cache_or_boto3(self):
|
|
602
|
-
"""
|
|
603
|
-
Get the cached credentials.
|
|
604
|
-
|
|
605
|
-
Or retrieve them from Boto 3 and cache them
|
|
606
|
-
(or wait for another cooperating process to do so) if they are missing
|
|
607
|
-
or not fresh enough.
|
|
608
|
-
"""
|
|
609
|
-
cache_path = '~/.cache/aws/cached_temporary_credentials'
|
|
610
|
-
path = os.path.expanduser(cache_path)
|
|
611
|
-
tmp_path = path + '.tmp'
|
|
612
|
-
while True:
|
|
613
|
-
log.debug('Attempting to read cached credentials from %s.', path)
|
|
614
|
-
try:
|
|
615
|
-
with open(path) as f:
|
|
616
|
-
content = f.read()
|
|
617
|
-
if content:
|
|
618
|
-
record = content.split('\n')
|
|
619
|
-
if len(record) != 4:
|
|
620
|
-
raise RuntimeError("Number of cached credentials is not 4.")
|
|
621
|
-
self._access_key = record[0]
|
|
622
|
-
self._secret_key = record[1]
|
|
623
|
-
self._security_token = record[2]
|
|
624
|
-
self._credential_expiry_time = str_to_datetime(record[3])
|
|
625
|
-
else:
|
|
626
|
-
log.debug('%s is empty. Credentials are not temporary.', path)
|
|
627
|
-
self._obtain_credentials_from_boto3()
|
|
628
|
-
return
|
|
629
|
-
except OSError as e:
|
|
630
|
-
if e.errno == errno.ENOENT:
|
|
631
|
-
log.debug('Cached credentials are missing.')
|
|
632
|
-
dir_path = os.path.dirname(path)
|
|
633
|
-
if not os.path.exists(dir_path):
|
|
634
|
-
log.debug('Creating parent directory %s', dir_path)
|
|
635
|
-
try:
|
|
636
|
-
# A race would be ok at this point
|
|
637
|
-
os.makedirs(dir_path, exist_ok=True)
|
|
638
|
-
except OSError as e2:
|
|
639
|
-
if e2.errno == errno.EROFS:
|
|
640
|
-
# Sometimes we don't actually have write access to ~.
|
|
641
|
-
# We may be running in a non-writable Toil container.
|
|
642
|
-
# We should just go get our own credentials
|
|
643
|
-
log.debug('Cannot use the credentials cache because we are working on a read-only filesystem.')
|
|
644
|
-
self._obtain_credentials_from_boto3()
|
|
645
|
-
else:
|
|
646
|
-
raise
|
|
647
|
-
else:
|
|
648
|
-
raise
|
|
649
|
-
else:
|
|
650
|
-
if self._credentials_need_refresh():
|
|
651
|
-
log.debug('Cached credentials are expired.')
|
|
652
|
-
else:
|
|
653
|
-
log.debug('Cached credentials exist and are still fresh.')
|
|
654
|
-
return
|
|
655
|
-
# We get here if credentials are missing or expired
|
|
656
|
-
log.debug('Racing to create %s.', tmp_path)
|
|
657
|
-
# Only one process, the winner, will succeed
|
|
658
|
-
try:
|
|
659
|
-
fd = os.open(tmp_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
|
|
660
|
-
except OSError as e:
|
|
661
|
-
if e.errno == errno.EEXIST:
|
|
662
|
-
log.debug('Lost the race to create %s. Waiting on winner to remove it.', tmp_path)
|
|
663
|
-
while os.path.exists(tmp_path):
|
|
664
|
-
time.sleep(0.1)
|
|
665
|
-
log.debug('Winner removed %s. Trying from the top.', tmp_path)
|
|
666
|
-
else:
|
|
667
|
-
raise
|
|
668
|
-
else:
|
|
669
|
-
try:
|
|
670
|
-
log.debug('Won the race to create %s. Requesting credentials from backend.', tmp_path)
|
|
671
|
-
self._obtain_credentials_from_boto3()
|
|
672
|
-
except:
|
|
673
|
-
os.close(fd)
|
|
674
|
-
fd = None
|
|
675
|
-
log.debug('Failed to obtain credentials, removing %s.', tmp_path)
|
|
676
|
-
# This unblocks the losers.
|
|
677
|
-
os.unlink(tmp_path)
|
|
678
|
-
# Bail out. It's too likely to happen repeatedly
|
|
679
|
-
raise
|
|
680
|
-
else:
|
|
681
|
-
if self._credential_expiry_time is None:
|
|
682
|
-
os.close(fd)
|
|
683
|
-
fd = None
|
|
684
|
-
log.debug('Credentials are not temporary. Leaving %s empty and renaming it to %s.',
|
|
685
|
-
tmp_path, path)
|
|
686
|
-
# No need to actually cache permanent credentials,
|
|
687
|
-
# because we know we aren't getting them from the
|
|
688
|
-
# metadata server or by assuming a role. Those both
|
|
689
|
-
# give temporary credentials.
|
|
690
|
-
else:
|
|
691
|
-
log.debug('Writing credentials to %s.', tmp_path)
|
|
692
|
-
with os.fdopen(fd, 'w') as fh:
|
|
693
|
-
fd = None
|
|
694
|
-
fh.write('\n'.join([
|
|
695
|
-
self._access_key,
|
|
696
|
-
self._secret_key,
|
|
697
|
-
self._security_token,
|
|
698
|
-
datetime_to_str(self._credential_expiry_time)]))
|
|
699
|
-
log.debug('Wrote credentials to %s. Renaming to %s.', tmp_path, path)
|
|
700
|
-
os.rename(tmp_path, path)
|
|
701
|
-
return
|
|
702
|
-
finally:
|
|
703
|
-
if fd is not None:
|
|
704
|
-
os.close(fd)
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
provider.Provider = BotoCredentialAdapter
|
|
708
|
-
|
|
709
478
|
except ImportError:
|
|
710
479
|
pass
|
|
@@ -58,6 +58,13 @@ class BatchJobExitReason(enum.IntEnum):
|
|
|
58
58
|
"""Internal error."""
|
|
59
59
|
MEMLIMIT: int = 6
|
|
60
60
|
"""Job hit batch system imposed memory limit."""
|
|
61
|
+
MISSING: int = 7
|
|
62
|
+
"""Job disappeared from the scheduler without actually stopping, so Toil killed it."""
|
|
63
|
+
MAXJOBDURATION: int = 8
|
|
64
|
+
"""Job ran longer than --maxJobDuration, so Toil killed it."""
|
|
65
|
+
PARTITION: int = 9
|
|
66
|
+
"""Job was not able to talk to the leader via the job store, so Toil declared it failed."""
|
|
67
|
+
|
|
61
68
|
|
|
62
69
|
@classmethod
|
|
63
70
|
def to_string(cls, value: int) -> str:
|
|
@@ -156,17 +163,19 @@ class AbstractBatchSystem(ABC):
|
|
|
156
163
|
"""
|
|
157
164
|
|
|
158
165
|
@abstractmethod
|
|
159
|
-
def issueBatchJob(self,
|
|
166
|
+
def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
160
167
|
"""
|
|
161
168
|
Issues a job with the specified command to the batch system and returns
|
|
162
|
-
a unique
|
|
169
|
+
a unique job ID number.
|
|
163
170
|
|
|
164
|
-
:param
|
|
171
|
+
:param command: the command to execute somewhere to run the Toil
|
|
172
|
+
worker process
|
|
173
|
+
:param job_desc: the JobDescription for the job being run
|
|
165
174
|
:param job_environment: a collection of job-specific environment
|
|
166
|
-
|
|
175
|
+
variables to be set on the worker.
|
|
167
176
|
|
|
168
|
-
:return: a unique
|
|
169
|
-
|
|
177
|
+
:return: a unique job ID number that can be used to reference the newly
|
|
178
|
+
issued job
|
|
170
179
|
"""
|
|
171
180
|
raise NotImplementedError()
|
|
172
181
|
|
|
@@ -188,20 +197,20 @@ class AbstractBatchSystem(ABC):
|
|
|
188
197
|
"""
|
|
189
198
|
Gets all currently issued jobs
|
|
190
199
|
|
|
191
|
-
:return: A list of jobs (as
|
|
192
|
-
|
|
193
|
-
|
|
200
|
+
:return: A list of jobs (as job ID numbers) currently issued (may be
|
|
201
|
+
running, or may be waiting to be run). Despite the result being a
|
|
202
|
+
list, the ordering should not be depended upon.
|
|
194
203
|
"""
|
|
195
204
|
raise NotImplementedError()
|
|
196
205
|
|
|
197
206
|
@abstractmethod
|
|
198
207
|
def getRunningBatchJobIDs(self) -> Dict[int, float]:
|
|
199
208
|
"""
|
|
200
|
-
Gets a map of jobs as
|
|
201
|
-
and how long they have been running, in seconds.
|
|
209
|
+
Gets a map of jobs as job ID numbers that are currently running (not
|
|
210
|
+
just waiting) and how long they have been running, in seconds.
|
|
202
211
|
|
|
203
|
-
:return: dictionary with currently running
|
|
204
|
-
|
|
212
|
+
:return: dictionary with currently running job ID number keys and how
|
|
213
|
+
many seconds they have been running as the value
|
|
205
214
|
"""
|
|
206
215
|
raise NotImplementedError()
|
|
207
216
|
|
|
@@ -25,6 +25,7 @@ from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
|
|
|
25
25
|
from toil.bus import ExternalBatchIdMessage, get_job_kind
|
|
26
26
|
from toil.job import AcceleratorRequirement
|
|
27
27
|
from toil.lib.misc import CalledProcessErrorStderr
|
|
28
|
+
from toil.lib.retry import old_retry, DEFAULT_DELAYS
|
|
28
29
|
|
|
29
30
|
logger = logging.getLogger(__name__)
|
|
30
31
|
|
|
@@ -44,26 +45,29 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
44
45
|
A partial implementation of BatchSystemSupport for batch systems run on a
|
|
45
46
|
standard HPC cluster. By default auto-deployment is not implemented.
|
|
46
47
|
"""
|
|
48
|
+
class GridEngineThreadException(Exception):
|
|
49
|
+
pass
|
|
47
50
|
|
|
48
|
-
class
|
|
49
|
-
|
|
51
|
+
class GridEngineThread(Thread, metaclass=ABCMeta):
|
|
50
52
|
def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
|
|
51
53
|
"""
|
|
52
|
-
Abstract
|
|
54
|
+
Abstract thread interface class. All instances are created with five
|
|
53
55
|
initial arguments (below). Note the Queue instances passed are empty.
|
|
54
56
|
|
|
55
57
|
:param newJobsQueue: a Queue of new (unsubmitted) jobs
|
|
56
58
|
:param updatedJobsQueue: a Queue of jobs that have been updated
|
|
57
59
|
:param killQueue: a Queue of active jobs that need to be killed
|
|
58
|
-
:param killedJobsQueue: Queue of killed jobs for this
|
|
60
|
+
:param killedJobsQueue: Queue of killed jobs for this thread
|
|
59
61
|
:param boss: the AbstractGridEngineBatchSystem instance that
|
|
60
|
-
controls this
|
|
62
|
+
controls this GridEngineThread
|
|
61
63
|
|
|
62
64
|
"""
|
|
63
65
|
Thread.__init__(self)
|
|
64
66
|
self.boss = boss
|
|
65
67
|
self.boss.config.statePollingWait = \
|
|
66
68
|
self.boss.config.statePollingWait or self.boss.getWaitDuration()
|
|
69
|
+
self.boss.config.state_polling_timeout = \
|
|
70
|
+
self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
|
|
67
71
|
self.newJobsQueue = newJobsQueue
|
|
68
72
|
self.updatedJobsQueue = updatedJobsQueue
|
|
69
73
|
self.killQueue = killQueue
|
|
@@ -74,6 +78,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
74
78
|
self.batchJobIDs: Dict[int, str] = dict()
|
|
75
79
|
self._checkOnJobsCache = None
|
|
76
80
|
self._checkOnJobsTimestamp = None
|
|
81
|
+
self.exception = None
|
|
77
82
|
|
|
78
83
|
def getBatchSystemID(self, jobID: int) -> str:
|
|
79
84
|
"""
|
|
@@ -107,7 +112,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
107
112
|
"""
|
|
108
113
|
Create a new job with the given attributes.
|
|
109
114
|
|
|
110
|
-
Implementation-specific; called by
|
|
115
|
+
Implementation-specific; called by GridEngineThread.run()
|
|
111
116
|
"""
|
|
112
117
|
activity = False
|
|
113
118
|
# Load new job id if present:
|
|
@@ -143,7 +148,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
143
148
|
|
|
144
149
|
def killJobs(self):
|
|
145
150
|
"""
|
|
146
|
-
Kill any running jobs within
|
|
151
|
+
Kill any running jobs within thread
|
|
147
152
|
"""
|
|
148
153
|
killList = list()
|
|
149
154
|
while True:
|
|
@@ -175,7 +180,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
175
180
|
while killList:
|
|
176
181
|
for jobID in list(killList):
|
|
177
182
|
batchJobID = self.getBatchSystemID(jobID)
|
|
178
|
-
|
|
183
|
+
exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
|
|
184
|
+
if exit_code is not None:
|
|
179
185
|
logger.debug('Adding jobID %s to killedJobsQueue', jobID)
|
|
180
186
|
self.killedJobsQueue.put(jobID)
|
|
181
187
|
killList.remove(jobID)
|
|
@@ -273,14 +279,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
273
279
|
while self._runStep():
|
|
274
280
|
pass
|
|
275
281
|
except Exception as ex:
|
|
276
|
-
|
|
277
|
-
|
|
282
|
+
self.exception = ex
|
|
283
|
+
logger.error("GridEngine like batch system failure: %s", ex)
|
|
284
|
+
# don't raise exception as is_alive will still be set to false,
|
|
285
|
+
# signalling exception in the thread as we expect the thread to
|
|
286
|
+
# always be running for the duration of the workflow
|
|
278
287
|
|
|
279
288
|
def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
|
|
280
289
|
"""
|
|
281
290
|
Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
|
|
282
291
|
|
|
283
|
-
Called by
|
|
292
|
+
Called by GridEngineThread.checkOnJobs().
|
|
284
293
|
|
|
285
294
|
This is an optional part of the interface. It should raise
|
|
286
295
|
NotImplementedError if not actually implemented for a particular
|
|
@@ -341,7 +350,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
341
350
|
def killJob(self, jobID):
|
|
342
351
|
"""
|
|
343
352
|
Kill specific job with the Toil job ID. Implementation-specific; called
|
|
344
|
-
by
|
|
353
|
+
by GridEngineThread.killJobs()
|
|
345
354
|
|
|
346
355
|
:param string jobID: Toil job ID
|
|
347
356
|
"""
|
|
@@ -356,7 +365,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
356
365
|
|
|
357
366
|
If the job is not running but the exit code is not available, it
|
|
358
367
|
will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
|
|
359
|
-
called by
|
|
368
|
+
called by GridEngineThread.checkOnJobs().
|
|
360
369
|
|
|
361
370
|
The exit code will only be 0 if the job affirmatively succeeded.
|
|
362
371
|
|
|
@@ -375,24 +384,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
375
384
|
self.updatedJobsQueue = Queue()
|
|
376
385
|
self.killQueue = Queue()
|
|
377
386
|
self.killedJobsQueue = Queue()
|
|
378
|
-
# get the associated
|
|
379
|
-
self.
|
|
380
|
-
|
|
381
|
-
self.
|
|
387
|
+
# get the associated thread class here
|
|
388
|
+
self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
|
|
389
|
+
self.killQueue, self.killedJobsQueue, self)
|
|
390
|
+
self.background_thread.start()
|
|
382
391
|
self._getRunningBatchJobIDsTimestamp = None
|
|
383
392
|
self._getRunningBatchJobIDsCache = {}
|
|
384
393
|
|
|
385
|
-
@classmethod
|
|
386
|
-
def supportsWorkerCleanup(cls):
|
|
387
|
-
return False
|
|
388
|
-
|
|
389
394
|
@classmethod
|
|
390
395
|
def supportsAutoDeployment(cls):
|
|
391
396
|
return False
|
|
392
397
|
|
|
393
|
-
def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
|
|
398
|
+
def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
|
|
394
399
|
# Avoid submitting internal jobs to the batch queue, handle locally
|
|
395
|
-
localID = self.handleLocalJob(jobDesc)
|
|
400
|
+
localID = self.handleLocalJob(command, jobDesc)
|
|
396
401
|
if localID is not None:
|
|
397
402
|
return localID
|
|
398
403
|
else:
|
|
@@ -406,10 +411,10 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
406
411
|
gpus = accelerator['count']
|
|
407
412
|
else:
|
|
408
413
|
gpus = jobDesc.accelerators
|
|
409
|
-
|
|
410
|
-
self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory,
|
|
414
|
+
|
|
415
|
+
self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
|
|
411
416
|
job_environment, gpus))
|
|
412
|
-
logger.debug("Issued the job command: %s with job id: %s and job name %s",
|
|
417
|
+
logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
|
|
413
418
|
get_job_kind(jobDesc.get_names()))
|
|
414
419
|
return jobID
|
|
415
420
|
|
|
@@ -424,7 +429,12 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
424
429
|
for jobID in jobIDs:
|
|
425
430
|
self.killQueue.put(jobID)
|
|
426
431
|
while jobIDs:
|
|
427
|
-
|
|
432
|
+
try:
|
|
433
|
+
killedJobId = self.killedJobsQueue.get(timeout=10)
|
|
434
|
+
except Empty:
|
|
435
|
+
if not self.background_thread.is_alive():
|
|
436
|
+
raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
|
|
437
|
+
continue
|
|
428
438
|
if killedJobId is None:
|
|
429
439
|
break
|
|
430
440
|
jobIDs.remove(killedJobId)
|
|
@@ -456,7 +466,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
456
466
|
self.config.statePollingWait):
|
|
457
467
|
batchIds = self._getRunningBatchJobIDsCache
|
|
458
468
|
else:
|
|
459
|
-
batchIds = self.with_retries(self.
|
|
469
|
+
batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
|
|
460
470
|
self._getRunningBatchJobIDsCache = batchIds
|
|
461
471
|
self._getRunningBatchJobIDsTimestamp = datetime.now()
|
|
462
472
|
batchIds.update(self.getRunningLocalJobIDs())
|
|
@@ -464,6 +474,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
464
474
|
|
|
465
475
|
def getUpdatedBatchJob(self, maxWait):
|
|
466
476
|
local_tuple = self.getUpdatedLocalJob(0)
|
|
477
|
+
|
|
478
|
+
if not self.background_thread.is_alive():
|
|
479
|
+
# kill remaining jobs on the thread
|
|
480
|
+
self.background_thread.killJobs()
|
|
481
|
+
raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
|
|
467
482
|
if local_tuple:
|
|
468
483
|
return local_tuple
|
|
469
484
|
else:
|
|
@@ -477,14 +492,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
477
492
|
|
|
478
493
|
def shutdown(self) -> None:
|
|
479
494
|
"""
|
|
480
|
-
Signals
|
|
495
|
+
Signals thread to shutdown (via sentinel) then cleanly joins the thread
|
|
481
496
|
"""
|
|
482
497
|
self.shutdownLocal()
|
|
483
498
|
newJobsQueue = self.newJobsQueue
|
|
484
499
|
self.newJobsQueue = None
|
|
485
500
|
|
|
486
501
|
newJobsQueue.put(None)
|
|
487
|
-
self.
|
|
502
|
+
self.background_thread.join()
|
|
488
503
|
|
|
489
504
|
def setEnv(self, name, value=None):
|
|
490
505
|
if value and ',' in value:
|
|
@@ -503,21 +518,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
503
518
|
|
|
504
519
|
def with_retries(self, operation, *args, **kwargs):
|
|
505
520
|
"""
|
|
506
|
-
Call operation with args and kwargs. If one of the calls to
|
|
507
|
-
command fails, sleep and try again
|
|
521
|
+
Call operation with args and kwargs. If one of the calls to a
|
|
522
|
+
command fails, sleep and try again.
|
|
508
523
|
"""
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
else:
|
|
521
|
-
logger.error("Failed operation %s, code %d: %s",
|
|
524
|
+
for attempt in old_retry(
|
|
525
|
+
# Don't retry more often than the state polling wait.
|
|
526
|
+
delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
|
|
527
|
+
timeout=self.config.state_polling_timeout,
|
|
528
|
+
predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
|
|
529
|
+
):
|
|
530
|
+
with attempt:
|
|
531
|
+
try:
|
|
532
|
+
return operation(*args, **kwargs)
|
|
533
|
+
except CalledProcessErrorStderr as err:
|
|
534
|
+
logger.error("Errored operation %s, code %d: %s",
|
|
522
535
|
operation.__name__, err.returncode, err.stderr)
|
|
536
|
+
# Raise up to the retry logic, which will retry until timeout
|
|
523
537
|
raise err
|
toil/batchSystems/awsBatch.py
CHANGED
|
@@ -36,7 +36,7 @@ import uuid
|
|
|
36
36
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
37
37
|
from typing import Any, Dict, Iterator, List, Optional, Set, Union
|
|
38
38
|
|
|
39
|
-
from
|
|
39
|
+
from botocore.exceptions import ClientError
|
|
40
40
|
|
|
41
41
|
from toil import applianceSelf
|
|
42
42
|
from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
@@ -156,9 +156,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
156
156
|
'AWS Batch can only provide nvidia gpu accelerators.'
|
|
157
157
|
])
|
|
158
158
|
|
|
159
|
-
def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
159
|
+
def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
160
160
|
# Try the job as local
|
|
161
|
-
local_id = self.handleLocalJob(job_desc)
|
|
161
|
+
local_id = self.handleLocalJob(command, job_desc)
|
|
162
162
|
if local_id is not None:
|
|
163
163
|
# It is a local job
|
|
164
164
|
return local_id
|
|
@@ -184,7 +184,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
184
184
|
environment.update(job_environment)
|
|
185
185
|
|
|
186
186
|
# Make a command to run it in the executor
|
|
187
|
-
command_list = pack_job(
|
|
187
|
+
command_list = pack_job(command, self.user_script)
|
|
188
188
|
|
|
189
189
|
# Compose a job spec to submit
|
|
190
190
|
job_spec = {
|
|
@@ -376,7 +376,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
376
376
|
# Get rid of the job definition we are using if we can.
|
|
377
377
|
self._destroy_job_definition()
|
|
378
378
|
|
|
379
|
-
@retry(errors=[
|
|
379
|
+
@retry(errors=[ClientError])
|
|
380
380
|
def _try_terminate(self, aws_id: str) -> None:
|
|
381
381
|
"""
|
|
382
382
|
Internal function. Should not be called outside this class.
|
|
@@ -392,7 +392,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
392
392
|
# Kill the AWS Batch job
|
|
393
393
|
self.client.terminate_job(jobId=aws_id, reason='Killed by Toil')
|
|
394
394
|
|
|
395
|
-
@retry(errors=[
|
|
395
|
+
@retry(errors=[ClientError])
|
|
396
396
|
def _wait_until_stopped(self, aws_id: str) -> None:
|
|
397
397
|
"""
|
|
398
398
|
Internal function. Should not be called outside this class.
|
|
@@ -418,7 +418,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
418
418
|
logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id))
|
|
419
419
|
time.sleep(2)
|
|
420
420
|
|
|
421
|
-
@retry(errors=[
|
|
421
|
+
@retry(errors=[ClientError])
|
|
422
422
|
def _get_or_create_job_definition(self) -> str:
|
|
423
423
|
"""
|
|
424
424
|
Internal function. Should not be called outside this class.
|
|
@@ -482,7 +482,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
482
482
|
|
|
483
483
|
return self.job_definition
|
|
484
484
|
|
|
485
|
-
@retry(errors=[
|
|
485
|
+
@retry(errors=[ClientError])
|
|
486
486
|
def _destroy_job_definition(self) -> None:
|
|
487
487
|
"""
|
|
488
488
|
Internal function. Should not be called outside this class.
|
|
@@ -25,18 +25,17 @@ import sys
|
|
|
25
25
|
from typing import Any, Dict, List, Optional
|
|
26
26
|
|
|
27
27
|
from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE
|
|
28
|
-
from toil.job import JobDescription
|
|
29
28
|
from toil.resource import Resource
|
|
30
29
|
from toil.statsAndLogging import configure_root_logger, set_log_level
|
|
31
30
|
|
|
32
31
|
logger = logging.getLogger(__name__)
|
|
33
32
|
|
|
34
33
|
|
|
35
|
-
def pack_job(
|
|
34
|
+
def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
|
|
36
35
|
"""
|
|
37
|
-
Create a command that
|
|
36
|
+
Create a command that runs the given command in an environment.
|
|
38
37
|
|
|
39
|
-
:param
|
|
38
|
+
:param command: Worker command to run to run the job.
|
|
40
39
|
:param user_script: User script that will be loaded before the job is run.
|
|
41
40
|
:param environment: Environment variable dict that will be applied before
|
|
42
41
|
the job is run.
|
|
@@ -46,7 +45,7 @@ def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, e
|
|
|
46
45
|
"""
|
|
47
46
|
# Make a job dict to send to the executor.
|
|
48
47
|
# TODO: Factor out executor setup from here and Kubernetes and TES
|
|
49
|
-
job: Dict[str, Any] = {"command":
|
|
48
|
+
job: Dict[str, Any] = {"command": command}
|
|
50
49
|
if user_script is not None:
|
|
51
50
|
# If there's a user script resource be sure to send it along
|
|
52
51
|
job['userScript'] = user_script
|