toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/gridengine.py
CHANGED
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 
 class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         """
         Grid Engine-specific AbstractGridEngineWorker methods
         """
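This Worker-to-GridEngineThread rename repeats across gridengine, htcondor, lsf, slurm, and torque below. For a third-party grid-engine-style batch system, only the nested class name and its base change; a minimal hedged sketch (the plugin class and the stub body are hypothetical, and a real implementation must fill in the rest of the GridEngineThread interface):

from toil.batchSystems.abstractGridEngineBatchSystem import AbstractGridEngineBatchSystem

class ExampleGridBatchSystem(AbstractGridEngineBatchSystem):
    """Hypothetical 7.0.0-style plugin skeleton."""

    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
        def getRunningJobIDs(self):
            # Map each running batch-system job ID to seconds it has been
            # running; stubbed out here purely to illustrate the new name.
            return {}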
toil/batchSystems/htcondor.py
CHANGED
@@ -48,7 +48,7 @@ schedd_lock = Lock()
 class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
     # When using HTCondor, the Schedd handles scheduling
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
         # Override the createJobs method so that we can use htcondor.Submit objects
         # and so that we can get disk allocation requests and ceil the CPU request.
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
         return '"' + ' '.join(env_items) + '"'
 
     # Override the issueBatchJob method so HTCondor can be given the disk request
-    def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
         # Avoid submitting internal jobs to the batch queue, handle locally
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
         else:
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
             self.currentJobs.add(jobID)
 
             # Construct our style of job tuple
-            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName,
+            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
                                    job_environment or {}, jobNode.accelerators))
-            logger.debug("Issued the job command: %s with job id: %s ", jobNode.command, str(jobID))
+            logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
             return jobID
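This signature change recurs for every batch system in the release: the worker invocation is now passed to issueBatchJob as an explicit string instead of being read from the job's description. A hedged call-site sketch, assuming an already-constructed batch system bs and a JobDescription desc; the command string is illustrative only, mirroring the "_toil_worker <jobName> <jobStoreLocator> <jobStoreID>" shape that singleMachine.py parses below:

# bs and desc are assumed to exist; the command string is a placeholder.
command = "_toil_worker ExampleJob file:/tmp/jobstore kind-ExampleJob/instance-abc"
job_id = bs.issueBatchJob(command, desc, job_environment={"EXAMPLE_VAR": "1"})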
toil/batchSystems/kubernetes.py
CHANGED
@@ -47,6 +47,8 @@ from typing import (Any,
                     cast,
                     overload)
 
+from toil.lib.conversions import opt_strtobool
+
 if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec
 else:
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
                                V1SecretVolumeSource,
                                V1Toleration,
                                V1Volume,
-                               V1VolumeMount)
+                               V1VolumeMount, V1SecurityContext)
 from kubernetes.client.api_client import ApiClient
 from kubernetes.client.exceptions import ApiException
 from kubernetes.config.config_exception import ConfigException
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
     def _create_pod_spec(
         self,
+        command: str,
         job_desc: JobDescription,
         job_environment: Optional[Dict[str, str]] = None
     ) -> V1PodSpec:
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         environment.update(job_environment)
 
         # Make a command to run it in the executor
-        command_list = pack_job(job_desc, self.user_script, environment=environment)
+        command_list = pack_job(command, self.user_script, environment=environment)
 
         # The Kubernetes API makes sense only in terms of the YAML format. Objects
         # represent sections of the YAML files. Except from our point of view, all
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
         # Make a container definition
         container = V1Container(command=command_list,
-
-
-
-
+                                image=self.docker_image,
+                                name="runner-container",
+                                resources=resources,
+                                volume_mounts=mounts)
+
+        # In case security context rules are not allowed to be set, we only apply
+        # a security context at all if we need to turn on privileged mode.
+        if self.config.kubernetes_privileged:
+            container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
+
         # Wrap the container in a spec
         pod_spec = V1PodSpec(containers=[container],
-
-
+                             volumes=volumes,
+                             restart_policy="Never")
         # Tell the spec where to land
         placement.apply(pod_spec)
 
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             self._release_acquired_resources(resources, notify=resource_notify)
             del self._acquired_resources[job_name]
 
-    def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         # Try the job as local
-        localID = self.handleLocalJob(job_desc)
+        localID = self.handleLocalJob(command, job_desc)
         if localID is not None:
             # It is a local job
             return localID
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         self.check_resource_request(job_desc)
 
         # Make a pod that describes running the job
-        pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
+        pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
 
         # Make a batch system scope job ID
         job_id = self.getNextJobID()
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
                             help="Seconds to wait for a scheduled Kubernetes pod to start running. "
                                  "(default: %(default)s)")
+        parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
+                            help="Whether to ask worker pods to run in privileged mode. This should be used to access "
+                                 "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
+                                 "this is set to True. (default: %(default)s)")
 
     OptionType = TypeVar('OptionType')
     @classmethod
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         setOption("kubernetes_owner")
         setOption("kubernetes_service_account",)
         setOption("kubernetes_pod_timeout")
+        setOption("kubernetes_privileged")
 
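The new privileged-mode option is reachable from the command line (--kubernetesPrivileged True), the environment (TOIL_KUBERNETES_PRIVILEGED=True), or Python. A hedged sketch of the Python route; the job store locator is a placeholder:

from toil.job import Job

# Equivalent to the flag or environment-variable forms above.
options = Job.Runner.getDefaultOptions("file:/tmp/example-jobstore")  # placeholder
options.batchSystem = "kubernetes"
options.kubernetes_privileged = True  # matches the dest= of the new argument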
toil/batchSystems/local_support.py
CHANGED
@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
         )
 
-    def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
+    def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
         """
-        To be called by
+        To be called by issueBatchJob.
 
         Returns the jobID if the jobDesc has been submitted to the local queue,
         otherwise returns None
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             # somehow doesn't error whereas just returning the value complains
             # we're returning an Any. TODO: When singleMachine.py typechecks,
             # remove all these extra variables.
-            local_id: int = self.localBatch.issueBatchJob(jobDesc)
+            local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
             return local_id
         else:
             return None
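The delegation contract is unchanged apart from the new first argument; a hedged sketch of how an issueBatchJob implementation is expected to use it (class context omitted, and _submit_to_scheduler is hypothetical):

def issueBatchJob(self, command, job_desc, job_environment=None):
    # Run Toil-internal jobs on the leader when possible; handleLocalJob
    # returns a job ID if the local queue took the job, else None.
    local_id = self.handleLocalJob(command, job_desc)
    if local_id is not None:
        return local_id
    return self._submit_to_scheduler(command, job_desc, job_environment)  # hypothetical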
toil/batchSystems/lsf.py
CHANGED
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 
 class LSFBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
-        """LSF specific AbstractGridEngineWorker methods."""
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        """LSF specific GridEngineThread methods."""
 
         def getRunningJobIDs(self):
             times = {}
toil/batchSystems/mesos/batchSystem.py
CHANGED
@@ -174,13 +174,13 @@ class MesosBatchSystem(BatchSystemLocalSupport,
     def unignoreNode(self, nodeAddress):
         self.ignoredNodes.remove(nodeAddress)
 
-    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
         """
         Issues the following command returning a unique jobID. Command is the string to run, memory
         is an int giving the number of bytes the job needs to run in and cores is the number of cpus
         needed for the job and error-file is the path of the file to place any std-err/std-out in.
         """
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
 
@@ -200,12 +200,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
         job = ToilJob(jobID=jobID,
                       name=str(jobNode),
                       resources=MesosShape(wallTime=0, **mesos_resources),
-                      command=jobNode.command,
+                      command=command,
                       userScript=self.userScript,
                       environment=environment,
                       workerCleanupInfo=self.workerCleanupInfo)
         jobType = job.resources
-        log.debug("Queueing the job
+        log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
 
         # TODO: round all elements of resources
 
toil/batchSystems/mesos/executor.py
CHANGED
@@ -196,12 +196,13 @@ class MesosExecutor(Executor):
         """
         if job.userScript:
             job.userScript.register()
-        log.debug("Invoking command: '%s'", job.command)
+        command = job.command
+        log.debug("Invoking command: '%s'", command)
         # Construct the job's environment
         jobEnv = dict(os.environ, **job.environment)
         log.debug('Using environment variables: %s', jobEnv.keys())
         with self.popenLock:
-            return subprocess.Popen(job.command,
+            return subprocess.Popen(command,
                                     preexec_fn=lambda: os.setpgrp(),
                                     shell=True, env=jobEnv)
 
toil/batchSystems/options.py
CHANGED
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSetter
     set_option("manualMemArgs")
     set_option("run_local_jobs_on_workers")
     set_option("statePollingWait")
+    set_option("state_polling_timeout")
     set_option("batch_logs_dir")
 
 
@@ -164,6 +165,14 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         "Return cached results if within the waiting period. Only works for grid "
         "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
     )
+    parser.add_argument(
+        "--statePollingTimeout",
+        dest="state_polling_timeout",
+        type=int,
+        default=1200,
+        help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
+             "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
+    )
     parser.add_argument(
         "--batchLogsDir",
         dest="batch_logs_dir",
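A hedged sketch of configuring the new timeout next to the existing polling interval; values and the job store locator are illustrative:

from toil.job import Job

options = Job.Runner.getDefaultOptions("file:/tmp/example-jobstore")  # placeholder
options.batchSystem = "slurm"
options.statePollingWait = 5           # seconds between scheduler polls
options.state_polling_timeout = 1200   # give up on a broken scheduler, per the new flag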
toil/batchSystems/singleMachine.py
CHANGED
@@ -475,17 +475,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
             # We can actually run in this thread
             jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4]  # Parse command
             jobStore = Toil.resumeJobStore(jobStoreLocator)
-            toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
-                                     redirect_output_to_log_file=not self.debugWorker)  # Call the worker
+            statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
+                                                  redirect_output_to_log_file=not self.debugWorker)  # Call the worker
         else:
             # Run synchronously. If starting or running the command fails, let the exception stop us.
-            subprocess.check_call(jobCommand,
+            statusCode = subprocess.check_call(jobCommand,
                                   shell=True,
                                   env=dict(os.environ, **environment))
 
         self.runningJobs.pop(jobID)
         if not info.killIntended:
-            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None))
+            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None))
 
     def getSchedulingStatusMessage(self):
         # Implement the abstractBatchSystem's scheduling status message API
@@ -655,6 +655,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
             # and all its children together. We assume that the
             # process group ID will equal the PID of the process we
             # are starting.
+            logger.debug("Attempting to run job command: %s", jobCommand)
             popen = subprocess.Popen(jobCommand,
                                      shell=True,
                                      env=child_environment,
@@ -743,24 +744,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):
 
         logger.debug('Child %d for job %s succeeded', pid, jobID)
 
-    def issueBatchJob(self,
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         """Adds the command and resources to a queue to be run."""
 
         self._checkOnDaddy()
 
         # Apply scale in cores
-        scaled_desc =
+        scaled_desc = job_desc.scale('cores', self.scale)
         # Round cores up to multiples of minCores
         scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores)
 
         # Don't do our own assertions about job size vs. our configured size.
         # The abstract batch system can handle it.
         self.check_resource_request(scaled_desc)
-        logger.debug(f"Issuing the command: {
+        logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}")
         with self.jobIndexLock:
             jobID = self.jobIndex
             self.jobIndex += 1
-        self.jobs[jobID] =
+        self.jobs[jobID] = command
 
         environment = self.environment.copy()
         if job_environment:
@@ -769,10 +770,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         if self.debugWorker:
             # Run immediately, blocking for return.
             # Ignore resource requirements; we run one job at a time
-            self._runDebugJob(
+            self._runDebugJob(command, jobID, environment)
         else:
             # Queue the job for later
-            self.inputQueue.put((
+            self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory,
                                  scaled_desc.disk, scaled_desc.accelerators, environment))
 
         return jobID
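With the worker's return code now captured instead of dropped, failures in debug-worker mode surface through UpdatedBatchJobInfo like any other. A hedged sketch of the leader-side view, assuming an already-constructed batch system bs:

# getUpdatedBatchJob blocks up to maxWait seconds and returns an
# UpdatedBatchJobInfo namedtuple (jobID, exitStatus, exitReason, wallTime)
# or None if nothing finished in time.
info = bs.getUpdatedBatchJob(maxWait=10)
if info is not None and info.exitStatus != 0:
    print(f"job {info.jobID} failed with exit status {info.exitStatus}")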
toil/batchSystems/slurm.py
CHANGED
@@ -16,9 +16,9 @@ import math
 import os
 from argparse import ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
 
-from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
 from toil.batchSystems.abstractGridEngineBatchSystem import \
     AbstractGridEngineBatchSystem
 from toil.batchSystems.options import OptionSetter
@@ -27,10 +27,50 @@ from toil.lib.misc import CalledProcessErrorStderr, call_command
 
 logger = logging.getLogger(__name__)
 
+# We have a complete list of Slurm states. States not in one of these aren't
+# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>
+
+# If a job is in one of these states, Slurm can't run it anymore.
+# We don't include states where the job is held or paused here;
+# those mean it could run and needs to wait for someone to un-hold
+# it, so Toil should wait for it.
+#
+# We map from each terminal state to the Toil-ontology exit reason.
+TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+    "BOOT_FAIL": BatchJobExitReason.LOST,
+    "CANCELLED": BatchJobExitReason.KILLED,
+    "COMPLETED": BatchJobExitReason.FINISHED,
+    "DEADLINE": BatchJobExitReason.KILLED,
+    "FAILED": BatchJobExitReason.FAILED,
+    "NODE_FAIL": BatchJobExitReason.LOST,
+    "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+    "PREEMPTED": BatchJobExitReason.KILLED,
+    "REVOKED": BatchJobExitReason.KILLED,
+    "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+    "TIMEOUT": BatchJobExitReason.KILLED
+}
+
+# If a job is in one of these states, it might eventually move to a different
+# state.
+NONTERMINAL_STATES: Set[str] = {
+    "CONFIGURING",
+    "COMPLETING",
+    "PENDING",
+    "RUNNING",
+    "RESV_DEL_HOLD",
+    "REQUEUE_FED",
+    "REQUEUE_HOLD",
+    "REQUEUED",
+    "RESIZING",
+    "SIGNALING",
+    "STAGE_OUT",
+    "STOPPED",
+    "SUSPENDED"
+}
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
        def getRunningJobIDs(self):
            # Should return a dictionary of Job IDs and number of seconds
@@ -95,7 +135,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                logger.debug("sbatch submitted job %d", result)
                return result
            except OSError as e:
-                logger.error("sbatch command failed")
+                logger.error(f"sbatch command failed with error: {e}")
                raise e
 
        def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
@@ -165,24 +205,6 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            """
            state, rc = status
 
-            # If a job is in one of these states, Slurm can't run it anymore.
-            # We don't include states where the job is held or paused here;
-            # those mean it could run and needs to wait for someone to un-hold
-            # it, so Toil should wait for it.
-            #
-            # We map from each terminal state to the Toil-ontology exit reason.
-            TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
-                "BOOT_FAIL": BatchJobExitReason.LOST,
-                "CANCELLED": BatchJobExitReason.KILLED,
-                "COMPLETED": BatchJobExitReason.FINISHED,
-                "DEADLINE": BatchJobExitReason.KILLED,
-                "FAILED": BatchJobExitReason.FAILED,
-                "NODE_FAIL": BatchJobExitReason.LOST,
-                "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
-                "PREEMPTED": BatchJobExitReason.KILLED,
-                "TIMEOUT": BatchJobExitReason.KILLED
-            }
-
            if state not in TERMINAL_STATES:
                # Don't treat the job as exited yet
                return None
@@ -204,6 +226,24 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
            # If the code is nonzero, pass it along.
            return (rc, exit_reason)
 
+        def _canonicalize_state(self, state: str) -> str:
+            """
+            Turn a state string form SLURM into just the state token like "CANCELED".
+            """
+
+            # Slurm will sometimes send something like "CANCELED by 30065" in
+            # the state column for some reason.
+
+            state_token = state
+
+            if " " in state_token:
+                state_token = state.split(" ", 1)[0]
+
+            if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
+                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+            return state_token
+
        def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
            """
            Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
@@ -231,6 +271,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                if len(values) < 3:
                    continue
                job_id_raw, state, exitcode = values
+                state = self._canonicalize_state(state)
                logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
                # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                job_id_parts = job_id_raw.split(".")
@@ -305,6 +346,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                if job_id not in job_id_list:
                    continue
                state = job['JobState']
+                state = self._canonicalize_state(state)
                logger.debug("%s state of job %s is %s", args[0], job_id, state)
                try:
                    exitcode = job['ExitCode']
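Illustrative behavior of the new canonicalization, assuming a constructed GridEngineThread instance named thread; the first case is the one the in-code comment describes:

thread._canonicalize_state("CANCELLED by 30065")  # -> "CANCELLED"
thread._canonicalize_state("COMPLETED")           # -> "COMPLETED"
thread._canonicalize_state("NOT_A_STATE")         # raises RuntimeError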
toil/batchSystems/torque.py
CHANGED
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class TorqueBatchSystem(AbstractGridEngineBatchSystem):
 
     # class-specific Worker
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         def __init__(
             self, newJobsQueue, updatedJobsQueue, killQueue, killedJobsQueue, boss
         ):
toil/bus.py
CHANGED
@@ -20,7 +20,7 @@ functions to "handle" different things happening. Over time, it has become very
 brittle: exactly the right handling functions need to be called in exactly the
 right order, or it gets confused and does the wrong thing.
 
-The MessageBus is meant to let the leader avoid this by more
+The MessageBus is meant to let the leader avoid this by more loosely coupling
 its components together, by having them communicate by sending messages instead
 of by calling functions.
 
@@ -741,12 +741,16 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]:
 
     return job_statuses
 
-def gen_message_bus_path() -> str:
+def gen_message_bus_path(tmpdir: Optional[str] = None) -> str:
     """
     Return a file path in tmp to store the message bus at.
     Calling function is responsible for cleaning the generated file.
+
+    The tmpdir argument will override the directory that the
+    message bus will be made in. If not provided, the standard tempfile
+    order will be used.
     """
-    fd, path = tempfile.mkstemp()
+    fd, path = tempfile.mkstemp(dir=tmpdir)
     os.close(fd)
     return path
 #TODO Might want to clean up the tmpfile at some point after running the workflow