toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
|
@@ -174,13 +174,13 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
174
174
|
def unignoreNode(self, nodeAddress):
|
|
175
175
|
self.ignoredNodes.remove(nodeAddress)
|
|
176
176
|
|
|
177
|
-
def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
|
|
177
|
+
def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
|
|
178
178
|
"""
|
|
179
179
|
Issues the following command returning a unique jobID. Command is the string to run, memory
|
|
180
180
|
is an int giving the number of bytes the job needs to run in and cores is the number of cpus
|
|
181
181
|
needed for the job and error-file is the path of the file to place any std-err/std-out in.
|
|
182
182
|
"""
|
|
183
|
-
localID = self.handleLocalJob(jobNode)
|
|
183
|
+
localID = self.handleLocalJob(command, jobNode)
|
|
184
184
|
if localID is not None:
|
|
185
185
|
return localID
|
|
186
186
|
|
|
@@ -200,12 +200,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
200
200
|
job = ToilJob(jobID=jobID,
|
|
201
201
|
name=str(jobNode),
|
|
202
202
|
resources=MesosShape(wallTime=0, **mesos_resources),
|
|
203
|
-
command=
|
|
203
|
+
command=command,
|
|
204
204
|
userScript=self.userScript,
|
|
205
205
|
environment=environment,
|
|
206
206
|
workerCleanupInfo=self.workerCleanupInfo)
|
|
207
207
|
jobType = job.resources
|
|
208
|
-
log.debug("Queueing the job
|
|
208
|
+
log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
|
|
209
209
|
|
|
210
210
|
# TODO: round all elements of resources
|
|
211
211
|
|
|
@@ -196,12 +196,13 @@ class MesosExecutor(Executor):
|
|
|
196
196
|
"""
|
|
197
197
|
if job.userScript:
|
|
198
198
|
job.userScript.register()
|
|
199
|
-
|
|
199
|
+
command = job.command
|
|
200
|
+
log.debug("Invoking command: '%s'", command)
|
|
200
201
|
# Construct the job's environment
|
|
201
202
|
jobEnv = dict(os.environ, **job.environment)
|
|
202
203
|
log.debug('Using environment variables: %s', jobEnv.keys())
|
|
203
204
|
with self.popenLock:
|
|
204
|
-
return subprocess.Popen(
|
|
205
|
+
return subprocess.Popen(command,
|
|
205
206
|
preexec_fn=lambda: os.setpgrp(),
|
|
206
207
|
shell=True, env=jobEnv)
|
|
207
208
|
|
toil/batchSystems/options.py
CHANGED
|
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSette
|
|
|
76
76
|
set_option("manualMemArgs")
|
|
77
77
|
set_option("run_local_jobs_on_workers")
|
|
78
78
|
set_option("statePollingWait")
|
|
79
|
+
set_option("state_polling_timeout")
|
|
79
80
|
set_option("batch_logs_dir")
|
|
80
81
|
|
|
81
82
|
|
|
@@ -164,6 +165,14 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
164
165
|
"Return cached results if within the waiting period. Only works for grid "
|
|
165
166
|
"engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
|
|
166
167
|
)
|
|
168
|
+
parser.add_argument(
|
|
169
|
+
"--statePollingTimeout",
|
|
170
|
+
dest="state_polling_timeout",
|
|
171
|
+
type=int,
|
|
172
|
+
default=1200,
|
|
173
|
+
help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
|
|
174
|
+
"engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
|
|
175
|
+
)
|
|
167
176
|
parser.add_argument(
|
|
168
177
|
"--batchLogsDir",
|
|
169
178
|
dest="batch_logs_dir",
|
|
@@ -475,17 +475,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
475
475
|
# We can actually run in this thread
|
|
476
476
|
jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4] # Parse command
|
|
477
477
|
jobStore = Toil.resumeJobStore(jobStoreLocator)
|
|
478
|
-
toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
|
|
479
|
-
|
|
478
|
+
statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
|
|
479
|
+
redirect_output_to_log_file=not self.debugWorker) # Call the worker
|
|
480
480
|
else:
|
|
481
481
|
# Run synchronously. If starting or running the command fails, let the exception stop us.
|
|
482
|
-
subprocess.check_call(jobCommand,
|
|
482
|
+
statusCode = subprocess.check_call(jobCommand,
|
|
483
483
|
shell=True,
|
|
484
484
|
env=dict(os.environ, **environment))
|
|
485
485
|
|
|
486
486
|
self.runningJobs.pop(jobID)
|
|
487
487
|
if not info.killIntended:
|
|
488
|
-
self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=
|
|
488
|
+
self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None))
|
|
489
489
|
|
|
490
490
|
def getSchedulingStatusMessage(self):
|
|
491
491
|
# Implement the abstractBatchSystem's scheduling status message API
|
|
@@ -655,6 +655,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
655
655
|
# and all its children together. We assume that the
|
|
656
656
|
# process group ID will equal the PID of the process we
|
|
657
657
|
# are starting.
|
|
658
|
+
logger.debug("Attempting to run job command: %s", jobCommand)
|
|
658
659
|
popen = subprocess.Popen(jobCommand,
|
|
659
660
|
shell=True,
|
|
660
661
|
env=child_environment,
|
|
@@ -743,24 +744,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
743
744
|
|
|
744
745
|
logger.debug('Child %d for job %s succeeded', pid, jobID)
|
|
745
746
|
|
|
746
|
-
def issueBatchJob(self,
|
|
747
|
+
def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
747
748
|
"""Adds the command and resources to a queue to be run."""
|
|
748
749
|
|
|
749
750
|
self._checkOnDaddy()
|
|
750
751
|
|
|
751
752
|
# Apply scale in cores
|
|
752
|
-
scaled_desc =
|
|
753
|
+
scaled_desc = job_desc.scale('cores', self.scale)
|
|
753
754
|
# Round cores up to multiples of minCores
|
|
754
755
|
scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores)
|
|
755
756
|
|
|
756
757
|
# Don't do our own assertions about job size vs. our configured size.
|
|
757
758
|
# The abstract batch system can handle it.
|
|
758
759
|
self.check_resource_request(scaled_desc)
|
|
759
|
-
logger.debug(f"Issuing the command: {
|
|
760
|
+
logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}")
|
|
760
761
|
with self.jobIndexLock:
|
|
761
762
|
jobID = self.jobIndex
|
|
762
763
|
self.jobIndex += 1
|
|
763
|
-
self.jobs[jobID] =
|
|
764
|
+
self.jobs[jobID] = command
|
|
764
765
|
|
|
765
766
|
environment = self.environment.copy()
|
|
766
767
|
if job_environment:
|
|
@@ -769,10 +770,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
769
770
|
if self.debugWorker:
|
|
770
771
|
# Run immediately, blocking for return.
|
|
771
772
|
# Ignore resource requirements; we run one job at a time
|
|
772
|
-
self._runDebugJob(
|
|
773
|
+
self._runDebugJob(command, jobID, environment)
|
|
773
774
|
else:
|
|
774
775
|
# Queue the job for later
|
|
775
|
-
self.inputQueue.put((
|
|
776
|
+
self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory,
|
|
776
777
|
scaled_desc.disk, scaled_desc.accelerators, environment))
|
|
777
778
|
|
|
778
779
|
return jobID
|
toil/batchSystems/slurm.py
CHANGED
|
@@ -16,8 +16,9 @@ import math
|
|
|
16
16
|
import os
|
|
17
17
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
18
18
|
from shlex import quote
|
|
19
|
-
from typing import Dict, List, Optional, TypeVar, Union
|
|
19
|
+
from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
|
|
20
20
|
|
|
21
|
+
from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
|
|
21
22
|
from toil.batchSystems.abstractGridEngineBatchSystem import \
|
|
22
23
|
AbstractGridEngineBatchSystem
|
|
23
24
|
from toil.batchSystems.options import OptionSetter
|
|
@@ -26,10 +27,50 @@ from toil.lib.misc import CalledProcessErrorStderr, call_command
|
|
|
26
27
|
|
|
27
28
|
logger = logging.getLogger(__name__)
|
|
28
29
|
|
|
30
|
+
# We have a complete list of Slurm states. States not in one of these aren't
|
|
31
|
+
# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>
|
|
32
|
+
|
|
33
|
+
# If a job is in one of these states, Slurm can't run it anymore.
|
|
34
|
+
# We don't include states where the job is held or paused here;
|
|
35
|
+
# those mean it could run and needs to wait for someone to un-hold
|
|
36
|
+
# it, so Toil should wait for it.
|
|
37
|
+
#
|
|
38
|
+
# We map from each terminal state to the Toil-ontology exit reason.
|
|
39
|
+
TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
|
|
40
|
+
"BOOT_FAIL": BatchJobExitReason.LOST,
|
|
41
|
+
"CANCELLED": BatchJobExitReason.KILLED,
|
|
42
|
+
"COMPLETED": BatchJobExitReason.FINISHED,
|
|
43
|
+
"DEADLINE": BatchJobExitReason.KILLED,
|
|
44
|
+
"FAILED": BatchJobExitReason.FAILED,
|
|
45
|
+
"NODE_FAIL": BatchJobExitReason.LOST,
|
|
46
|
+
"OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
|
|
47
|
+
"PREEMPTED": BatchJobExitReason.KILLED,
|
|
48
|
+
"REVOKED": BatchJobExitReason.KILLED,
|
|
49
|
+
"SPECIAL_EXIT": BatchJobExitReason.FAILED,
|
|
50
|
+
"TIMEOUT": BatchJobExitReason.KILLED
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# If a job is in one of these states, it might eventually move to a different
|
|
54
|
+
# state.
|
|
55
|
+
NONTERMINAL_STATES: Set[str] = {
|
|
56
|
+
"CONFIGURING",
|
|
57
|
+
"COMPLETING",
|
|
58
|
+
"PENDING",
|
|
59
|
+
"RUNNING",
|
|
60
|
+
"RESV_DEL_HOLD",
|
|
61
|
+
"REQUEUE_FED",
|
|
62
|
+
"REQUEUE_HOLD",
|
|
63
|
+
"REQUEUED",
|
|
64
|
+
"RESIZING",
|
|
65
|
+
"SIGNALING",
|
|
66
|
+
"STAGE_OUT",
|
|
67
|
+
"STOPPED",
|
|
68
|
+
"SUSPENDED"
|
|
69
|
+
}
|
|
29
70
|
|
|
30
71
|
class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
31
72
|
|
|
32
|
-
class
|
|
73
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
33
74
|
|
|
34
75
|
def getRunningJobIDs(self):
|
|
35
76
|
# Should return a dictionary of Job IDs and number of seconds
|
|
@@ -64,7 +105,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
64
105
|
jobName: str,
|
|
65
106
|
job_environment: Optional[Dict[str, str]] = None,
|
|
66
107
|
gpus: Optional[int] = None) -> List[str]:
|
|
67
|
-
|
|
108
|
+
# Make sure to use exec so we can get Slurm's signals in the Toil
|
|
109
|
+
# worker instead of having an intervening Bash
|
|
110
|
+
return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']
|
|
68
111
|
|
|
69
112
|
def submitJob(self, subLine):
|
|
70
113
|
try:
|
|
@@ -92,15 +135,15 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
92
135
|
logger.debug("sbatch submitted job %d", result)
|
|
93
136
|
return result
|
|
94
137
|
except OSError as e:
|
|
95
|
-
logger.error("sbatch command failed")
|
|
138
|
+
logger.error(f"sbatch command failed with error: {e}")
|
|
96
139
|
raise e
|
|
97
140
|
|
|
98
|
-
def coalesce_job_exit_codes(self, batch_job_id_list: list) ->
|
|
141
|
+
def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
|
|
99
142
|
"""
|
|
100
143
|
Collect all job exit codes in a single call.
|
|
101
144
|
:param batch_job_id_list: list of Job ID strings, where each string has the form
|
|
102
145
|
"<job>[.<task>]".
|
|
103
|
-
:return: list of job exit codes, associated with the list of job IDs.
|
|
146
|
+
:return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
|
|
104
147
|
"""
|
|
105
148
|
logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
|
|
106
149
|
# Convert batch_job_id_list to list of integer job IDs.
|
|
@@ -111,7 +154,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
111
154
|
exit_codes.append(self._get_job_return_code(status))
|
|
112
155
|
return exit_codes
|
|
113
156
|
|
|
114
|
-
def getJobExitCode(self, batchJobID: str) -> int:
|
|
157
|
+
def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
115
158
|
"""
|
|
116
159
|
Get job exit code for given batch job ID.
|
|
117
160
|
:param batchJobID: string of the form "<job>[.<task>]".
|
|
@@ -138,18 +181,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
138
181
|
status_dict = self._getJobDetailsFromScontrol(job_id_list)
|
|
139
182
|
return status_dict
|
|
140
183
|
|
|
141
|
-
def _get_job_return_code(self, status: tuple) ->
|
|
184
|
+
def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
142
185
|
"""
|
|
186
|
+
Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
|
|
187
|
+
|
|
188
|
+
The return code may have already been OR'd with the 128-offset
|
|
189
|
+
Slurm-reported signal.
|
|
190
|
+
|
|
191
|
+
Slurm will report return codes of 0 even if jobs time out instead
|
|
192
|
+
of succeeding:
|
|
193
|
+
|
|
194
|
+
2093597|TIMEOUT|0:0
|
|
195
|
+
2093597.batch|CANCELLED|0:15
|
|
196
|
+
|
|
197
|
+
So we guarantee here that, if the Slurm status string is not a
|
|
198
|
+
successful one as defined in
|
|
199
|
+
<https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
|
|
200
|
+
will not return a successful return code.
|
|
201
|
+
|
|
143
202
|
Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
|
|
144
|
-
:param status: tuple containing the job's state and it's return code.
|
|
145
|
-
:return: the job's return code if it's completed, otherwise None.
|
|
203
|
+
:param status: tuple containing the job's state and its return code from Slurm.
|
|
204
|
+
:return: the job's return code for Toil if it's completed, otherwise None.
|
|
146
205
|
"""
|
|
147
206
|
state, rc = status
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
207
|
+
|
|
208
|
+
if state not in TERMINAL_STATES:
|
|
209
|
+
# Don't treat the job as exited yet
|
|
210
|
+
return None
|
|
211
|
+
|
|
212
|
+
exit_reason = TERMINAL_STATES[state]
|
|
213
|
+
|
|
214
|
+
if exit_reason == BatchJobExitReason.FINISHED:
|
|
215
|
+
# The only state that should produce a 0 ever is COMPLETED. So
|
|
216
|
+
# if the job is COMPLETED and the exit reason is thus FINISHED,
|
|
217
|
+
# pass along the code it has.
|
|
218
|
+
return (rc, exit_reason)
|
|
219
|
+
|
|
220
|
+
if rc == 0:
|
|
221
|
+
# The job claims to be in a state other than COMPLETED, but
|
|
222
|
+
# also to have not encountered a problem. Say the exit status
|
|
223
|
+
# is unavailable.
|
|
224
|
+
return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
|
|
225
|
+
|
|
226
|
+
# If the code is nonzero, pass it along.
|
|
227
|
+
return (rc, exit_reason)
|
|
228
|
+
|
|
229
|
+
def _canonicalize_state(self, state: str) -> str:
|
|
230
|
+
"""
|
|
231
|
+
Turn a state string from Slurm into just the state token, like "CANCELLED".
|
|
232
|
+
"""
|
|
233
|
+
|
|
234
|
+
# Slurm will sometimes send something like "CANCELLED by 30065" in
|
|
235
|
+
# the state column for some reason.
|
|
236
|
+
|
|
237
|
+
state_token = state
|
|
238
|
+
|
|
239
|
+
if " " in state_token:
|
|
240
|
+
state_token = state.split(" ", 1)[0]
|
|
241
|
+
|
|
242
|
+
if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
|
|
243
|
+
raise RuntimeError("Toil job in unimplemented Slurm state " + state)
|
|
244
|
+
|
|
245
|
+
return state_token
|
|
153
246
|
|
|
154
247
|
def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
|
|
155
248
|
"""
|
|
@@ -178,6 +271,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
178
271
|
if len(values) < 3:
|
|
179
272
|
continue
|
|
180
273
|
job_id_raw, state, exitcode = values
|
|
274
|
+
state = self._canonicalize_state(state)
|
|
181
275
|
logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
|
|
182
276
|
# JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
|
|
183
277
|
job_id_parts = job_id_raw.split(".")
|
|
@@ -252,6 +346,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
252
346
|
if job_id not in job_id_list:
|
|
253
347
|
continue
|
|
254
348
|
state = job['JobState']
|
|
349
|
+
state = self._canonicalize_state(state)
|
|
255
350
|
logger.debug("%s state of job %s is %s", args[0], job_id, state)
|
|
256
351
|
try:
|
|
257
352
|
exitcode = job['ExitCode']
|
|
@@ -283,8 +378,26 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
283
378
|
job_environment: Optional[Dict[str, str]],
|
|
284
379
|
gpus: Optional[int]) -> List[str]:
|
|
285
380
|
|
|
286
|
-
|
|
381
|
+
"""
|
|
382
|
+
Returns the sbatch command line to run to queue the job.
|
|
383
|
+
"""
|
|
384
|
+
|
|
385
|
+
# Start by naming the job
|
|
287
386
|
sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
|
|
387
|
+
|
|
388
|
+
# Make sure the job gets a signal before it disappears so that e.g.
|
|
389
|
+
# container cleanup finally blocks can run. Ask for SIGINT so we
|
|
390
|
+
# can get the default Python KeyboardInterrupt which third-party
|
|
391
|
+
# code is likely to plan for. Make sure to send it to the batch
|
|
392
|
+
# shell process with "B:", not to all the srun steps it launches
|
|
393
|
+
# (because there shouldn't be any). We cunningly replaced the batch
|
|
394
|
+
# shell process with the Toil worker process, so Toil should be
|
|
395
|
+
# able to get the signal.
|
|
396
|
+
#
|
|
397
|
+
# TODO: Add a way to detect when the job failed because it
|
|
398
|
+
# responded to this signal and use the right exit reason for it.
|
|
399
|
+
sbatch_line.append("--signal=B:INT@30")
|
|
400
|
+
|
|
288
401
|
if gpus:
|
|
289
402
|
sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
|
|
290
403
|
environment = {}
|
toil/batchSystems/torque.py
CHANGED
|
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
|
|
|
31
31
|
class TorqueBatchSystem(AbstractGridEngineBatchSystem):
|
|
32
32
|
|
|
33
33
|
# class-specific Worker
|
|
34
|
-
class
|
|
34
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
35
35
|
def __init__(
|
|
36
36
|
self, newJobsQueue, updatedJobsQueue, killQueue, killedJobsQueue, boss
|
|
37
37
|
):
|
toil/bus.py
CHANGED
|
@@ -20,7 +20,7 @@ functions to "handle" different things happening. Over time, it has become very
|
|
|
20
20
|
brittle: exactly the right handling functions need to be called in exactly the
|
|
21
21
|
right order, or it gets confused and does the wrong thing.
|
|
22
22
|
|
|
23
|
-
The MessageBus is meant to let the leader avoid this by more
|
|
23
|
+
The MessageBus is meant to let the leader avoid this by more loosely coupling
|
|
24
24
|
its components together, by having them communicate by sending messages instead
|
|
25
25
|
of by calling functions.
|
|
26
26
|
|
|
@@ -87,6 +87,43 @@ from pubsub.core.topicutils import ALL_TOPICS
|
|
|
87
87
|
|
|
88
88
|
logger = logging.getLogger( __name__ )
|
|
89
89
|
|
|
90
|
+
# We define some ways to talk about jobs.
|
|
91
|
+
|
|
92
|
+
class Names(NamedTuple):
|
|
93
|
+
"""
|
|
94
|
+
Stores all the kinds of name a job can have.
|
|
95
|
+
"""
|
|
96
|
+
# Name of the kind of job this is
|
|
97
|
+
job_name: str
|
|
98
|
+
# Name of this particular work unit
|
|
99
|
+
unit_name: str
|
|
100
|
+
# Human-readable name for the job
|
|
101
|
+
display_name: str
|
|
102
|
+
# What the job prints as, used for stats-and-logging log management
|
|
103
|
+
stats_name: str
|
|
104
|
+
# Job store ID of the job for the work unit
|
|
105
|
+
job_store_id: str
|
|
106
|
+
|
|
107
|
+
def get_job_kind(names: Names) -> str:
|
|
108
|
+
"""
|
|
109
|
+
Return an identifying string for the job.
|
|
110
|
+
|
|
111
|
+
The result may contain spaces.
|
|
112
|
+
|
|
113
|
+
Returns: Either the unit name, job name, or display name, which identifies
|
|
114
|
+
the kind of job it is to toil.
|
|
115
|
+
Otherwise "Unknown Job" in case no identifier is available
|
|
116
|
+
"""
|
|
117
|
+
if names.unit_name:
|
|
118
|
+
return names.unit_name
|
|
119
|
+
elif names.job_name:
|
|
120
|
+
return names.job_name
|
|
121
|
+
elif names.display_name:
|
|
122
|
+
return names.display_name
|
|
123
|
+
else:
|
|
124
|
+
return "Unknown Job"
|
|
125
|
+
|
|
126
|
+
|
|
90
127
|
# We define a bunch of named tuple message types.
|
|
91
128
|
# These all need to be plain data: only hold ints, strings, etc.
|
|
92
129
|
|
|
@@ -648,6 +685,7 @@ class JobStatus:
|
|
|
648
685
|
|
|
649
686
|
def __repr__(self) -> str:
|
|
650
687
|
return json.dumps(self, default= lambda o: o.__dict__, indent=4)
|
|
688
|
+
|
|
651
689
|
def replay_message_bus(path: str) -> Dict[str, JobStatus]:
|
|
652
690
|
"""
|
|
653
691
|
Replay all the messages and work out what they mean for jobs.
|
|
@@ -703,12 +741,16 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]:
|
|
|
703
741
|
|
|
704
742
|
return job_statuses
|
|
705
743
|
|
|
706
|
-
def gen_message_bus_path() -> str:
|
|
744
|
+
def gen_message_bus_path(tmpdir: Optional[str] = None) -> str:
|
|
707
745
|
"""
|
|
708
746
|
Return a file path in tmp to store the message bus at.
|
|
709
747
|
Calling function is responsible for cleaning the generated file.
|
|
748
|
+
|
|
749
|
+
The tmpdir argument will override the directory that the
|
|
750
|
+
message bus will be made in. If not provided, the standard tempfile
|
|
751
|
+
default directory search order will be used.
|
|
710
752
|
"""
|
|
711
|
-
fd, path = tempfile.mkstemp()
|
|
753
|
+
fd, path = tempfile.mkstemp(dir=tmpdir)
|
|
712
754
|
os.close(fd)
|
|
713
755
|
return path
|
|
714
756
|
#TODO Might want to clean up the tmpfile at some point after running the workflow
|