toil 6.1.0-py3-none-any.whl → 7.0.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/gridengine.py CHANGED
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 
 class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         """
         Grid Engine-specific AbstractGridEngineWorker methods
         """
toil/batchSystems/htcondor.py CHANGED
@@ -48,7 +48,7 @@ schedd_lock = Lock()
 class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
     # When using HTCondor, the Schedd handles scheduling
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
         # Override the createJobs method so that we can use htcondor.Submit objects
         # and so that we can get disk allocation requests and ceil the CPU request.
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
         return '"' + ' '.join(env_items) + '"'
 
     # Override the issueBatchJob method so HTCondor can be given the disk request
-    def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
         # Avoid submitting internal jobs to the batch queue, handle locally
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
         else:
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
             self.currentJobs.add(jobID)
 
             # Construct our style of job tuple
-            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, jobNode.command,
+            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
                                    job_environment or {}, jobNode.accelerators))
-            logger.debug("Issued the job command: %s with job id: %s ", jobNode.command, str(jobID))
+            logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
             return jobID
toil/batchSystems/kubernetes.py CHANGED
@@ -47,6 +47,8 @@ from typing import (Any,
                     cast,
                     overload)
 
+from toil.lib.conversions import opt_strtobool
+
 if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec
 else:
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
                                V1SecretVolumeSource,
                                V1Toleration,
                                V1Volume,
-                               V1VolumeMount)
+                               V1VolumeMount, V1SecurityContext)
 from kubernetes.client.api_client import ApiClient
 from kubernetes.client.exceptions import ApiException
 from kubernetes.config.config_exception import ConfigException
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
     def _create_pod_spec(
             self,
+            command: str,
             job_desc: JobDescription,
             job_environment: Optional[Dict[str, str]] = None
     ) -> V1PodSpec:
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             environment.update(job_environment)
 
         # Make a command to run it in the executor
-        command_list = pack_job(job_desc, self.user_script, environment=environment)
+        command_list = pack_job(command, self.user_script, environment=environment)
 
         # The Kubernetes API makes sense only in terms of the YAML format. Objects
         # represent sections of the YAML files. Except from our point of view, all
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
         # Make a container definition
         container = V1Container(command=command_list,
-                               image=self.docker_image,
-                               name="runner-container",
-                               resources=resources,
-                               volume_mounts=mounts)
+                                image=self.docker_image,
+                                name="runner-container",
+                                resources=resources,
+                                volume_mounts=mounts)
+
+        # In case security context rules are not allowed to be set, we only apply
+        # a security context at all if we need to turn on privileged mode.
+        if self.config.kubernetes_privileged:
+            container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
+
         # Wrap the container in a spec
         pod_spec = V1PodSpec(containers=[container],
-                            volumes=volumes,
-                            restart_policy="Never")
+                             volumes=volumes,
+                             restart_policy="Never")
         # Tell the spec where to land
         placement.apply(pod_spec)
 
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             self._release_acquired_resources(resources, notify=resource_notify)
             del self._acquired_resources[job_name]
 
-    def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         # Try the job as local
-        localID = self.handleLocalJob(job_desc)
+        localID = self.handleLocalJob(command, job_desc)
         if localID is not None:
             # It is a local job
             return localID
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         self.check_resource_request(job_desc)
 
         # Make a pod that describes running the job
-        pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
+        pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
         # Make a batch system scope job ID
         job_id = self.getNextJobID()
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
                             help="Seconds to wait for a scheduled Kubernetes pod to start running. "
                                  "(default: %(default)s)")
+        parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
+                            help="Whether to ask worker pods to run in privileged mode. This should be used to access "
+                                 "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
+                                 "this is set to True. (default: %(default)s)")
 
     OptionType = TypeVar('OptionType')
     @classmethod
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         setOption("kubernetes_owner")
         setOption("kubernetes_service_account",)
         setOption("kubernetes_pod_timeout")
+        setOption("kubernetes_privileged")
 
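Note: the new --kubernetesPrivileged option above is wired through setOption into config.kubernetes_privileged, which _create_pod_spec consults when building the runner container. A minimal standalone sketch of the same pattern, using the official kubernetes Python client (only the client classes are real; the command, image name, and flag value are placeholders):

    from kubernetes.client import V1Container, V1PodSpec, V1SecurityContext

    privileged = True  # stands in for config.kubernetes_privileged

    container = V1Container(command=["_toil_worker", "..."],  # placeholder command
                            image="example/toil-worker:7.0.0",  # placeholder image
                            name="runner-container")

    # Apply a security context only when privileged mode is actually needed, so
    # clusters that disallow security context settings are otherwise unaffected.
    if privileged:
        container.security_context = V1SecurityContext(privileged=privileged)

    pod_spec = V1PodSpec(containers=[container], restart_policy="Never")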
toil/batchSystems/local_support.py CHANGED
@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
         )
 
-    def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
+    def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
         """
-        To be called by issueBatchJobs.
+        To be called by issueBatchJob.
 
         Returns the jobID if the jobDesc has been submitted to the local queue,
         otherwise returns None
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             # somehow doesn't error whereas just returning the value complains
             # we're returning an Any. TODO: When singleMachine.py typechecks,
             # remove all these extra variables.
-            local_id: int = self.localBatch.issueBatchJob(jobDesc)
+            local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
             return local_id
         else:
             return None
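The command parameter threaded through handleLocalJob here is the same change applied to every issueBatchJob implementation in this release: the command string is now passed explicitly instead of being read from JobDescription.command. A sketch of what a batch system override looks like under the new signature (only the signatures come from this diff; the body is illustrative):

    from typing import Dict, Optional

    from toil.batchSystems.local_support import BatchSystemLocalSupport
    from toil.job import JobDescription

    class ExampleBatchSystem(BatchSystemLocalSupport):
        def issueBatchJob(self, command: str, job_desc: JobDescription,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # Local/internal jobs short-circuit here, taking the command with them.
            local_id = self.handleLocalJob(command, job_desc)
            if local_id is not None:
                return local_id
            # ... otherwise submit `command` to the real scheduler and return an ID ...
            raise NotImplementedError("illustrative sketch only")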
toil/batchSystems/lsf.py CHANGED
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 
 class LSFBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
-        """LSF specific AbstractGridEngineWorker methods."""
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        """LSF specific GridEngineThread methods."""
 
         def getRunningJobIDs(self):
             times = {}
toil/batchSystems/mesos/batchSystem.py CHANGED
@@ -174,13 +174,13 @@ class MesosBatchSystem(BatchSystemLocalSupport,
     def unignoreNode(self, nodeAddress):
         self.ignoredNodes.remove(nodeAddress)
 
-    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
         """
         Issues the following command returning a unique jobID. Command is the string to run, memory
         is an int giving the number of bytes the job needs to run in and cores is the number of cpus
         needed for the job and error-file is the path of the file to place any std-err/std-out in.
         """
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
 
@@ -200,12 +200,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
         job = ToilJob(jobID=jobID,
                       name=str(jobNode),
                       resources=MesosShape(wallTime=0, **mesos_resources),
-                      command=jobNode.command,
+                      command=command,
                       userScript=self.userScript,
                       environment=environment,
                       workerCleanupInfo=self.workerCleanupInfo)
         jobType = job.resources
-        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
+        log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
 
         # TODO: round all elements of resources
 
toil/batchSystems/mesos/executor.py CHANGED
@@ -196,12 +196,13 @@ class MesosExecutor(Executor):
         """
         if job.userScript:
             job.userScript.register()
-        log.debug("Invoking command: '%s'", job.command)
+        command = job.command
+        log.debug("Invoking command: '%s'", command)
         # Construct the job's environment
         jobEnv = dict(os.environ, **job.environment)
         log.debug('Using environment variables: %s', jobEnv.keys())
         with self.popenLock:
-            return subprocess.Popen(job.command,
+            return subprocess.Popen(command,
                                     preexec_fn=lambda: os.setpgrp(),
                                     shell=True, env=jobEnv)
 
toil/batchSystems/options.py CHANGED
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSette
     set_option("manualMemArgs")
     set_option("run_local_jobs_on_workers")
     set_option("statePollingWait")
+    set_option("state_polling_timeout")
     set_option("batch_logs_dir")
 
 
@@ -164,6 +165,14 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
              "Return cached results if within the waiting period. Only works for grid "
              "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
     )
+    parser.add_argument(
+        "--statePollingTimeout",
+        dest="state_polling_timeout",
+        type=int,
+        default=1200,
+        help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
+             "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
+    )
     parser.add_argument(
         "--batchLogsDir",
         dest="batch_logs_dir",
toil/batchSystems/singleMachine.py CHANGED
@@ -475,17 +475,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
             # We can actually run in this thread
             jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4]  # Parse command
             jobStore = Toil.resumeJobStore(jobStoreLocator)
-            toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
-                                     redirectOutputToLogFile=not self.debugWorker)  # Call the worker
+            statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
+                                                  redirect_output_to_log_file=not self.debugWorker)  # Call the worker
         else:
             # Run synchronously. If starting or running the command fails, let the exception stop us.
-            subprocess.check_call(jobCommand,
+            statusCode = subprocess.check_call(jobCommand,
                                   shell=True,
                                   env=dict(os.environ, **environment))
 
         self.runningJobs.pop(jobID)
         if not info.killIntended:
-            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None))
+            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None))
 
     def getSchedulingStatusMessage(self):
         # Implement the abstractBatchSystem's scheduling status message API
@@ -655,6 +655,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         # and all its children together. We assume that the
         # process group ID will equal the PID of the process we
         # are starting.
+        logger.debug("Attempting to run job command: %s", jobCommand)
         popen = subprocess.Popen(jobCommand,
                                  shell=True,
                                  env=child_environment,
@@ -743,24 +744,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):
 
         logger.debug('Child %d for job %s succeeded', pid, jobID)
 
-    def issueBatchJob(self, jobDesc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         """Adds the command and resources to a queue to be run."""
 
         self._checkOnDaddy()
 
         # Apply scale in cores
-        scaled_desc = jobDesc.scale('cores', self.scale)
+        scaled_desc = job_desc.scale('cores', self.scale)
         # Round cores up to multiples of minCores
         scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores)
 
         # Don't do our own assertions about job size vs. our configured size.
         # The abstract batch system can handle it.
         self.check_resource_request(scaled_desc)
-        logger.debug(f"Issuing the command: {jobDesc.command} with {scaled_desc.requirements_string()}")
+        logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}")
         with self.jobIndexLock:
             jobID = self.jobIndex
             self.jobIndex += 1
-            self.jobs[jobID] = jobDesc.command
+            self.jobs[jobID] = command
 
         environment = self.environment.copy()
         if job_environment:
@@ -769,10 +770,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         if self.debugWorker:
             # Run immediately, blocking for return.
             # Ignore resource requirements; we run one job at a time
-            self._runDebugJob(jobDesc.command, jobID, environment)
+            self._runDebugJob(command, jobID, environment)
         else:
             # Queue the job for later
-            self.inputQueue.put((jobDesc.command, jobID, scaled_desc.cores, scaled_desc.memory,
+            self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory,
                                  scaled_desc.disk, scaled_desc.accelerators, environment))
 
         return jobID
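Both branches above now capture an exit status and report it through UpdatedBatchJobInfo in place of the previous hard-coded exitStatus=0. One caveat worth noting: subprocess.check_call returns 0 on success and raises CalledProcessError on failure, so a nonzero status surfaces as an exception rather than a return value. A standalone sketch (illustrative, not Toil's code) of capturing either outcome:

    import subprocess

    def run_job_command(job_command: str) -> int:
        # Return the command's real exit status, whether check_call
        # returns (always 0) or raises CalledProcessError.
        try:
            return subprocess.check_call(job_command, shell=True)
        except subprocess.CalledProcessError as e:
            return e.returncode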
toil/batchSystems/slurm.py CHANGED
@@ -16,9 +16,9 @@ import math
 import os
 from argparse import ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
 
-from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
 from toil.batchSystems.abstractGridEngineBatchSystem import \
     AbstractGridEngineBatchSystem
 from toil.batchSystems.options import OptionSetter
@@ -27,10 +27,50 @@ from toil.lib.misc import CalledProcessErrorStderr, call_command
 
 logger = logging.getLogger(__name__)
 
+# We have a complete list of Slurm states. States not in one of these aren't
+# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>
+
+# If a job is in one of these states, Slurm can't run it anymore.
+# We don't include states where the job is held or paused here;
+# those mean it could run and needs to wait for someone to un-hold
+# it, so Toil should wait for it.
+#
+# We map from each terminal state to the Toil-ontology exit reason.
+TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+    "BOOT_FAIL": BatchJobExitReason.LOST,
+    "CANCELLED": BatchJobExitReason.KILLED,
+    "COMPLETED": BatchJobExitReason.FINISHED,
+    "DEADLINE": BatchJobExitReason.KILLED,
+    "FAILED": BatchJobExitReason.FAILED,
+    "NODE_FAIL": BatchJobExitReason.LOST,
+    "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+    "PREEMPTED": BatchJobExitReason.KILLED,
+    "REVOKED": BatchJobExitReason.KILLED,
+    "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+    "TIMEOUT": BatchJobExitReason.KILLED
+}
+
+# If a job is in one of these states, it might eventually move to a different
+# state.
+NONTERMINAL_STATES: Set[str] = {
+    "CONFIGURING",
+    "COMPLETING",
+    "PENDING",
+    "RUNNING",
+    "RESV_DEL_HOLD",
+    "REQUEUE_FED",
+    "REQUEUE_HOLD",
+    "REQUEUED",
+    "RESIZING",
+    "SIGNALING",
+    "STAGE_OUT",
+    "STOPPED",
+    "SUSPENDED"
+}
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
         def getRunningJobIDs(self):
             # Should return a dictionary of Job IDs and number of seconds
@@ -95,7 +135,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 logger.debug("sbatch submitted job %d", result)
                 return result
             except OSError as e:
-                logger.error("sbatch command failed")
+                logger.error(f"sbatch command failed with error: {e}")
                 raise e
 
         def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
@@ -165,24 +205,6 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             """
             state, rc = status
 
-            # If a job is in one of these states, Slurm can't run it anymore.
-            # We don't include states where the job is held or paused here;
-            # those mean it could run and needs to wait for someone to un-hold
-            # it, so Toil should wait for it.
-            #
-            # We map from each terminal state to the Toil-ontology exit reason.
-            TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
-                "BOOT_FAIL": BatchJobExitReason.LOST,
-                "CANCELLED": BatchJobExitReason.KILLED,
-                "COMPLETED": BatchJobExitReason.FINISHED,
-                "DEADLINE": BatchJobExitReason.KILLED,
-                "FAILED": BatchJobExitReason.FAILED,
-                "NODE_FAIL": BatchJobExitReason.LOST,
-                "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
-                "PREEMPTED": BatchJobExitReason.KILLED,
-                "TIMEOUT": BatchJobExitReason.KILLED
-            }
-
             if state not in TERMINAL_STATES:
                 # Don't treat the job as exited yet
                 return None
@@ -204,6 +226,24 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             # If the code is nonzero, pass it along.
             return (rc, exit_reason)
 
+        def _canonicalize_state(self, state: str) -> str:
+            """
+            Turn a state string from Slurm into just the state token like "CANCELED".
+            """
+
+            # Slurm will sometimes send something like "CANCELED by 30065" in
+            # the state column for some reason.
+
+            state_token = state
+
+            if " " in state_token:
+                state_token = state.split(" ", 1)[0]
+
+            if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
+                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+            return state_token
+
         def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
             """
             Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
@@ -231,6 +271,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if len(values) < 3:
                     continue
                 job_id_raw, state, exitcode = values
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
                 # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                 job_id_parts = job_id_raw.split(".")
@@ -305,6 +346,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if job_id not in job_id_list:
                     continue
                 state = job['JobState']
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id, state)
                 try:
                     exitcode = job['ExitCode']
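_canonicalize_state exists because Slurm sometimes reports a state with a qualifier appended, such as "CANCELLED by 30065"; only the first token is matched against the module-level state tables added above, and anything outside both tables is treated as an error. A standalone, simplified version of that classification (state names copied from the tables above):

    TERMINAL = {"BOOT_FAIL", "CANCELLED", "COMPLETED", "DEADLINE", "FAILED",
                "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED", "REVOKED",
                "SPECIAL_EXIT", "TIMEOUT"}
    NONTERMINAL = {"CONFIGURING", "COMPLETING", "PENDING", "RUNNING",
                   "RESV_DEL_HOLD", "REQUEUE_FED", "REQUEUE_HOLD", "REQUEUED",
                   "RESIZING", "SIGNALING", "STAGE_OUT", "STOPPED", "SUSPENDED"}

    def canonicalize_state(state: str) -> str:
        # "CANCELLED by 30065" -> "CANCELLED"
        token = state.split(" ", 1)[0]
        if token not in TERMINAL and token not in NONTERMINAL:
            raise RuntimeError("Unrecognized Slurm state " + state)
        return token

    assert canonicalize_state("CANCELLED by 30065") == "CANCELLED"
    assert canonicalize_state("RUNNING") == "RUNNING"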
toil/batchSystems/torque.py CHANGED
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class TorqueBatchSystem(AbstractGridEngineBatchSystem):
 
     # class-specific Worker
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         def __init__(
             self, newJobsQueue, updatedJobsQueue, killQueue, killedJobsQueue, boss
         ):
toil/bus.py CHANGED
@@ -20,7 +20,7 @@ functions to "handle" different things happening. Over time, it has become very
 brittle: exactly the right handling functions need to be called in exactly the
 right order, or it gets confused and does the wrong thing.
 
-The MessageBus is meant to let the leader avoid this by more losely coupling
+The MessageBus is meant to let the leader avoid this by more loosely coupling
 its components together, by having them communicate by sending messages instead
 of by calling functions.
 
@@ -741,12 +741,16 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]:
 
     return job_statuses
 
-def gen_message_bus_path() -> str:
+def gen_message_bus_path(tmpdir: Optional[str] = None) -> str:
     """
     Return a file path in tmp to store the message bus at.
     Calling function is responsible for cleaning the generated file.
+
+    The tmpdir argument will override the directory that the
+    message bus will be made in. If not provided, the standard tempfile
+    order will be used.
     """
-    fd, path = tempfile.mkstemp()
+    fd, path = tempfile.mkstemp(dir=tmpdir)
     os.close(fd)
     return path
     #TODO Might want to clean up the tmpfile at some point after running the workflow
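With the new tmpdir parameter, callers can keep the message bus file inside a directory they manage; passing nothing preserves the old behavior of letting tempfile.mkstemp choose the location. A usage sketch (the directory path is a placeholder; replay_message_bus is the function shown in the hunk header above):

    import os
    from toil.bus import gen_message_bus_path, replay_message_bus

    path = gen_message_bus_path(tmpdir="/tmp/my-toil-run")  # placeholder directory
    try:
        # ... run a workflow configured to log its message bus to `path` ...
        statuses = replay_message_bus(path)
    finally:
        os.remove(path)  # the caller is responsible for cleanup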