toil 6.1.0-py3-none-any.whl → 7.0.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/gridengine.py CHANGED
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 
 class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         """
         Grid Engine-specific AbstractGridEngineWorker methods
         """
toil/batchSystems/htcondor.py CHANGED
@@ -48,7 +48,7 @@ schedd_lock = Lock()
 class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
     # When using HTCondor, the Schedd handles scheduling
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
         # Override the createJobs method so that we can use htcondor.Submit objects
         # and so that we can get disk allocation requests and ceil the CPU request.
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
         return '"' + ' '.join(env_items) + '"'
 
     # Override the issueBatchJob method so HTCondor can be given the disk request
-    def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
         # Avoid submitting internal jobs to the batch queue, handle locally
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
         else:
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
             self.currentJobs.add(jobID)
 
             # Construct our style of job tuple
-            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, jobNode.command,
+            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
                                    job_environment or {}, jobNode.accelerators))
-            logger.debug("Issued the job command: %s with job id: %s ", jobNode.command, str(jobID))
+            logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
             return jobID
toil/batchSystems/kubernetes.py CHANGED
@@ -47,6 +47,8 @@ from typing import (Any,
                     cast,
                     overload)
 
+from toil.lib.conversions import opt_strtobool
+
 if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec
 else:
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
                                V1SecretVolumeSource,
                                V1Toleration,
                                V1Volume,
-                               V1VolumeMount)
+                               V1VolumeMount, V1SecurityContext)
 from kubernetes.client.api_client import ApiClient
 from kubernetes.client.exceptions import ApiException
 from kubernetes.config.config_exception import ConfigException
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
     def _create_pod_spec(
             self,
+            command: str,
             job_desc: JobDescription,
             job_environment: Optional[Dict[str, str]] = None
     ) -> V1PodSpec:
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             environment.update(job_environment)
 
         # Make a command to run it in the executor
-        command_list = pack_job(job_desc, self.user_script, environment=environment)
+        command_list = pack_job(command, self.user_script, environment=environment)
 
         # The Kubernetes API makes sense only in terms of the YAML format. Objects
         # represent sections of the YAML files. Except from our point of view, all
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
         # Make a container definition
         container = V1Container(command=command_list,
-                               image=self.docker_image,
-                               name="runner-container",
-                               resources=resources,
-                               volume_mounts=mounts)
+                                image=self.docker_image,
+                                name="runner-container",
+                                resources=resources,
+                                volume_mounts=mounts)
+
+        # In case security context rules are not allowed to be set, we only apply
+        # a security context at all if we need to turn on privileged mode.
+        if self.config.kubernetes_privileged:
+            container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
+
         # Wrap the container in a spec
         pod_spec = V1PodSpec(containers=[container],
-                            volumes=volumes,
-                            restart_policy="Never")
+                             volumes=volumes,
+                             restart_policy="Never")
         # Tell the spec where to land
         placement.apply(pod_spec)
 
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             self._release_acquired_resources(resources, notify=resource_notify)
             del self._acquired_resources[job_name]
 
-    def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         # Try the job as local
-        localID = self.handleLocalJob(job_desc)
+        localID = self.handleLocalJob(command, job_desc)
         if localID is not None:
             # It is a local job
             return localID
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         self.check_resource_request(job_desc)
 
         # Make a pod that describes running the job
-        pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
+        pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
         # Make a batch system scope job ID
         job_id = self.getNextJobID()
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
                             help="Seconds to wait for a scheduled Kubernetes pod to start running. "
                                  "(default: %(default)s)")
+        parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
+                            help="Whether to ask worker pods to run in privileged mode. This should be used to access "
+                                 "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
+                                 "this is set to True. (default: %(default)s)")
 
     OptionType = TypeVar('OptionType')
     @classmethod
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         setOption("kubernetes_owner")
         setOption("kubernetes_service_account",)
         setOption("kubernetes_pod_timeout")
+        setOption("kubernetes_privileged")
 
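Note: the new --kubernetesPrivileged option above is wired through setOption into config.kubernetes_privileged, which _create_pod_spec consults when building the runner container. A minimal standalone sketch of the same pattern, using the official kubernetes Python client (only the client classes are real; the command, image name, and flag value are placeholders):

    from kubernetes.client import V1Container, V1PodSpec, V1SecurityContext

    privileged = True  # stands in for config.kubernetes_privileged

    container = V1Container(command=["_toil_worker", "..."],  # placeholder command
                            image="example/toil-worker:7.0.0",  # placeholder image
                            name="runner-container")

    # Apply a security context only when privileged mode is actually needed, so
    # clusters that disallow security context settings are otherwise unaffected.
    if privileged:
        container.security_context = V1SecurityContext(privileged=privileged)

    pod_spec = V1PodSpec(containers=[container], restart_policy="Never")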
toil/batchSystems/local_support.py CHANGED
@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
         )
 
-    def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
+    def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
         """
-        To be called by issueBatchJobs.
+        To be called by issueBatchJob.
 
         Returns the jobID if the jobDesc has been submitted to the local queue,
         otherwise returns None
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             # somehow doesn't error whereas just returning the value complains
             # we're returning an Any. TODO: When singleMachine.py typechecks,
             # remove all these extra variables.
-            local_id: int = self.localBatch.issueBatchJob(jobDesc)
+            local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
             return local_id
         else:
             return None
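The command parameter threaded through handleLocalJob here is the same change applied to every issueBatchJob implementation in this release: the command string is now passed explicitly instead of being read from JobDescription.command. A sketch of what a batch system override looks like under the new signature (only the signatures come from this diff; the body is illustrative):

    from typing import Dict, Optional

    from toil.batchSystems.local_support import BatchSystemLocalSupport
    from toil.job import JobDescription

    class ExampleBatchSystem(BatchSystemLocalSupport):
        def issueBatchJob(self, command: str, job_desc: JobDescription,
                          job_environment: Optional[Dict[str, str]] = None) -> int:
            # Local/internal jobs short-circuit here, taking the command with them.
            local_id = self.handleLocalJob(command, job_desc)
            if local_id is not None:
                return local_id
            # ... otherwise submit `command` to the real scheduler and return an ID ...
            raise NotImplementedError("illustrative sketch only")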
toil/batchSystems/lsf.py CHANGED
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 
 class LSFBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
-        """LSF specific AbstractGridEngineWorker methods."""
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        """LSF specific GridEngineThread methods."""
 
         def getRunningJobIDs(self):
             times = {}
toil/batchSystems/mesos/batchSystem.py CHANGED
@@ -174,13 +174,13 @@ class MesosBatchSystem(BatchSystemLocalSupport,
     def unignoreNode(self, nodeAddress):
         self.ignoredNodes.remove(nodeAddress)
 
-    def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
         """
         Issues the following command returning a unique jobID. Command is the string to run, memory
         is an int giving the number of bytes the job needs to run in and cores is the number of cpus
         needed for the job and error-file is the path of the file to place any std-err/std-out in.
         """
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
 
@@ -200,12 +200,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
         job = ToilJob(jobID=jobID,
                       name=str(jobNode),
                       resources=MesosShape(wallTime=0, **mesos_resources),
-                      command=jobNode.command,
+                      command=command,
                       userScript=self.userScript,
                       environment=environment,
                       workerCleanupInfo=self.workerCleanupInfo)
         jobType = job.resources
-        log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
+        log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
 
         # TODO: round all elements of resources
 
toil/batchSystems/mesos/executor.py CHANGED
@@ -196,12 +196,13 @@ class MesosExecutor(Executor):
         """
         if job.userScript:
             job.userScript.register()
-        log.debug("Invoking command: '%s'", job.command)
+        command = job.command
+        log.debug("Invoking command: '%s'", command)
         # Construct the job's environment
         jobEnv = dict(os.environ, **job.environment)
         log.debug('Using environment variables: %s', jobEnv.keys())
         with self.popenLock:
-            return subprocess.Popen(job.command,
+            return subprocess.Popen(command,
                                     preexec_fn=lambda: os.setpgrp(),
                                     shell=True, env=jobEnv)
 
toil/batchSystems/options.py CHANGED
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSette
     set_option("manualMemArgs")
     set_option("run_local_jobs_on_workers")
     set_option("statePollingWait")
+    set_option("state_polling_timeout")
     set_option("batch_logs_dir")
 
 
@@ -164,6 +165,14 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
              "Return cached results if within the waiting period. Only works for grid "
              "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
     )
+    parser.add_argument(
+        "--statePollingTimeout",
+        dest="state_polling_timeout",
+        type=int,
+        default=1200,
+        help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
+             "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf."
+    )
     parser.add_argument(
         "--batchLogsDir",
         dest="batch_logs_dir",
toil/batchSystems/singleMachine.py CHANGED
@@ -475,17 +475,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
             # We can actually run in this thread
             jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:4]  # Parse command
             jobStore = Toil.resumeJobStore(jobStoreLocator)
-            toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
-                                     redirectOutputToLogFile=not self.debugWorker)  # Call the worker
+            statusCode = toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
+                                                  redirect_output_to_log_file=not self.debugWorker)  # Call the worker
         else:
             # Run synchronously. If starting or running the command fails, let the exception stop us.
-            subprocess.check_call(jobCommand,
+            statusCode = subprocess.check_call(jobCommand,
                                   shell=True,
                                   env=dict(os.environ, **environment))
 
         self.runningJobs.pop(jobID)
         if not info.killIntended:
-            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None))
+            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=statusCode, wallTime=time.time() - info.time, exitReason=None))
 
     def getSchedulingStatusMessage(self):
         # Implement the abstractBatchSystem's scheduling status message API
@@ -655,6 +655,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         # and all its children together. We assume that the
         # process group ID will equal the PID of the process we
         # are starting.
+        logger.debug("Attempting to run job command: %s", jobCommand)
         popen = subprocess.Popen(jobCommand,
                                  shell=True,
                                  env=child_environment,
@@ -743,24 +744,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):
 
         logger.debug('Child %d for job %s succeeded', pid, jobID)
 
-    def issueBatchJob(self, jobDesc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         """Adds the command and resources to a queue to be run."""
 
         self._checkOnDaddy()
 
         # Apply scale in cores
-        scaled_desc = jobDesc.scale('cores', self.scale)
+        scaled_desc = job_desc.scale('cores', self.scale)
         # Round cores up to multiples of minCores
         scaled_desc.cores = max(math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores)
 
         # Don't do our own assertions about job size vs. our configured size.
         # The abstract batch system can handle it.
         self.check_resource_request(scaled_desc)
-        logger.debug(f"Issuing the command: {jobDesc.command} with {scaled_desc.requirements_string()}")
+        logger.debug(f"Issuing the command: {command} with {scaled_desc.requirements_string()}")
         with self.jobIndexLock:
             jobID = self.jobIndex
             self.jobIndex += 1
-            self.jobs[jobID] = jobDesc.command
+            self.jobs[jobID] = command
 
         environment = self.environment.copy()
         if job_environment:
@@ -769,10 +770,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
         if self.debugWorker:
             # Run immediately, blocking for return.
             # Ignore resource requirements; we run one job at a time
-            self._runDebugJob(jobDesc.command, jobID, environment)
+            self._runDebugJob(command, jobID, environment)
         else:
             # Queue the job for later
-            self.inputQueue.put((jobDesc.command, jobID, scaled_desc.cores, scaled_desc.memory,
+            self.inputQueue.put((command, jobID, scaled_desc.cores, scaled_desc.memory,
                                  scaled_desc.disk, scaled_desc.accelerators, environment))
 
         return jobID
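Both branches above now capture an exit status and report it through UpdatedBatchJobInfo in place of the previous hard-coded exitStatus=0. One caveat worth noting: subprocess.check_call returns 0 on success and raises CalledProcessError on failure, so a nonzero status surfaces as an exception rather than a return value. A standalone sketch (illustrative, not Toil's code) of capturing either outcome:

    import subprocess

    def run_job_command(job_command: str) -> int:
        # Return the command's real exit status, whether check_call
        # returns (always 0) or raises CalledProcessError.
        try:
            return subprocess.check_call(job_command, shell=True)
        except subprocess.CalledProcessError as e:
            return e.returncode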
toil/batchSystems/slurm.py CHANGED
@@ -16,9 +16,9 @@ import math
 import os
 from argparse import ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Dict, List, Optional, Set, Tuple, TypeVar, Union
 
-from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE, InsufficientSystemResources
 from toil.batchSystems.abstractGridEngineBatchSystem import \
     AbstractGridEngineBatchSystem
 from toil.batchSystems.options import OptionSetter
@@ -27,10 +27,50 @@ from toil.lib.misc import CalledProcessErrorStderr, call_command
 
 logger = logging.getLogger(__name__)
 
+# We have a complete list of Slurm states. States not in one of these aren't
+# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>
+
+# If a job is in one of these states, Slurm can't run it anymore.
+# We don't include states where the job is held or paused here;
+# those mean it could run and needs to wait for someone to un-hold
+# it, so Toil should wait for it.
+#
+# We map from each terminal state to the Toil-ontology exit reason.
+TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
+    "BOOT_FAIL": BatchJobExitReason.LOST,
+    "CANCELLED": BatchJobExitReason.KILLED,
+    "COMPLETED": BatchJobExitReason.FINISHED,
+    "DEADLINE": BatchJobExitReason.KILLED,
+    "FAILED": BatchJobExitReason.FAILED,
+    "NODE_FAIL": BatchJobExitReason.LOST,
+    "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+    "PREEMPTED": BatchJobExitReason.KILLED,
+    "REVOKED": BatchJobExitReason.KILLED,
+    "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+    "TIMEOUT": BatchJobExitReason.KILLED
+}
+
+# If a job is in one of these states, it might eventually move to a different
+# state.
+NONTERMINAL_STATES: Set[str] = {
+    "CONFIGURING",
+    "COMPLETING",
+    "PENDING",
+    "RUNNING",
+    "RESV_DEL_HOLD",
+    "REQUEUE_FED",
+    "REQUEUE_HOLD",
+    "REQUEUED",
+    "RESIZING",
+    "SIGNALING",
+    "STAGE_OUT",
+    "STOPPED",
+    "SUSPENDED"
+}
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
 
         def getRunningJobIDs(self):
             # Should return a dictionary of Job IDs and number of seconds
@@ -95,7 +135,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 logger.debug("sbatch submitted job %d", result)
                 return result
             except OSError as e:
-                logger.error("sbatch command failed")
+                logger.error(f"sbatch command failed with error: {e}")
                 raise e
 
         def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
@@ -165,24 +205,6 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             """
             state, rc = status
 
-            # If a job is in one of these states, Slurm can't run it anymore.
-            # We don't include states where the job is held or paused here;
-            # those mean it could run and needs to wait for someone to un-hold
-            # it, so Toil should wait for it.
-            #
-            # We map from each terminal state to the Toil-ontology exit reason.
-            TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
-                "BOOT_FAIL": BatchJobExitReason.LOST,
-                "CANCELLED": BatchJobExitReason.KILLED,
-                "COMPLETED": BatchJobExitReason.FINISHED,
-                "DEADLINE": BatchJobExitReason.KILLED,
-                "FAILED": BatchJobExitReason.FAILED,
-                "NODE_FAIL": BatchJobExitReason.LOST,
-                "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
-                "PREEMPTED": BatchJobExitReason.KILLED,
-                "TIMEOUT": BatchJobExitReason.KILLED
-            }
-
             if state not in TERMINAL_STATES:
                 # Don't treat the job as exited yet
                 return None
@@ -204,6 +226,24 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             # If the code is nonzero, pass it along.
             return (rc, exit_reason)
 
+        def _canonicalize_state(self, state: str) -> str:
+            """
+            Turn a state string from Slurm into just the state token like "CANCELED".
+            """
+
+            # Slurm will sometimes send something like "CANCELED by 30065" in
+            # the state column for some reason.
+
+            state_token = state
+
+            if " " in state_token:
+                state_token = state.split(" ", 1)[0]
+
+            if state_token not in TERMINAL_STATES and state_token not in NONTERMINAL_STATES:
+                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+            return state_token
+
         def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
             """
             Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
@@ -231,6 +271,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if len(values) < 3:
                     continue
                 job_id_raw, state, exitcode = values
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id_raw, state)
                 # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                 job_id_parts = job_id_raw.split(".")
@@ -305,6 +346,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 if job_id not in job_id_list:
                     continue
                 state = job['JobState']
+                state = self._canonicalize_state(state)
                 logger.debug("%s state of job %s is %s", args[0], job_id, state)
                 try:
                     exitcode = job['ExitCode']
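_canonicalize_state exists because Slurm sometimes reports a state with a qualifier appended, such as "CANCELLED by 30065"; only the first token is matched against the module-level state tables added above, and anything outside both tables is treated as an error. A standalone, simplified version of that classification (state names copied from the tables above):

    TERMINAL = {"BOOT_FAIL", "CANCELLED", "COMPLETED", "DEADLINE", "FAILED",
                "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED", "REVOKED",
                "SPECIAL_EXIT", "TIMEOUT"}
    NONTERMINAL = {"CONFIGURING", "COMPLETING", "PENDING", "RUNNING",
                   "RESV_DEL_HOLD", "REQUEUE_FED", "REQUEUE_HOLD", "REQUEUED",
                   "RESIZING", "SIGNALING", "STAGE_OUT", "STOPPED", "SUSPENDED"}

    def canonicalize_state(state: str) -> str:
        # "CANCELLED by 30065" -> "CANCELLED"
        token = state.split(" ", 1)[0]
        if token not in TERMINAL and token not in NONTERMINAL:
            raise RuntimeError("Unrecognized Slurm state " + state)
        return token

    assert canonicalize_state("CANCELLED by 30065") == "CANCELLED"
    assert canonicalize_state("RUNNING") == "RUNNING"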
toil/batchSystems/torque.py CHANGED
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class TorqueBatchSystem(AbstractGridEngineBatchSystem):
 
     # class-specific Worker
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         def __init__(
             self, newJobsQueue, updatedJobsQueue, killQueue, killedJobsQueue, boss
         ):
toil/bus.py CHANGED
@@ -20,7 +20,7 @@ functions to "handle" different things happening. Over time, it has become very
 brittle: exactly the right handling functions need to be called in exactly the
 right order, or it gets confused and does the wrong thing.
 
-The MessageBus is meant to let the leader avoid this by more losely coupling
+The MessageBus is meant to let the leader avoid this by more loosely coupling
 its components together, by having them communicate by sending messages instead
 of by calling functions.
 
@@ -741,12 +741,16 @@ def replay_message_bus(path: str) -> Dict[str, JobStatus]:
 
     return job_statuses
 
-def gen_message_bus_path() -> str:
+def gen_message_bus_path(tmpdir: Optional[str] = None) -> str:
     """
     Return a file path in tmp to store the message bus at.
     Calling function is responsible for cleaning the generated file.
+
+    The tmpdir argument will override the directory that the
+    message bus will be made in. If not provided, the standard tempfile
+    order will be used.
     """
-    fd, path = tempfile.mkstemp()
+    fd, path = tempfile.mkstemp(dir=tmpdir)
     os.close(fd)
     return path
     #TODO Might want to clean up the tmpfile at some point after running the workflow
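With the new tmpdir parameter, callers can keep the message bus file inside a directory they manage; passing nothing preserves the old behavior of letting tempfile.mkstemp choose the location. A usage sketch (the directory path is a placeholder; replay_message_bus is the function shown in the hunk header above):

    import os
    from toil.bus import gen_message_bus_path, replay_message_bus

    path = gen_message_bus_path(tmpdir="/tmp/my-toil-run")  # placeholder directory
    try:
        # ... run a workflow configured to log its message bus to `path` ...
        statuses = replay_message_bus(path)
    finally:
        os.remove(path)  # the caller is responsible for cleanup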