toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/slurm.py
CHANGED
@@ -11,62 +11,260 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+
 import logging
 import math
 import os
-
+import sys
+from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
 from shlex import quote
-from typing import
-
-from toil.batchSystems.
-
+from typing import NamedTuple, TypeVar
+
+from toil.batchSystems.abstractBatchSystem import (
+    EXIT_STATUS_UNAVAILABLE_VALUE,
+    BatchJobExitReason,
+    InsufficientSystemResources,
+)
+from toil.batchSystems.abstractGridEngineBatchSystem import (
+    AbstractGridEngineBatchSystem,
+)
 from toil.batchSystems.options import OptionSetter
-from toil.
+from toil.bus import get_job_kind
+from toil.common import Config
+from toil.job import JobDescription, Requirer
+from toil.lib.conversions import strtobool
 from toil.lib.misc import CalledProcessErrorStderr, call_command
+from toil.statsAndLogging import TRACE

 logger = logging.getLogger(__name__)

+# We have a complete list of Slurm states. States not in one of these aren't
+# allowed. See <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>

-
+# If a job is in one of these states, Slurm can't run it anymore.
+# We don't include states where the job is held or paused here;
+# those mean it could run and needs to wait for someone to un-hold
+# it, so Toil should wait for it.
+#
+# We map from each terminal state to the Toil-ontology exit reason.
+TERMINAL_STATES: dict[str, BatchJobExitReason] = {
+    "BOOT_FAIL": BatchJobExitReason.LOST,
+    "CANCELLED": BatchJobExitReason.KILLED,
+    "COMPLETED": BatchJobExitReason.FINISHED,
+    "DEADLINE": BatchJobExitReason.KILLED,
+    "FAILED": BatchJobExitReason.FAILED,
+    "NODE_FAIL": BatchJobExitReason.LOST,
+    "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
+    "PREEMPTED": BatchJobExitReason.KILLED,
+    "REVOKED": BatchJobExitReason.KILLED,
+    "SPECIAL_EXIT": BatchJobExitReason.FAILED,
+    "TIMEOUT": BatchJobExitReason.KILLED,
+}
+
+# If a job is in one of these states, it might eventually move to a different
+# state.
+NONTERMINAL_STATES: set[str] = {
+    "CONFIGURING",
+    "COMPLETING",
+    "PENDING",
+    "RUNNING",
+    "RESV_DEL_HOLD",
+    "REQUEUE_FED",
+    "REQUEUE_HOLD",
+    "REQUEUED",
+    "RESIZING",
+    "SIGNALING",
+    "STAGE_OUT",
+    "STOPPED",
+    "SUSPENDED",
+}
+
+
+def parse_slurm_time(slurm_time: str) -> int:
+    """
+    Parse a Slurm-style time duration like 7-00:00:00 to a number of seconds.
+
+    Raises ValueError if not parseable.
+    """
+    # slurm returns time in days-hours:minutes:seconds format
+    # Sometimes it will only return minutes:seconds, so days may be omitted
+    # For ease of calculating, we'll make sure all the delimeters are ':'
+    # Then reverse the list so that we're always counting up from seconds -> minutes -> hours -> days
+    total_seconds = 0
+    elapsed_split: list[str] = slurm_time.replace("-", ":").split(":")
+    elapsed_split.reverse()
+    seconds_per_unit = [1, 60, 3600, 86400]
+    for index, multiplier in enumerate(seconds_per_unit):
+        if index < len(elapsed_split):
+            total_seconds += multiplier * int(elapsed_split[index])
+    return total_seconds

-    class Worker(AbstractGridEngineBatchSystem.Worker):

-
+class SlurmBatchSystem(AbstractGridEngineBatchSystem):
+    class PartitionInfo(NamedTuple):
+        partition_name: str
+        gres: bool
+        time_limit: float
+        priority: int
+        cpus: str
+        memory: str
+
+    class PartitionSet:
+        """
+        Set of available partitions detected on the slurm batch system
+        """
+
+        default_gpu_partition: SlurmBatchSystem.PartitionInfo | None
+        all_partitions: list[SlurmBatchSystem.PartitionInfo]
+        gpu_partitions: set[str]
+
+        def __init__(self) -> None:
+            self._get_partition_info()
+            self._get_gpu_partitions()
+
+        def _get_gpu_partitions(self) -> None:
+            """
+            Get all available GPU partitions. Also get the default GPU partition.
+            :return: None
+            """
+            gpu_partitions = [
+                partition for partition in self.all_partitions if partition.gres
+            ]
+            self.gpu_partitions = {p.partition_name for p in gpu_partitions}
+            # Grab the lowest priority GPU partition
+            # If no GPU partitions are available, then set the default to None
+            self.default_gpu_partition = None
+            if len(gpu_partitions) > 0:
+                self.default_gpu_partition = sorted(
+                    gpu_partitions, key=lambda x: x.priority
+                )[0]
+
+        def _get_partition_info(self) -> None:
+            """
+            Call the Slurm batch system with sinfo to grab all available partitions.
+            Then parse the output and store all available Slurm partitions
+            :return: None
+            """
+            sinfo_command = ["sinfo", "-a", "-o", "%P %G %l %p %c %m"]
+
+            sinfo = call_command(sinfo_command)
+
+            parsed_partitions = []
+            for line in sinfo.split("\n")[1:]:
+                if line.strip():
+                    partition_name, gres, time, priority, cpus, memory = line.split(" ")
+                    try:
+                        # Parse time to a number so we can compute on it
+                        partition_time: float = parse_slurm_time(time)
+                    except ValueError:
+                        # Maybe time is unlimited?
+                        partition_time = float("inf")
+                    try:
+                        # Parse priority to an int so we can sort on it
+                        partition_priority = int(priority)
+                    except ValueError:
+                        logger.warning(
+                            "Could not parse priority %s for partition %s, assuming high priority",
+                            partition_name,
+                            priority,
+                        )
+                        partition_priority = sys.maxsize
+                    parsed_partitions.append(
+                        SlurmBatchSystem.PartitionInfo(
+                            partition_name.rstrip("*"),
+                            gres != "(null)",
+                            partition_time,
+                            partition_priority,
+                            cpus,
+                            memory,
+                        )
+                    )
+            self.all_partitions = parsed_partitions
+
+        def get_partition(self, time_limit: float | None) -> str | None:
+            """
+            Get the partition name to use for a job with the given time limit.
+            """
+
+            if time_limit is None:
+                # Just use Slurm's default
+                return None
+
+            winning_partition = None
+            for partition in self.all_partitions:
+                if partition.time_limit >= time_limit and (
+                    winning_partition is None
+                    or partition.time_limit < winning_partition.time_limit
+                ):
+                    # If this partition can fit the job and is faster than the current winner, take it
+                    winning_partition = partition
+            # TODO: Store partitions in a better indexed way
+            if winning_partition is None and len(self.all_partitions) > 0:
+                # We have partitions and none of them can fit this
+                raise RuntimeError(
+                    "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                )
+
+            if winning_partition is None:
+                return None
+            else:
+                return winning_partition.partition_name
+
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        # Our boss is always the enclosing class
+        boss: SlurmBatchSystem
+
+        def getRunningJobIDs(self) -> dict[int, int]:
             # Should return a dictionary of Job IDs and number of seconds
             times = {}
             with self.runningJobsLock:
-                currentjobs
+                currentjobs: dict[str, int] = {
+                    str(self.batchJobIDs[x][0]): x for x in self.runningJobs
+                }
             # currentjobs is a dictionary that maps a slurm job id (string) to our own internal job id
             # squeue arguments:
             # -h for no header
             # --format to get jobid i, state %t and time days-hours:minutes:seconds

-            lines = call_command(
+            lines = call_command(
+                ["squeue", "-h", "--format", "%i %t %M"], quiet=True
+            ).split("\n")
             for line in lines:
                 values = line.split()
                 if len(values) < 3:
                     continue
                 slurm_jobid, state, elapsed_time = values
-                if slurm_jobid in currentjobs and state ==
-
+                if slurm_jobid in currentjobs and state == "R":
+                    try:
+                        seconds_running = parse_slurm_time(elapsed_time)
+                    except ValueError:
+                        # slurm may return INVALID instead of a time
+                        seconds_running = 0
                     times[currentjobs[slurm_jobid]] = seconds_running

             return times

-        def killJob(self, jobID):
-            call_command([
-
-        def prepareSubmission(
-
-
-
-
-
-
-
-
-
-
+        def killJob(self, jobID: int) -> None:
+            call_command(["scancel", self.getBatchSystemID(jobID)])
+
+        def prepareSubmission(
+            self,
+            cpu: int,
+            memory: int,
+            jobID: int,
+            command: str,
+            jobName: str,
+            job_environment: dict[str, str] | None = None,
+            gpus: int | None = None,
+        ) -> list[str]:
+            # Make sure to use exec so we can get Slurm's signals in the Toil
+            # worker instead of having an intervening Bash
+            return self.prepareSbatch(
+                cpu, memory, jobID, jobName, job_environment, gpus
+            ) + [f"--wrap=exec {command}"]
+
+        def submitJob(self, subLine: list[str]) -> int:
             try:
                 # Slurm is not quite clever enough to follow the XDG spec on
                 # its own. If the submission command sees e.g. XDG_RUNTIME_DIR
@@ -82,7 +280,11 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 # This doesn't get us a trustworthy XDG session in Slurm, but
                 # it does let us see the one Slurm tries to give us.
                 no_session_environment = os.environ.copy()
-                session_names = [
+                session_names = [
+                    n
+                    for n in no_session_environment.keys()
+                    if n.startswith("XDG_") or n.startswith("DBUS_")
+                ]
                 for name in session_names:
                     del no_session_environment[name]

@@ -92,39 +294,47 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 logger.debug("sbatch submitted job %d", result)
                 return result
             except OSError as e:
-                logger.error("sbatch command failed")
+                logger.error(f"sbatch command failed with error: {e}")
                 raise e

-        def coalesce_job_exit_codes(
+        def coalesce_job_exit_codes(
+            self, batch_job_id_list: list[str]
+        ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
             """
             Collect all job exit codes in a single call.
             :param batch_job_id_list: list of Job ID strings, where each string has the form
             "<job>[.<task>]".
-            :return: list of job exit codes, associated with the list of job IDs.
+            :return: list of job exit codes or exit code, exit reason pairs associated with the list of job IDs.
             """
-            logger.
+            logger.log(
+                TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list
+            )
             # Convert batch_job_id_list to list of integer job IDs.
-            job_id_list = [int(id.split(
+            job_id_list = [int(id.split(".")[0]) for id in batch_job_id_list]
             status_dict = self._get_job_details(job_id_list)
-            exit_codes = []
+            exit_codes: list[int | tuple[int, BatchJobExitReason | None] | None] = []
             for _, status in status_dict.items():
                 exit_codes.append(self._get_job_return_code(status))
             return exit_codes

-        def getJobExitCode(
+        def getJobExitCode(
+            self, batchJobID: str
+        ) -> int | tuple[int, BatchJobExitReason | None] | None:
             """
             Get job exit code for given batch job ID.
             :param batchJobID: string of the form "<job>[.<task>]".
             :return: integer job exit code.
             """
-            logger.
+            logger.log(TRACE, "Getting exit code for slurm job: %s", batchJobID)
             # Convert batchJobID to an integer job ID.
-            job_id = int(batchJobID.split(
+            job_id = int(batchJobID.split(".")[0])
             status_dict = self._get_job_details([job_id])
             status = status_dict[job_id]
             return self._get_job_return_code(status)

-        def _get_job_details(
+        def _get_job_details(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
             """
             Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
             Fetch job details from Slurm's accounting system or job control system.
@@ -138,20 +348,77 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 status_dict = self._getJobDetailsFromScontrol(job_id_list)
             return status_dict

-        def _get_job_return_code(
+        def _get_job_return_code(
+            self, status: tuple[str | None, int | None]
+        ) -> int | tuple[int, BatchJobExitReason | None] | None:
             """
+            Given a Slurm return code, status pair, summarize them into a Toil return code, exit reason pair.
+
+            The return code may have already been OR'd with the 128-offset
+            Slurm-reported signal.
+
+            Slurm will report return codes of 0 even if jobs time out instead
+            of succeeding:
+
+                2093597|TIMEOUT|0:0
+                2093597.batch|CANCELLED|0:15
+
+            So we guarantee here that, if the Slurm status string is not a
+            successful one as defined in
+            <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
+            will not return a successful return code.
+
             Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
-            :param status: tuple containing the job's state and it's return code.
-            :return: the job's return code if it's completed, otherwise None.
+            :param status: tuple containing the job's state and it's return code from Slurm.
+            :return: the job's return code for Toil if it's completed, otherwise None.
             """
             state, rc = status
-            # If job is in a running state, set return code to None to indicate we don't have
-            # an update.
-            if state in ('PENDING', 'RUNNING', 'CONFIGURING', 'COMPLETING', 'RESIZING', 'SUSPENDED'):
-                rc = None
-            return rc

-
+            if state not in TERMINAL_STATES:
+                # Don't treat the job as exited yet
+                return None
+
+            exit_reason = TERMINAL_STATES[state]
+
+            if exit_reason == BatchJobExitReason.FINISHED:
+                # The only state that should produce a 0 ever is COMPLETED. So
+                # if the job is COMPLETED and the exit reason is thus FINISHED,
+                # pass along the code it has.
+                return (rc, exit_reason)  # type: ignore[return-value] # mypy doesn't understand enums well
+
+            if rc == 0:
+                # The job claims to be in a state other than COMPLETED, but
+                # also to have not encountered a problem. Say the exit status
+                # is unavailable.
+                return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
+
+            # If the code is nonzero, pass it along.
+            return (rc, exit_reason)  # type: ignore[return-value] # mypy doesn't understand enums well
+
+        def _canonicalize_state(self, state: str) -> str:
+            """
+            Turn a state string form SLURM into just the state token like "CANCELED".
+            """
+
+            # Slurm will sometimes send something like "CANCELED by 30065" in
+            # the state column for some reason.
+
+            state_token = state
+
+            if " " in state_token:
+                state_token = state.split(" ", 1)[0]
+
+            if (
+                state_token not in TERMINAL_STATES
+                and state_token not in NONTERMINAL_STATES
+            ):
+                raise RuntimeError("Toil job in unimplemented Slurm state " + state)
+
+            return state_token
+
+        def _getJobDetailsFromSacct(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
             """
             Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
             :param job_id_list: list of integer batch job IDs.
@@ -159,51 +426,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             containing the job's state and exit code.
             """
             job_ids = ",".join(str(id) for id in job_id_list)
-            args = [
-
-
-
-
-
+            args = [
+                "sacct",
+                "-n",  # no header
+                "-j",
+                job_ids,  # job
+                "--format",
+                "JobIDRaw,State,ExitCode",  # specify output columns
+                "-P",  # separate columns with pipes
+                "-S",
+                "1970-01-01",
+            ]  # override start time limit
             stdout = call_command(args, quiet=True)

             # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
             # job state and exit status. Initialize dict before processing output of `sacct`.
-            job_statuses = {}
+            job_statuses: dict[int, tuple[str | None, int | None]] = {}
             for job_id in job_id_list:
                 job_statuses[job_id] = (None, None)

             for line in stdout.splitlines():
-                values = line.strip().split(
+                values = line.strip().split("|")
                 if len(values) < 3:
                     continue
+                state: str
                 job_id_raw, state, exitcode = values
-
+                state = self._canonicalize_state(state)
+                logger.log(
+                    TRACE, "%s state of job %s is %s", args[0], job_id_raw, state
+                )
                 # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                 job_id_parts = job_id_raw.split(".")
                 if len(job_id_parts) > 1:
                     continue
                 job_id = int(job_id_parts[0])
-                status
+                status: int
+                signal: int
+                status, signal = (int(n) for n in exitcode.split(":"))
                 if signal > 0:
                     # A non-zero signal may indicate e.g. an out-of-memory killed job
                     status = 128 + signal
-                logger.
-
+                logger.log(
+                    TRACE,
+                    "%s exit code of job %d is %s, return status %d",
+                    args[0],
+                    job_id,
+                    exitcode,
+                    status,
+                )
                 job_statuses[job_id] = state, status
-            logger.
+            logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
             return job_statuses

-        def _getJobDetailsFromScontrol(
+        def _getJobDetailsFromScontrol(
+            self, job_id_list: list[int]
+        ) -> dict[int, tuple[str | None, int | None]]:
             """
             Get SLURM job exit codes for the jobs in `job_id_list` by running `scontrol`.
             :param job_id_list: list of integer batch job IDs.
             :return: dict of job statuses, where key is the job-id, and value is a tuple
             containing the job's state and exit code.
             """
-            args = [
-                'show',
-                'job']
+            args = ["scontrol", "show", "job"]
             # `scontrol` can only return information about a single job,
             # or all the jobs it knows about.
             if len(job_id_list) == 1:
@@ -212,14 +496,16 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             stdout = call_command(args, quiet=True)

             # Job records are separated by a blank line.
+            job_records = None
             if isinstance(stdout, str):
-                job_records = stdout.strip().split(
+                job_records = stdout.strip().split("\n\n")
             elif isinstance(stdout, bytes):
-                job_records = stdout.decode(
+                job_records = stdout.decode("utf-8").strip().split("\n\n")

             # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
             # job state and exit status. Initialize dict before processing output of `scontrol`.
-            job_statuses = {}
+            job_statuses: dict[int, tuple[str | None, int | None]] = {}
+            job_id: int | None
             for job_id in job_id_list:
                 job_statuses[job_id] = (None, None)

@@ -229,7 +515,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 return job_statuses

             for record in job_records:
-                job = {}
+                job: dict[str, str] = {}
+                job_id = None
                 for line in record.splitlines():
                     for item in line.split():
                         # Output is in the form of many key=value pairs, multiple pairs on each line
@@ -237,73 +524,104 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                         # added to a dictionary.
                         # Note: In some cases, the value itself may contain white-space. So, if we find
                         # a key without a value, we consider that key part of the previous value.
-                        bits = item.split(
+                        bits = item.split("=", 1)
                         if len(bits) == 1:
-                            job[key] +=
+                            job[key] += " " + bits[0]  # type: ignore[has-type] # we depend on the previous iteration to populate key
                         else:
                             key = bits[0]
                             job[key] = bits[1]
                     # The first line of the record contains the JobId. Stop processing the remainder
                     # of this record, if we're not interested in this job.
-                    job_id = int(job[
+                    job_id = int(job["JobId"])
                     if job_id not in job_id_list:
-                        logger.
+                        logger.log(
+                            TRACE, "%s job %d is not in the list", args[0], job_id
+                        )
                         break
-                if job_id not in job_id_list:
+                if job_id is None or job_id not in job_id_list:
                     continue
-                state = job[
-
+                state = job["JobState"]
+                state = self._canonicalize_state(state)
+                logger.log(TRACE, "%s state of job %s is %s", args[0], job_id, state)
                 try:
-                    exitcode = job[
+                    exitcode = job["ExitCode"]
                     if exitcode is not None:
-                        status, signal = (int(n) for n in exitcode.split(
+                        status, signal = (int(n) for n in exitcode.split(":"))
                         if signal > 0:
                             # A non-zero signal may indicate e.g. an out-of-memory killed job
                             status = 128 + signal
-                        logger.
-
+                        logger.log(
+                            TRACE,
+                            "%s exit code of job %d is %s, return status %d",
+                            args[0],
+                            job_id,
+                            exitcode,
+                            status,
+                        )
                         rc = status
                     else:
                         rc = None
                 except KeyError:
                     rc = None
                 job_statuses[job_id] = (state, rc)
-            logger.
+            logger.log(TRACE, "%s returning job statuses: %s", args[0], job_statuses)
             return job_statuses

         ###
         ### Implementation-specific helper methods
         ###

-        def prepareSbatch(
-
-
-
-
-
-
+        def prepareSbatch(
+            self,
+            cpu: int,
+            mem: int,
+            jobID: int,
+            jobName: str,
+            job_environment: dict[str, str] | None,
+            gpus: int | None,
+        ) -> list[str]:
+            """
+            Returns the sbatch command line to run to queue the job.
+            """
+
+            # Start by naming the job
+            sbatch_line = ["sbatch", "-J", f"toil_job_{jobID}_{jobName}"]
+
+            # Make sure the job gets a signal before it disappears so that e.g.
+            # container cleanup finally blocks can run. Ask for SIGINT so we
+            # can get the default Python KeyboardInterrupt which third-party
+            # code is likely to plan for. Make sure to send it to the batch
+            # shell process with "B:", not to all the srun steps it launches
+            # (because there shouldn't be any). We cunningly replaced the batch
+            # shell process with the Toil worker process, so Toil should be
+            # able to get the signal.
+            #
+            # TODO: Add a way to detect when the job failed because it
+            # responded to this signal and use the right exit reason for it.
+            sbatch_line.append("--signal=B:INT@30")

-            # Returns the sbatch command line before the script to run
-            sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
-            if gpus:
-                sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
             environment = {}
             environment.update(self.boss.environment)
             if job_environment:
                 environment.update(job_environment)

             # "Native extensions" for SLURM (see DRMAA or SAGA)
-
+            # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
+            nativeConfig: str = self.boss.config.slurm_args  # type: ignore[attr-defined]

             # --export=[ALL,]<environment_toil_variables>
             set_exports = "--export=ALL"

             if nativeConfig is not None:
-                logger.debug(
+                logger.debug(
+                    "Native SLURM options appended to sbatch: %s", nativeConfig
+                )

                 for arg in nativeConfig.split():
                     if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
-                        raise ValueError(
+                        raise ValueError(
+                            f"Some resource arguments are incompatible: {nativeConfig}"
+                        )
                     # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
                     if arg.startswith("--export"):
                         set_exports = arg
@@ -314,54 +632,149 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):

             for k, v in environment.items():
                 quoted_value = quote(os.environ[k] if v is None else v)
-                argList.append(f
+                argList.append(f"{k}={quoted_value}")

-            set_exports +=
+            set_exports += "," + ",".join(argList)

             # add --export to the sbatch
             sbatch_line.append(set_exports)

-            parallel_env =
+            parallel_env: str = self.boss.config.slurm_pe  # type: ignore[attr-defined]
             if cpu and cpu > 1 and parallel_env:
-                sbatch_line.append(f
+                sbatch_line.append(f"--partition={parallel_env}")

-            if mem is not None and self.boss.config.
+            if mem is not None and self.boss.config.slurm_allocate_mem:  # type: ignore[attr-defined]
                 # memory passed in is in bytes, but slurm expects megabytes
-                sbatch_line.append(f
+                sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
             if cpu is not None:
-                sbatch_line.append(f
+                sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")

-
-
-
+            time_limit: int = self.boss.config.slurm_time  # type: ignore[attr-defined]
+            if time_limit is not None:
+                # Put all the seconds in the seconds slot
+                sbatch_line.append(f"--time=0:{time_limit}")
+
+            if gpus:
+                # This block will add a gpu supported partition only if no partition is supplied by the user
+                sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
+                if not any(option.startswith("--partition") for option in sbatch_line):
+                    # no partition specified, so specify one
+                    # try to get the name of the lowest priority gpu supported partition
+                    lowest_gpu_partition = self.boss.partitions.default_gpu_partition
+                    if lowest_gpu_partition is None:
+                        # no gpu partitions are available, raise an error
+                        raise RuntimeError(
+                            f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                        )
+                    if (
+                        time_limit is not None
+                        and lowest_gpu_partition.time_limit < time_limit
+                    ):
+                        # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                        logger.warning(
+                            "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                            time_limit,
+                            lowest_gpu_partition.partition_name,
+                            lowest_gpu_partition.time_limit,
+                        )
+                    sbatch_line.append(
+                        f"--partition={lowest_gpu_partition.partition_name}"
+                    )
+                else:
+                    # there is a partition specified already, check if the partition has GPUs
+                    for i, option in enumerate(sbatch_line):
+                        if option.startswith("--partition"):
+                            # grab the partition name depending on if it's specified via an "=" or a space
+                            if "=" in option:
+                                partition_name = option[len("--partition=") :]
+                            else:
+                                partition_name = option[i + 1]
+                            available_gpu_partitions = (
+                                self.boss.partitions.gpu_partitions
+                            )
+                            if partition_name not in available_gpu_partitions:
+                                # the specified partition is not compatible, so warn the user that the job may not work
+                                logger.warning(
+                                    f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
+                                    f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
+                                )
+                            break
+
+            if not any(option.startswith("--partition") for option in sbatch_line):
+                # Pick a partition ourselves
+                chosen_partition = self.boss.partitions.get_partition(time_limit)
+                if chosen_partition is not None:
+                    # Route to that partition
+                    sbatch_line.append(f"--partition={chosen_partition}")
+
+            stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
+            stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
+            sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])

             return sbatch_line

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+    ) -> None:
+        super().__init__(config, maxCores, maxMemory, maxDisk)
+        self.partitions = SlurmBatchSystem.PartitionSet()
+
+    # Override issuing jobs so we can check if we need to use Slurm's magic
+    # whole-node-memory feature.
+    def issueBatchJob(
+        self,
+        command: str,
+        job_desc: JobDescription,
+        job_environment: dict[str, str] | None = None,
+    ) -> int:
+        # Avoid submitting internal jobs to the batch queue, handle locally
+        local_id = self.handleLocalJob(command, job_desc)
+        if local_id is not None:
+            return local_id
+        else:
+            self.check_resource_request(job_desc)
+            gpus = self.count_needed_gpus(job_desc)
+            job_id = self.getNextJobID()
+            self.currentJobs.add(job_id)
+
+            if "memory" not in job_desc.requirements and self.config.slurm_default_all_mem:  # type: ignore[attr-defined]
+                # The job doesn't have its own memory requirement, and we are
+                # defaulting to whole node memory. Use Slurm's 0-memory sentinel.
+                memory = 0
+            else:
+                # Use the memory actually on the job, or the Toil default memory
+                memory = job_desc.memory
+
+            self.newJobsQueue.put(
+                (
+                    job_id,
+                    job_desc.cores,
+                    memory,
+                    command,
+                    get_job_kind(job_desc.get_names()),
+                    job_environment,
+                    gpus,
+                )
+            )
+            logger.debug(
+                "Issued the job command: %s with job id: %s and job name %s",
+                command,
+                str(job_id),
+                get_job_kind(job_desc.get_names()),
+            )
+        return job_id

     def _check_accelerator_request(self, requirer: Requirer) -> None:
         for accelerator in requirer.accelerators:
-            if accelerator[
-                raise InsufficientSystemResources(
-
-
-
-
+            if accelerator["kind"] != "gpu":
+                raise InsufficientSystemResources(
+                    requirer,
+                    "accelerators",
+                    details=[
+                        f"The accelerator {accelerator} could not be provided"
+                        "The Toil Slurm batch system only supports gpu accelerators at the moment."
+                    ],
+                )

     ###
     ### The interface for SLURM
@@ -375,17 +788,69 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     # implement getWaitDuration().

     @classmethod
-    def add_options(cls, parser:
-
-
-
-
-
-
-
-
-
+    def add_options(cls, parser: ArgumentParser | _ArgumentGroup) -> None:
+
+        parser.add_argument(
+            "--slurmAllocateMem",
+            dest="slurm_allocate_mem",
+            type=strtobool,
+            default=True,
+            env_var="TOIL_SLURM_ALLOCATE_MEM",
+            help="If False, do not use --mem. Used as a workaround for Slurm clusters that reject jobs "
+            "with memory allocations.",
+        )
+        # Keep these deprcated options for backward compatibility
+        parser.add_argument(
+            "--dont_allocate_mem",
+            action="store_false",
+            dest="slurm_allocate_mem",
+            help=SUPPRESS,
+        )
+        parser.add_argument(
+            "--allocate_mem",
+            action="store_true",
+            dest="slurm_allocate_mem",
+            help=SUPPRESS,
+        )
+
+        parser.add_argument(
+            "--slurmDefaultAllMem",
+            dest="slurm_default_all_mem",
+            type=strtobool,
+            default=False,
+            env_var="TOIL_SLURM_DEFAULT_ALL_MEM",
+            help="If True, assign Toil jobs without their own memory requirements all available "
+            "memory on a Slurm node (via Slurm --mem=0).",
+        )
+        parser.add_argument(
+            "--slurmTime",
+            dest="slurm_time",
+            type=parse_slurm_time,
+            default=None,
+            env_var="TOIL_SLURM_TIME",
+            help="Slurm job time limit, in [DD-]HH:MM:SS format.",
+        )
+        parser.add_argument(
+            "--slurmPE",
+            dest="slurm_pe",
+            default=None,
+            env_var="TOIL_SLURM_PE",
+            help="Special partition to send Slurm jobs to if they ask for more than 1 CPU.",
+        )
+        parser.add_argument(
+            "--slurmArgs",
+            dest="slurm_args",
+            default="",
+            env_var="TOIL_SLURM_ARGS",
+            help="Extra arguments to pass to Slurm.",
+        )
+
+    OptionType = TypeVar("OptionType")
+
     @classmethod
     def setOptions(cls, setOption: OptionSetter) -> None:
-        setOption("
-
+        setOption("slurm_allocate_mem")
+        setOption("slurm_default_all_mem")
+        setOption("slurm_time")
+        setOption("slurm_pe")
+        setOption("slurm_args")
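
For orientation, the helpers this rewrite adds at module level in toil/batchSystems/slurm.py can be exercised directly. The following is a minimal sketch, not part of the package diff, assuming toil 8.0.0 is installed; the expected values follow from the parsing rules shown in the hunks above:

    # Sketch: exercise the module-level helpers added in toil 8.0.0's slurm.py.
    from toil.batchSystems.slurm import (
        NONTERMINAL_STATES,
        TERMINAL_STATES,
        parse_slurm_time,
    )

    # Slurm durations are [days-]hours:minutes:seconds; leading units may be
    # omitted, so "10:30" means 10 minutes 30 seconds.
    assert parse_slurm_time("7-00:00:00") == 7 * 86400
    assert parse_slurm_time("1:00:00") == 3600
    assert parse_slurm_time("10:30") == 630

    # Non-numeric fields (e.g. squeue's "INVALID") raise ValueError; callers in
    # the diff catch this and fall back to 0 (elapsed time) or infinity (limits).
    try:
        parse_slurm_time("INVALID")
    except ValueError:
        print("not a parseable Slurm time")

    # A job only counts as exited once its state maps to a Toil exit reason.
    assert "TIMEOUT" in TERMINAL_STATES
    assert "PENDING" in NONTERMINAL_STATES

These tables drive the new _get_job_return_code logic above: a state outside TERMINAL_STATES yields None (no update yet), COMPLETED passes the job's own exit code through, and any other terminal state that still reports 0 is rewritten to EXIT_STATUS_UNAVAILABLE_VALUE paired with the mapped exit reason, so a TIMEOUT can never be mistaken for success.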