PyPI - toil - Versions diffs - 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl - Mend

toil-6.1.0a1-py3-none-any.whl → toil-7.0.0-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (104)

toil/__init__.py +1 -232
toil/batchSystems/abstractBatchSystem.py +41 -17
toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
toil/batchSystems/awsBatch.py +8 -8
toil/batchSystems/cleanup_support.py +7 -3
toil/batchSystems/contained_executor.py +4 -5
toil/batchSystems/gridengine.py +1 -1
toil/batchSystems/htcondor.py +5 -5
toil/batchSystems/kubernetes.py +25 -11
toil/batchSystems/local_support.py +3 -3
toil/batchSystems/lsf.py +9 -9
toil/batchSystems/mesos/batchSystem.py +4 -4
toil/batchSystems/mesos/executor.py +3 -2
toil/batchSystems/options.py +9 -0
toil/batchSystems/singleMachine.py +11 -10
toil/batchSystems/slurm.py +129 -16
toil/batchSystems/torque.py +1 -1
toil/bus.py +45 -3
toil/common.py +56 -31
toil/cwl/cwltoil.py +442 -371
toil/deferred.py +1 -1
toil/exceptions.py +1 -1
toil/fileStores/abstractFileStore.py +69 -20
toil/fileStores/cachingFileStore.py +6 -22
toil/fileStores/nonCachingFileStore.py +6 -15
toil/job.py +270 -86
toil/jobStores/abstractJobStore.py +37 -31
toil/jobStores/aws/jobStore.py +280 -218
toil/jobStores/aws/utils.py +60 -31
toil/jobStores/conftest.py +2 -2
toil/jobStores/fileJobStore.py +3 -3
toil/jobStores/googleJobStore.py +3 -4
toil/leader.py +89 -38
toil/lib/aws/__init__.py +26 -10
toil/lib/aws/iam.py +2 -2
toil/lib/aws/session.py +62 -22
toil/lib/aws/utils.py +73 -37
toil/lib/conversions.py +24 -1
toil/lib/ec2.py +118 -69
toil/lib/expando.py +1 -1
toil/lib/generatedEC2Lists.py +8 -8
toil/lib/io.py +42 -4
toil/lib/misc.py +1 -3
toil/lib/resources.py +57 -16
toil/lib/retry.py +12 -5
toil/lib/threading.py +29 -14
toil/lib/throttle.py +1 -1
toil/options/common.py +31 -30
toil/options/wdl.py +5 -0
toil/provisioners/__init__.py +9 -3
toil/provisioners/abstractProvisioner.py +12 -2
toil/provisioners/aws/__init__.py +20 -15
toil/provisioners/aws/awsProvisioner.py +406 -329
toil/provisioners/gceProvisioner.py +2 -2
toil/provisioners/node.py +13 -5
toil/server/app.py +1 -1
toil/statsAndLogging.py +93 -23
toil/test/__init__.py +27 -12
toil/test/batchSystems/batchSystemTest.py +40 -33
toil/test/batchSystems/batch_system_plugin_test.py +79 -0
toil/test/batchSystems/test_slurm.py +22 -7
toil/test/cactus/__init__.py +0 -0
toil/test/cactus/test_cactus_integration.py +58 -0
toil/test/cwl/cwlTest.py +245 -236
toil/test/cwl/seqtk_seq.cwl +1 -1
toil/test/docs/scriptsTest.py +11 -14
toil/test/jobStores/jobStoreTest.py +40 -54
toil/test/lib/aws/test_iam.py +2 -2
toil/test/lib/test_ec2.py +1 -1
toil/test/options/__init__.py +13 -0
toil/test/options/options.py +37 -0
toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
toil/test/provisioners/clusterTest.py +99 -16
toil/test/server/serverTest.py +2 -2
toil/test/src/autoDeploymentTest.py +1 -1
toil/test/src/dockerCheckTest.py +2 -1
toil/test/src/environmentTest.py +125 -0
toil/test/src/fileStoreTest.py +1 -1
toil/test/src/jobDescriptionTest.py +18 -8
toil/test/src/jobTest.py +1 -1
toil/test/src/realtimeLoggerTest.py +4 -0
toil/test/src/workerTest.py +52 -19
toil/test/utils/toilDebugTest.py +62 -4
toil/test/utils/utilsTest.py +23 -21
toil/test/wdl/wdltoil_test.py +49 -21
toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
toil/toilState.py +68 -9
toil/utils/toilDebugFile.py +1 -1
toil/utils/toilDebugJob.py +153 -26
toil/utils/toilLaunchCluster.py +12 -2
toil/utils/toilRsyncCluster.py +7 -2
toil/utils/toilSshCluster.py +7 -3
toil/utils/toilStats.py +310 -266
toil/utils/toilStatus.py +98 -52
toil/version.py +11 -11
toil/wdl/wdltoil.py +644 -225
toil/worker.py +125 -83
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
toil-7.0.0.dist-info/METADATA +158 -0
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
toil-6.1.0a1.dist-info/METADATA +0 -125
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0

toil/batchSystems/abstractGridEngineBatchSystem.py CHANGED Viewed

@@ -22,9 +22,10 @@ from typing import Dict, List, Optional, Tuple, Union
 from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
                                                    UpdatedBatchJobInfo)
 from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
-from toil.bus import ExternalBatchIdMessage
+from toil.bus import ExternalBatchIdMessage, get_job_kind
 from toil.job import AcceleratorRequirement
 from toil.lib.misc import CalledProcessErrorStderr
+from toil.lib.retry import old_retry, DEFAULT_DELAYS
 logger = logging.getLogger(__name__)
@@ -44,26 +45,29 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     A partial implementation of BatchSystemSupport for batch systems run on a
     standard HPC cluster. By default auto-deployment is not implemented.
     """
+    class GridEngineThreadException(Exception):
+        pass
-    class Worker(Thread, metaclass=ABCMeta):
+    class GridEngineThread(Thread, metaclass=ABCMeta):
         def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
             """
-            Abstract worker interface class. All instances are created with five
+            Abstract thread interface class. All instances are created with five
             initial arguments (below). Note the Queue instances passed are empty.
             :param newJobsQueue: a Queue of new (unsubmitted) jobs
             :param updatedJobsQueue: a Queue of jobs that have been updated
             :param killQueue: a Queue of active jobs that need to be killed
-            :param killedJobsQueue: Queue of killed jobs for this worker
+            :param killedJobsQueue: Queue of killed jobs for this thread
             :param boss: the AbstractGridEngineBatchSystem instance that
-                         controls this AbstractGridEngineWorker
+                         controls this GridEngineThread
             """
             Thread.__init__(self)
             self.boss = boss
             self.boss.config.statePollingWait = \
                 self.boss.config.statePollingWait or self.boss.getWaitDuration()
+            self.boss.config.state_polling_timeout = \
+                self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
             self.newJobsQueue = newJobsQueue
             self.updatedJobsQueue = updatedJobsQueue
             self.killQueue = killQueue
@@ -74,6 +78,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             self.batchJobIDs: Dict[int, str] = dict()
             self._checkOnJobsCache = None
             self._checkOnJobsTimestamp = None
+            self.exception = None
         def getBatchSystemID(self, jobID: int) -> str:
             """
@@ -107,7 +112,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             """
             Create a new job with the given attributes.
-            Implementation-specific; called by AbstractGridEngineWorker.run()
+            Implementation-specific; called by GridEngineThread.run()
             """
             activity = False
             # Load new job id if present:
@@ -143,7 +148,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         def killJobs(self):
             """
-            Kill any running jobs within worker
+            Kill any running jobs within thread
             """
             killList = list()
             while True:
@@ -175,7 +180,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             while killList:
                 for jobID in list(killList):
                     batchJobID = self.getBatchSystemID(jobID)
-                    if self.boss.with_retries(self.getJobExitCode, batchJobID) is not None:
+                    exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
+                    if exit_code is not None:
                         logger.debug('Adding jobID %s to killedJobsQueue', jobID)
                         self.killedJobsQueue.put(jobID)
                         killList.remove(jobID)
@@ -225,23 +231,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             return activity
         def _handle_job_status(
-            self, job_id: int, status: Union[int, None], activity: bool
+            self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool
         ) -> bool:
             """
             Helper method for checkOnJobs to handle job statuses
             """
             if status is not None:
+                if isinstance(status, int):
+                    code = status
+                    reason = None
+                else:
+                    code, reason = status
                 self.updatedJobsQueue.put(
                     UpdatedBatchJobInfo(
-                        jobID=job_id, exitStatus=status, exitReason=None, wallTime=None
-                    )
-                )
-                self.forgetJob(job_id)
-                return True
-            if status is not None and isinstance(status, BatchJobExitReason):
-                self.updatedJobsQueue.put(
-                    UpdatedBatchJobInfo(
-                        jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
+                        jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
                     )
                 )
                 self.forgetJob(job_id)
@@ -276,14 +279,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                 while self._runStep():
                     pass
             except Exception as ex:
-                logger.error("GridEngine like batch system failure", exc_info=ex)
-                raise
+                self.exception = ex
+                logger.error("GridEngine like batch system failure: %s", ex)
+                # don't raise exception as is_alive will still be set to false,
+                # signalling exception in the thread as we expect the thread to
+                # always be running for the duration of the workflow
-        def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
+        def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
             """
-            Returns exit codes for a list of jobs.
+            Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
-            Called by AbstractGridEngineWorker.checkOnJobs().
+            Called by GridEngineThread.checkOnJobs().
             This is an optional part of the interface. It should raise
             NotImplementedError if not actually implemented for a particular
@@ -344,23 +350,26 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         def killJob(self, jobID):
             """
             Kill specific job with the Toil job ID. Implementation-specific; called
-            by AbstractGridEngineWorker.killJobs()
+            by GridEngineThread.killJobs()
             :param string jobID: Toil job ID
             """
             raise NotImplementedError()
         @abstractmethod
-        def getJobExitCode(self, batchJobID):
+        def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             """
-            Returns job exit code or an instance of abstractBatchSystem.BatchJobExitReason.
-            if something else happened other than the job exiting.
-            Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
+            Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
-            :param string batchjobID: batch system job ID
+            Returns None if the job is still running.
-            :rtype: int|toil.batchSystems.abstractBatchSystem.BatchJobExitReason: exit code int
-                    or BatchJobExitReason if something else happened other than job exiting.
+            If the job is not running but the exit code is not available, it
+            will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
+            called by GridEngineThread.checkOnJobs().
+            The exit code will only be 0 if the job affirmatively succeeded.
+            :param string batchjobID: batch system job ID
             """
             raise NotImplementedError()
@@ -375,24 +384,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         self.updatedJobsQueue = Queue()
         self.killQueue = Queue()
         self.killedJobsQueue = Queue()
-        # get the associated worker class here
-        self.worker = self.Worker(self.newJobsQueue, self.updatedJobsQueue,
-                                  self.killQueue, self.killedJobsQueue, self)
-        self.worker.start()
+        # get the associated thread class here
+        self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
+                                                       self.killQueue, self.killedJobsQueue, self)
+        self.background_thread.start()
         self._getRunningBatchJobIDsTimestamp = None
         self._getRunningBatchJobIDsCache = {}
-    @classmethod
-    def supportsWorkerCleanup(cls):
-        return False
     @classmethod
     def supportsAutoDeployment(cls):
         return False
-    def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
         # Avoid submitting internal jobs to the batch queue, handle locally
-        localID = self.handleLocalJob(jobDesc)
+        localID = self.handleLocalJob(command, jobDesc)
         if localID is not None:
             return localID
         else:
@@ -406,11 +411,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                         gpus = accelerator['count']
             else:
                 gpus = jobDesc.accelerators
-            self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, jobDesc.get_job_kind(),
+            self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
                                    job_environment, gpus))
-            logger.debug("Issued the job command: %s with job id: %s and job name %s", jobDesc.command, str(jobID),
-                         jobDesc.get_job_kind())
+            logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
+                         get_job_kind(jobDesc.get_names()))
         return jobID
     def killBatchJobs(self, jobIDs):
@@ -424,7 +429,12 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         for jobID in jobIDs:
             self.killQueue.put(jobID)
         while jobIDs:
-            killedJobId = self.killedJobsQueue.get()
+            try:
+                killedJobId = self.killedJobsQueue.get(timeout=10)
+            except Empty:
+                if not self.background_thread.is_alive():
+                    raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
+                continue
             if killedJobId is None:
                 break
             jobIDs.remove(killedJobId)
@@ -456,7 +466,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                 self.config.statePollingWait):
             batchIds = self._getRunningBatchJobIDsCache
         else:
-            batchIds = self.with_retries(self.worker.getRunningJobIDs)
+            batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
             self._getRunningBatchJobIDsCache = batchIds
             self._getRunningBatchJobIDsTimestamp = datetime.now()
         batchIds.update(self.getRunningLocalJobIDs())
@@ -464,6 +474,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     def getUpdatedBatchJob(self, maxWait):
         local_tuple = self.getUpdatedLocalJob(0)
+        if not self.background_thread.is_alive():
+            # kill remaining jobs on the thread
+            self.background_thread.killJobs()
+            raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
         if local_tuple:
             return local_tuple
         else:
@@ -477,14 +492,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     def shutdown(self) -> None:
         """
-        Signals worker to shutdown (via sentinel) then cleanly joins the thread
+        Signals thread to shutdown (via sentinel) then cleanly joins the thread
         """
         self.shutdownLocal()
         newJobsQueue = self.newJobsQueue
         self.newJobsQueue = None
         newJobsQueue.put(None)
-        self.worker.join()
+        self.background_thread.join()
     def setEnv(self, name, value=None):
         if value and ',' in value:
@@ -503,21 +518,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     def with_retries(self, operation, *args, **kwargs):
         """
-        Call operation with args and kwargs. If one of the calls to an SGE
-        command fails, sleep and try again for a set number of times.
+        Call operation with args and kwargs. If one of the calls to a
+        command fails, sleep and try again.
         """
-        maxTries = 3
-        tries = 0
-        while True:
-            tries += 1
-            try:
-                return operation(*args, **kwargs)
-            except CalledProcessErrorStderr as err:
-                if tries < maxTries:
-                    logger.error("Will retry errored operation %s, code %d: %s",
-                                 operation.__name__, err.returncode, err.stderr)
-                    time.sleep(self.config.statePollingWait)
-                else:
-                    logger.error("Failed operation %s, code %d: %s",
+        for attempt in old_retry(
+            # Don't retry more often than the state polling wait.
+            delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
+            timeout=self.config.state_polling_timeout,
+            predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
+        ):
+            with attempt:
+                try:
+                    return operation(*args, **kwargs)
+                except CalledProcessErrorStderr as err:
+                    logger.error("Errored operation %s, code %d: %s",
                                  operation.__name__, err.returncode, err.stderr)
+                    # Raise up to the retry logic, which will retry until timeout
                     raise err

toil/batchSystems/awsBatch.py CHANGED Viewed

@@ -36,7 +36,7 @@ import uuid
 from argparse import ArgumentParser, _ArgumentGroup
 from typing import Any, Dict, Iterator, List, Optional, Set, Union
-from boto.exception import BotoServerError
+from botocore.exceptions import ClientError
 from toil import applianceSelf
 from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -156,9 +156,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
                     'AWS Batch can only provide nvidia gpu accelerators.'
                 ])
-    def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         # Try the job as local
-        local_id = self.handleLocalJob(job_desc)
+        local_id = self.handleLocalJob(command, job_desc)
         if local_id is not None:
             # It is a local job
             return local_id
@@ -184,7 +184,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
                 environment.update(job_environment)
             # Make a command to run it in the executor
-            command_list = pack_job(job_desc, self.user_script)
+            command_list = pack_job(command, self.user_script)
             # Compose a job spec to submit
             job_spec = {
@@ -376,7 +376,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # Get rid of the job definition we are using if we can.
         self._destroy_job_definition()
-    @retry(errors=[BotoServerError])
+    @retry(errors=[ClientError])
     def _try_terminate(self, aws_id: str) -> None:
         """
         Internal function. Should not be called outside this class.
@@ -392,7 +392,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         # Kill the AWS Batch job
         self.client.terminate_job(jobId=aws_id, reason='Killed by Toil')
-    @retry(errors=[BotoServerError])
+    @retry(errors=[ClientError])
     def _wait_until_stopped(self, aws_id: str) -> None:
         """
         Internal function. Should not be called outside this class.
@@ -418,7 +418,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
             logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id))
             time.sleep(2)
-    @retry(errors=[BotoServerError])
+    @retry(errors=[ClientError])
     def _get_or_create_job_definition(self) -> str:
         """
         Internal function. Should not be called outside this class.
@@ -482,7 +482,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
         return self.job_definition
-    @retry(errors=[BotoServerError])
+    @retry(errors=[ClientError])
     def _destroy_job_definition(self) -> None:
         """
         Internal function. Should not be called outside this class.

toil/batchSystems/cleanup_support.py CHANGED Viewed

@@ -69,8 +69,13 @@ class WorkerCleanupContext:
     def __enter__(self) -> None:
         # Set up an arena so we know who is the last worker to leave
-        self.arena = LastProcessStandingArena(Toil.get_toil_coordination_dir(self.workerCleanupInfo.work_dir, self.workerCleanupInfo.coordination_dir),
-                                              self.workerCleanupInfo.workflow_id + '-cleanup')
+        self.arena = LastProcessStandingArena(
+            Toil.get_toil_coordination_dir(
+                self.workerCleanupInfo.work_dir,
+                self.workerCleanupInfo.coordination_dir
+            ),
+            Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + "-cleanup"
+        )
         logger.debug('Entering cleanup arena')
         self.arena.enter()
         logger.debug('Cleanup arena entered')
@@ -90,4 +95,3 @@ class WorkerCleanupContext:
             # Now the coordination_dir is allowed to no longer exist on the node.
         logger.debug('Cleanup arena left')

toil/batchSystems/contained_executor.py CHANGED Viewed

@@ -25,18 +25,17 @@ import sys
 from typing import Any, Dict, List, Optional
 from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE
-from toil.job import JobDescription
 from toil.resource import Resource
 from toil.statsAndLogging import configure_root_logger, set_log_level
 logger = logging.getLogger(__name__)
-def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
+def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
     """
-    Create a command that, when run, will execute the given job.
+    Create a command that runs the given command in an environment.
-    :param job_desc: Job description for the job to run.
+    :param command: Worker command to run to run the job.
     :param user_script: User script that will be loaded before the job is run.
     :param environment: Environment variable dict that will be applied before
         the job is run.
@@ -46,7 +45,7 @@ def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, e
     """
     # Make a job dict to send to the executor.
     # TODO: Factor out executor setup from here and Kubernetes and TES
-    job: Dict[str, Any] = {"command": job_desc.command}
+    job: Dict[str, Any] = {"command": command}
     if user_script is not None:
         # If there's a user script resource be sure to send it along
         job['userScript'] = user_script

toil/batchSystems/gridengine.py CHANGED Viewed

@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         """
         Grid Engine-specific AbstractGridEngineWorker methods
         """

toil/batchSystems/htcondor.py CHANGED Viewed

@@ -48,7 +48,7 @@ schedd_lock = Lock()
 class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
     # When using HTCondor, the Schedd handles scheduling
-    class Worker(AbstractGridEngineBatchSystem.Worker):
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
         # Override the createJobs method so that we can use htcondor.Submit objects
         # and so that we can get disk allocation requests and ceil the CPU request.
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
             return '"' + ' '.join(env_items) + '"'
     # Override the issueBatchJob method so HTCondor can be given the disk request
-    def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
+    def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
         # Avoid submitting internal jobs to the batch queue, handle locally
-        localID = self.handleLocalJob(jobNode)
+        localID = self.handleLocalJob(command, jobNode)
         if localID is not None:
             return localID
         else:
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
             self.currentJobs.add(jobID)
             # Construct our style of job tuple
-            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, jobNode.command,
+            self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
                                    job_environment or {}, jobNode.accelerators))
-            logger.debug("Issued the job command: %s with job id: %s ", jobNode.command, str(jobID))
+            logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
         return jobID

toil/batchSystems/kubernetes.py CHANGED Viewed

@@ -47,6 +47,8 @@ from typing import (Any,
                     cast,
                     overload)
+from toil.lib.conversions import opt_strtobool
 if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec
 else:
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
                                V1SecretVolumeSource,
                                V1Toleration,
                                V1Volume,
-                               V1VolumeMount)
+                               V1VolumeMount, V1SecurityContext)
 from kubernetes.client.api_client import ApiClient
 from kubernetes.client.exceptions import ApiException
 from kubernetes.config.config_exception import ConfigException
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
     def _create_pod_spec(
             self,
+            command: str,
             job_desc: JobDescription,
             job_environment: Optional[Dict[str, str]] = None
     ) -> V1PodSpec:
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             environment.update(job_environment)
         # Make a command to run it in the executor
-        command_list = pack_job(job_desc, self.user_script, environment=environment)
+        command_list = pack_job(command, self.user_script, environment=environment)
         # The Kubernetes API makes sense only in terms of the YAML format. Objects
         # represent sections of the YAML files. Except from our point of view, all
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         # Make a container definition
         container = V1Container(command=command_list,
-                                                  image=self.docker_image,
-                                                  name="runner-container",
-                                                  resources=resources,
-                                                  volume_mounts=mounts)
+                                image=self.docker_image,
+                                name="runner-container",
+                                resources=resources,
+                                volume_mounts=mounts)
+        # In case security context rules are not allowed to be set, we only apply
+        # a security context at all if we need to turn on privileged mode.
+        if self.config.kubernetes_privileged:
+            container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
         # Wrap the container in a spec
         pod_spec = V1PodSpec(containers=[container],
-                                               volumes=volumes,
-                                               restart_policy="Never")
+                             volumes=volumes,
+                             restart_policy="Never")
         # Tell the spec where to land
         placement.apply(pod_spec)
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
             self._release_acquired_resources(resources, notify=resource_notify)
             del self._acquired_resources[job_name]
-    def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
+    def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
         # Try the job as local
-        localID = self.handleLocalJob(job_desc)
+        localID = self.handleLocalJob(command, job_desc)
         if localID is not None:
             # It is a local job
             return localID
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         self.check_resource_request(job_desc)
         # Make a pod that describes running the job
-        pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
+        pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
         # Make a batch system scope job ID
         job_id = self.getNextJobID()
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
                             help="Seconds to wait for a scheduled Kubernetes pod to start running.  "
                                  "(default: %(default)s)")
+        parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
+                            help="Whether to ask worker pods to run in privileged mode. This should be used to access "
+                                 "privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
+                                 "this is set to True. (default: %(default)s)")
     OptionType = TypeVar('OptionType')
     @classmethod
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         setOption("kubernetes_owner")
         setOption("kubernetes_service_account",)
         setOption("kubernetes_pod_timeout")
+        setOption("kubernetes_privileged")

toil/batchSystems/local_support.py CHANGED Viewed

@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
         )
-    def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
+    def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
         """
-        To be called by issueBatchJobs.
+        To be called by issueBatchJob.
         Returns the jobID if the jobDesc has been submitted to the local queue,
         otherwise returns None
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
             # somehow doesn't error whereas just returning the value complains
             # we're returning an Any. TODO: When singleMachine.py typechecks,
             # remove all these extra variables.
-            local_id: int = self.localBatch.issueBatchJob(jobDesc)
+            local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
             return local_id
         else:
             return None

toil/batchSystems/lsf.py CHANGED Viewed

@@ -25,12 +25,12 @@ import re
 import subprocess
 from datetime import datetime
 from random import randint
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 from dateutil.parser import parse
 from dateutil.tz import tzlocal
-from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
 from toil.batchSystems.abstractGridEngineBatchSystem import \
     AbstractGridEngineBatchSystem
 from toil.batchSystems.lsfHelper import (check_lsf_json_output_supported,
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
 class LSFBatchSystem(AbstractGridEngineBatchSystem):
-    class Worker(AbstractGridEngineBatchSystem.Worker):
-        """LSF specific AbstractGridEngineWorker methods."""
+    class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
+        """LSF specific GridEngineThread methods."""
         def getRunningJobIDs(self):
             times = {}
@@ -161,7 +161,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
                         status_resonse.append(None)
             return status_resonse
-        def getJobExitCode(self, lsfJobID):
+        def getJobExitCode(self, lsfJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             # the task is set as part of the job ID if using getBatchSystemID()
             if "NOT_SUBMITTED" in lsfJobID:
                 logger.error("bjobs detected job failed to submit")
@@ -186,7 +186,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
             return self.fallbackGetJobExitCode(job)
-        def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, None]:
+        def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             """
             Helper function for getJobExitCode to parse the bjobs status record
             """
@@ -224,7 +224,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
                         exit_info, job
                     )
                     if "TERM_MEMLIMIT" in exit_reason:
-                        return BatchJobExitReason.MEMLIMIT
+                        return (exit_code if exit_code != 0 else EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.MEMLIMIT)
                     return exit_code
                 if process_status == "RUN":
                     logger.debug(
@@ -237,7 +237,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
                 return self.getJobExitCodeBACCT(job)
-        def getJobExitCodeBACCT(self,job):
+        def getJobExitCodeBACCT(self,job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             # if not found in bjobs, then try bacct (slower than bjobs)
             logger.debug("bjobs failed to detect job - trying bacct: "
                          "{}".format(job))
@@ -258,7 +258,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
                          "running: {}".format(job))
             return None
-        def fallbackGetJobExitCode(self, job):
+        def fallbackGetJobExitCode(self, job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
             args = ["bjobs", "-l", str(job)]
             logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
             stdout = call_command(args)

toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

toil 6.1.0a1py3-none-any.whl → 7.0.0py3-none-any.whl