toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
@@ -25,24 +25,31 @@ import stat
 import sys
 import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import Any, Callable,
+from typing import Any, Callable, Optional
 
 from configargparse import ArgParser
 
 from toil import logProcessContext
 from toil.common import Config, Toil, safeUnpickleFromStream
-from toil.cwl.utils import (
-
+from toil.cwl.utils import (
+    CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+)
 from toil.deferred import DeferredFunctionManager
 from toil.fileStores.abstractFileStore import AbstractFileStore
-from toil.job import
+from toil.job import (
+    CheckpointJobDescription,
+    DebugStoppingPointReached,
+    Job,
+    JobDescription,
+)
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.expando import MagicExpando
 from toil.lib.io import make_public_dir
-from toil.lib.resources import
-
-from toil.statsAndLogging import configure_root_logger, set_log_level
+from toil.lib.resources import ResourceMonitor
+from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level
 
 logger = logging.getLogger(__name__)
 
@@ -50,36 +57,55 @@ logger = logging.getLogger(__name__)
 class StatsDict(MagicExpando):
     """Subclass of MagicExpando for type-checking purposes."""
 
-    jobs:
+    jobs: list[MagicExpando]
 
 
-def nextChainable(
+def nextChainable(
+    predecessor: JobDescription, job_store: AbstractJobStore, config: Config
+) -> Optional[JobDescription]:
     """
     Returns the next chainable job's JobDescription after the given predecessor
     JobDescription, if one exists, or None if the chain must terminate.
 
     :param predecessor: The job to chain from
-    :param
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
     """
-    #If no more jobs to run or services not finished, quit
-    if
-
-
+    # If no more jobs to run or services not finished, quit
+    if (
+        predecessor.nextSuccessors() is None
+        or len(predecessor.services) > 0
+        or (
+            isinstance(predecessor, CheckpointJobDescription)
+            and predecessor.checkpoint is not None
+        )
+    ):
+        logger.debug(
+            "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
+            predecessor.nextSuccessors() is None,
+            len(predecessor.services),
+            (
+                isinstance(predecessor, CheckpointJobDescription)
+                and predecessor.checkpoint is not None
+            ),
+        )
         return None
 
-
-
-    jobs = list(predecessor.nextSuccessors())
+    # Get the next set of jobs to run
+    jobs = list(predecessor.nextSuccessors() or set())
     if len(jobs) == 0:
         # If there are no jobs, we might just not have any children.
-        logger.debug(
+        logger.debug(
+            "Stopping running chain of jobs because job has no ready children or follow-ons"
+        )
         return None
 
-    #If there are 2 or more jobs to run in parallel we quit
+    # If there are 2 or more jobs to run in parallel we quit
     if len(jobs) >= 2:
-        logger.debug(
-
+        logger.debug(
+            "No more jobs can run in series by this worker," " it's got %i successors",
+            len(jobs),
+        )
         logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
         return None
 
@@ -89,10 +115,10 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
     logger.debug("%s would chain to ID %s", predecessor, successorID)
 
     # Load the successor JobDescription
-    successor =
+    successor = job_store.load_job(successorID)
 
-    #We check the requirements of the successor to see if we can run it
-    #within the current worker
+    # We check the requirements of the successor to see if we can run it
+    # within the current worker
     if successor.memory > predecessor.memory:
         logger.debug("We need more memory for the next job, so finishing")
         return None
@@ -103,14 +129,20 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("We need more disk for the next job, so finishing")
         return None
     if successor.preemptible != predecessor.preemptible:
-        logger.debug(
+        logger.debug(
+            "Preemptibility is different for the next job, returning to the leader"
+        )
         return None
     if successor.predecessorNumber > 1:
-        logger.debug(
+        logger.debug(
+            "The next job has multiple predecessors; we must return to the leader."
+        )
         return None
 
     if len(successor.services) > 0:
-        logger.debug(
+        logger.debug(
+            "The next job requires services that will not yet be started; we must return to the leader."
+        )
         return None
 
     if isinstance(successor, CheckpointJobDescription):
@@ -118,17 +150,43 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("Next job is checkpoint, so finishing")
         return None
 
+    if (
+        not config.run_local_jobs_on_workers
+        and predecessor.local
+        and not successor.local
+    ):
+        # This job might be running on the leader, but the next job may not.
+        #
+        # TODO: Optimize by detecting whether we actually are on the leader,
+        # somehow.
+        logger.debug("Next job is not allowed to run on the leader, so finishing")
+        return None
+
     # Made it through! This job is chainable.
     return successor
 
-
+
+def workerScript(
+    job_store: AbstractJobStore,
+    config: Config,
+    job_name: str,
+    job_store_id: str,
+    redirect_output_to_log_file: bool = True,
+    local_worker_temp_dir: Optional[str] = None,
+    debug_flags: Optional[set[str]] = None,
+) -> int:
     """
     Worker process script, runs a job.
 
-    :param
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
-    :param
-    :param
+    :param job_name: The "job name" (a user friendly name) of the job to be run
+    :param job_store_id: The job store ID of the job to be run
+    :param redirect_output_to_log_file: If False, log directly to the console
+        instead of capturing job output.
+    :param local_worker_temp_dir: The directory for the worker to work in. May
+        be recursively removed after the job runs.
+    :param debug_flags: Flags to set on each job before running it.
 
     :return int: 1 if a job failed, or 0 if all jobs succeeded
     """
@@ -136,8 +194,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     configure_root_logger()
     set_log_level(config.logLevel)
 
+    if config.colored_logs:
+        install_log_color()
+
+    logger.debug("Worker started for job %s...", job_name)
+
     ##########################################
-    #Create the worker killer, if requested
+    # Create the worker killer, if requested
     ##########################################
 
     logFileByteReportLimit = config.maxLogFileSize
@@ -178,11 +241,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # before it does. Either way, init will have to clean it up for us.
 
     ##########################################
-    #Load the environment for the job
+    # Load the environment for the job
     ##########################################
 
-    #First load the environment for the job.
-    with
+    # First load the environment for the job.
+    with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
         environment = safeUnpickleFromStream(fileHandle)
     env_reject = {
         "TMPDIR",
@@ -199,15 +262,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         "XDG_SESSION_ID",
         "XDG_RUNTIME_DIR",
         "XDG_DATA_DIRS",
-        "DBUS_SESSION_BUS_ADDRESS"
+        "DBUS_SESSION_BUS_ADDRESS",
     }
     for i in environment:
         if i == "PATH":
             # Handle path specially. Sometimes e.g. leader may not include
             # /bin, but the Toil appliance needs it.
-            if i in os.environ and os.environ[i] !=
+            if i in os.environ and os.environ[i] != "":
                 # Use the provided PATH and then the local system's PATH
-                os.environ[i] = environment[i] +
+                os.environ[i] = environment[i] + ":" + os.environ[i]
             else:
                 # Use the provided PATH only
                 os.environ[i] = environment[i]
@@ -215,42 +278,48 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             os.environ[i] = environment[i]
     # sys.path is used by __import__ to find modules
     if "PYTHONPATH" in environment:
-        for e in environment["PYTHONPATH"].split(
-            if e !=
+        for e in environment["PYTHONPATH"].split(":"):
+            if e != "":
                 sys.path.append(e)
 
     ##########################################
-    #Setup the temporary directories.
+    # Setup the temporary directories.
     ##########################################
     # Dir to put all this worker's temp files in.
     if config.workflowID is None:
         raise RuntimeError("The worker workflow ID was never set.")
     toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
     # Dir to put lock files in, ideally not on NFS.
-    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
-
-
+    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
+        config.workflowID, config.workDir, config.coordination_dir
+    )
+    if local_worker_temp_dir is None:
+        # Invent a temp directory to work in
+        local_worker_temp_dir = make_public_dir(toilWorkflowDir)
+    os.chmod(local_worker_temp_dir, 0o755)
 
     ##########################################
-    #Setup the logging
+    # Setup the logging
     ##########################################
 
-    #This is mildly tricky because we don't just want to
-    #redirect stdout and stderr for this Python process; we want to redirect it
-    #for this process and all children. Consequently, we can't just replace
-    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
-    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
+    # This is mildly tricky because we don't just want to
+    # redirect stdout and stderr for this Python process; we want to redirect it
+    # for this process and all children. Consequently, we can't just replace
+    # sys.stdout and sys.stderr; we need to mess with the underlying OS-level
+    # file descriptors. See <http://stackoverflow.com/a/11632982/402891>
 
-    #When we start, standard input is file descriptor 0, standard output is
-    #file descriptor 1, and standard error is file descriptor 2.
+    # When we start, standard input is file descriptor 0, standard output is
+    # file descriptor 1, and standard error is file descriptor 2.
 
     # Do we even want to redirect output? Let the config make us not do it.
-
+    redirect_output_to_log_file = (
+        redirect_output_to_log_file and not config.disableWorkerOutputCapture
+    )
 
-    #What file do we want to point FDs 1 and 2 to?
-    tempWorkerLogPath = os.path.join(
+    # What file do we want to point FDs 1 and 2 to?
+    tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
 
-    if
+    if redirect_output_to_log_file:
         # Announce that we are redirecting logging, and where it will now go.
         # This is only important if we are trying to manually trace a faulty worker invocation.
         logger.debug("Redirecting logging to %s", tempWorkerLogPath)
@@ -287,13 +356,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
     jobAttemptFailed = False
     failure_exit_code = 1
+    first_job_cores = None
     statsDict = StatsDict()  # type: ignore[no-untyped-call]
     statsDict.jobs = []
-    statsDict.workers.
+    statsDict.workers.logs_to_leader = []
+    statsDict.workers.logging_user_streams = []
 
     def blockFn() -> bool:
         return True
-
+
     job = None
     try:
 
@@ -312,18 +383,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         # Load the JobDescription
         ##########################################
 
-        jobDesc =
-        listOfJobs[0] = str(jobDesc)
+        jobDesc = job_store.load_job(job_store_id)
         logger.debug("Parsed job description")
 
         ##########################################
         # Cleanup from any earlier invocation of the job
         ##########################################
 
-        if jobDesc.
+        if not jobDesc.has_body():
             logger.debug("Job description has no body to run.")
             # Cleanup jobs already finished
-            jobDesc.clear_nonexistent_dependents(
+            jobDesc.clear_nonexistent_dependents(job_store)
             logger.debug("Cleaned up any references to completed successor jobs")
 
             # This cleans the old log file which may
@@ -331,14 +401,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             oldLogFile = jobDesc.logJobStoreFileID
             if oldLogFile is not None:
                 jobDesc.logJobStoreFileID = None
-
-
+                job_store.update_job(jobDesc)  # Update first, before deleting any files
+                job_store.delete_file(oldLogFile)
 
         ##########################################
         # If a checkpoint exists, restart from the checkpoint
         ##########################################
 
-        if
+        if (
+            isinstance(jobDesc, CheckpointJobDescription)
+            and jobDesc.checkpoint is not None
+        ):
             # The job is a checkpoint, and is being restarted after previously completing
             logger.debug("Job is a checkpoint")
             # If the checkpoint still has extant successors or services, its
@@ -350,75 +423,106 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 if jobDesc.remainingTryCount < 0:
                     raise RuntimeError("The try count of the job cannot be negative.")
                 jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
-                jobDesc.restartCheckpoint(
+                jobDesc.restartCheckpoint(job_store)
             # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
             # because of the job being a checkpoint
             else:
-                logger.debug(
-
-
+                logger.debug(
+                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
+                )
+                # Delete any remnant files
+                list(
+                    map(
+                        job_store.delete_file,
+                        list(
+                            filter(
+                                job_store.file_exists, jobDesc.checkpointFilesToDelete
+                            )
+                        ),
+                    )
+                )
 
         ##########################################
-        #Setup the stats, if requested
+        # Setup the stats, if requested
         ##########################################
 
         if config.stats:
-
+            # Remember the cores from the first job, which is how many we have reserved for us.
+            statsDict.workers.requested_cores = jobDesc.cores
+            startClock = ResourceMonitor.get_total_cpu_time()
 
         startTime = time.time()
         while True:
             ##########################################
-            #Run the job body, if there is one
+            # Run the job body, if there is one
             ##########################################
 
             logger.info("Working on job %s", jobDesc)
 
-            if jobDesc.
-                if not jobDesc.command.startswith("_toil "):
-                    raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
-                logger.debug("Got a command to run: %s" % jobDesc.command)
+            if jobDesc.has_body():
                 # Load the job. It will use the same JobDescription we have been using.
-                job = Job.loadJob(
+                job = Job.loadJob(job_store, jobDesc)
                 if isinstance(jobDesc, CheckpointJobDescription):
-                    # If it is a checkpoint job,
-                    jobDesc.
+                    # If it is a checkpoint job, set the checkpoint
+                    jobDesc.set_checkpoint()
 
                 logger.info("Loaded body %s from description %s", job, jobDesc)
 
+                if debug_flags:
+                    for flag in debug_flags:
+                        logger.debug("Turning on debug flag %s on job", flag)
+                        job.set_debug_flag(flag)
+
                 # Create a fileStore object for the job
-                fileStore = AbstractFileStore.createFileStore(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                fileStore = AbstractFileStore.createFileStore(
+                    job_store,
+                    jobDesc,
+                    local_worker_temp_dir,
+                    blockFn,
+                    caching=config.caching,
+                )
+                try:
+                    with job._executor(
+                        stats=statsDict if config.stats else None, fileStore=fileStore
+                    ):
+                        with deferredFunctionManager.open() as defer:
+                            with fileStore.open(job):
+                                # Get the next block function to wait on committing this job
+                                blockFn = fileStore.waitForCommit
+
+                                # Run the job, save new successors, and set up
+                                # locally (but don't commit) successor
+                                # relationships and job completion.
+                                # Pass everything as name=value because Cactus
+                                # likes to override _runner when it shouldn't and
+                                # it needs some hope of finding the arguments it
+                                # wants across multiple Toil versions. We also
+                                # still pass a jobGraph argument to placate old
+                                # versions of Cactus.
+                                job._runner(
+                                    jobGraph=None,
+                                    jobStore=job_store,
+                                    fileStore=fileStore,
+                                    defer=defer,
+                                )
+
+                                # When the executor for the job finishes it will
+                                # kick off a commit with the link to the job body
+                                # cut.
+                finally:
+                    # Accumulate messages from this job & any subsequent chained jobs.
+                    # Keep the messages even if the job fails.
+                    statsDict.workers.logs_to_leader += fileStore.logging_messages
+                    statsDict.workers.logging_user_streams += (
+                        fileStore.logging_user_streams
+                    )
 
                 logger.info("Completed body for %s", jobDesc)
 
             else:
-                #The
-                #
-                #been scheduled after a failure to cleanup
+                # The body may not be attached, in which case the
+                # JobDescription is either a shell ready to be deleted or has
+                # been scheduled after a failure to cleanup
                 logger.debug("No user job to run, so finishing")
                 break
 
@@ -426,9 +530,9 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 raise RuntimeError("The termination flag is set")
 
             ##########################################
-            #Establish if we can run another job within the worker
+            # Establish if we can run another job within the worker
             ##########################################
-            successor = nextChainable(jobDesc,
+            successor = nextChainable(jobDesc, job_store, config)
             if successor is None or config.disableChaining:
                 # Can't chain any more jobs. We are going to stop.
 
@@ -449,17 +553,18 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
             # Make sure nothing has gone wrong and we can really chain
             if jobDesc.memory < successor.memory:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's memory cannot be less than it's successor."
+                )
             if jobDesc.cores < successor.cores:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's cores cannot be less than it's successor."
+                )
 
             # Save the successor's original ID, so we can clean it (and its
             # body) up after we finish executing it.
             successorID = successor.jobStoreID
 
-            # add the successor to the list of jobs run
-            listOfJobs.append(str(successor))
-
             # Now we need to become that successor, under the original ID.
             successor.replace(jobDesc)
             jobDesc = successor
@@ -470,8 +575,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
             # Build a fileStore to update the job and commit the replacement.
             # TODO: can we have a commit operation without an entire FileStore???
-            fileStore = AbstractFileStore.createFileStore(
-
+            fileStore = AbstractFileStore.createFileStore(
+                job_store,
+                jobDesc,
+                local_worker_temp_dir,
+                blockFn,
+                caching=config.caching,
+            )
 
             # Update blockFn to wait for that commit operation.
             blockFn = fileStore.waitForCommit
@@ -482,30 +592,70 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             logger.debug("Starting the next job")
 
         ##########################################
-        #Finish up the stats
+        # Finish up the stats
         ##########################################
         if config.stats:
-            totalCPUTime, totalMemoryUsage =
+            totalCPUTime, totalMemoryUsage = (
+                ResourceMonitor.get_total_cpu_time_and_memory_usage()
+            )
             statsDict.workers.time = str(time.time() - startTime)
             statsDict.workers.clock = str(totalCPUTime - startClock)
             statsDict.workers.memory = str(totalMemoryUsage)
+            # Say the worker used the max disk we saw from any job
+            max_bytes = 0
+            for job_stats in statsDict.jobs:
+                if "disk" in job_stats:
+                    max_bytes = max(max_bytes, int(job_stats.disk))
+            statsDict.workers.disk = str(max_bytes)
+            # Count the jobs executed.
+            # TODO: toil stats could compute this but its parser is too general to hook into simply.
+            statsDict.workers.jobs_run = len(statsDict.jobs)
 
         # log the worker log path here so that if the file is truncated the path can still be found
-        if
-            logger.info(
-
-
+        if redirect_output_to_log_file:
+            logger.info(
+                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
+                local_worker_temp_dir,
+            )
+
+        logger.info(
+            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
+            time.time() - startTime,
+        )
 
     ##########################################
-    #Trapping where worker goes wrong
+    # Trapping where worker goes wrong
     ##########################################
-    except
-
-
+    except DebugStoppingPointReached:
+        # Job wants the worker to stop for debugging
+        raise
+    except (
+        BaseException
+    ) as e:  # Case that something goes wrong in worker, or we are asked to stop
+        if not isinstance(e, SystemExit):
+            logger.critical(
+                "Worker crashed with traceback:\n%s", traceback.format_exc()
+            )
+        logger.error(
+            "Exiting the worker because of a failed job on host %s",
+            socket.gethostname(),
+        )
        if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
            # We need to inform the leader that this is a CWL workflow problem
            # and it needs to inform its caller.
            failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+        elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
+            # We're meant to be exiting with a particular code.
+            failure_exit_code = e.code
+        else:
+            try:
+                from WDL.runtime.error import CommandFailed
+
+                if isinstance(e, CommandFailed):
+                    failure_exit_code = e.exit_status
+            except ImportError:
+                # WDL dependency not available
+                pass
         AbstractFileStore._terminateEvent.set()
     finally:
         # Get rid of our deferred function manager now so we can't mistake it
@@ -521,16 +671,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 logger.debug("cwltool.main._terminate_processess exception: %s", (e))
                 raise e
 
-
     ##########################################
-    #Wait for the asynchronous chain of writes/updates to finish
+    # Wait for the asynchronous chain of writes/updates to finish
     ##########################################
 
     blockFn()
 
     ##########################################
-    #All the asynchronous worker/update threads must be finished now,
-    #so safe to test if they completed okay
+    # All the asynchronous worker/update threads must be finished now,
+    # so safe to test if they completed okay
     ##########################################
 
     if AbstractFileStore._terminateEvent.is_set():
@@ -538,19 +687,19 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
         # Clobber any garbage state we have for this job from failing with
         # whatever good state is still stored in the JobStore
-        jobDesc =
+        jobDesc = job_store.load_job(job_store_id)
         # Remember that we failed
         jobAttemptFailed = True
 
     ##########################################
-    #Cleanup
+    # Cleanup
     ##########################################
 
     # Close the worker logging
     # Flush at the Python level
     sys.stdout.flush()
     sys.stderr.flush()
-    if
+    if redirect_output_to_log_file:
         # Flush at the OS level
         os.fsync(1)
         os.fsync(2)
@@ -577,43 +726,66 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # relative to the end (since Python won't decode Unicode backward, or even
     # interpret seek offsets in characters for us). TODO: We may get invalid or
     # just different Unicode by breaking up a character at the boundary!
-    if jobAttemptFailed and
-        jobDesc.logJobStoreFileID = logJobStoreFileID =
+    if jobAttemptFailed and redirect_output_to_log_file:
+        jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
             jobDesc.jobStoreID, cleanup=True
         )
-
-
-
-            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
+        with job_store.update_file_stream(logJobStoreFileID) as w:
+            with open(tempWorkerLogPath, "rb") as f:
+                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                     if logFileByteReportLimit > 0:
-                        f.seek(
+                        f.seek(
+                            -logFileByteReportLimit, 2
+                        )  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
-                        f.seek(
+                        f.seek(
+                            logFileByteReportLimit, 0
+                        )  # seek to first tooBig bytes of file
                # Dump the possibly-invalid-Unicode bytes into the log file
-                w.write(f.read())
+                w.write(f.read())  # TODO load file using a buffer
         # Commit log file reference back to JobStore
-
+        job_store.update_job(jobDesc)
 
-    elif (
-
-
+    elif (
+        debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
+    ) and redirect_output_to_log_file:  # write log messages
+        with open(tempWorkerLogPath, "rb") as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
-                    logFile.seek(
+                    logFile.seek(
+                        -logFileByteReportLimit, 2
+                    )  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
-                    logFile.seek(
+                    logFile.seek(
+                        logFileByteReportLimit, 0
+                    )  # seek to first tooBig bytes of file
            # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
            # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
-            logMessages = [
-
+            logMessages = [
+                line.decode("utf-8", "skip") for line in logFile.read().splitlines()
+            ]
+        statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
        statsDict.logs.messages = logMessages
 
-    if (
-
+    if (
+        debugging
+        or config.stats
+        or statsDict.workers.logs_to_leader
+        or statsDict.workers.logging_user_streams
+    ):
+        # We have stats/logging to report back.
+        # We report even if the job attempt failed.
+        # TODO: Will that upset analysis of the stats?
+        job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
 
     # Remove the temp dir
     cleanUp = config.cleanWorkDir
-    if
+    if (
+        cleanUp == "always"
+        or (cleanUp == "onSuccess" and not jobAttemptFailed)
+        or (cleanUp == "onError" and jobAttemptFailed)
+    ):
+
        def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
            """
            When encountering an error removing a file or directory, make sure
@@ -624,24 +796,32 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
            """
            # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
            try:
-                os.chmod(
+                os.chmod(
+                    os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+                )
            except PermissionError as e:
-                logger.error(
-
+                logger.error(
+                    "Could not set permissions on %s to allow cleanup of %s: %s",
+                    os.path.dirname(path),
+                    path,
+                    e,
+                )
+
+        shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
 
     # This must happen after the log file is done with, else there is no place to put the log
     if (not jobAttemptFailed) and jobDesc.is_subtree_done():
-
-
-
-        jobStore.delete_job(str(jobDesc.jobStoreID))
+        for merged_in in jobDesc.get_chain():
+            # We can now safely get rid of the JobDescription, and all jobs it chained up
+            job_store.delete_job(merged_in.job_store_id)
 
     if jobAttemptFailed:
        return failure_exit_code
    else:
        return 0
 
-
+
+def parse_args(args: list[str]) -> Any:
     """
     Parse command-line arguments to the worker.
     """
@@ -655,26 +835,33 @@ def parse_args(args: List[str]) -> Any:
    # Now add all the options to it
 
    # Base required job information
-    parser.add_argument("jobName", type=str,
-
-
-
-
-
+    parser.add_argument("jobName", type=str, help="Text name of the job being run")
+    parser.add_argument(
+        "jobStoreLocator",
+        type=str,
+        help="Information required to connect to the job store",
+    )
+    parser.add_argument(
+        "jobStoreID", type=str, help="ID of the job within the job store"
+    )
 
    # Additional worker abilities
-    parser.add_argument(
+    parser.add_argument(
+        "--context",
+        default=[],
+        action="append",
        help="""Pickled, base64-encoded context manager(s) to run job inside of.
        Allows the Toil leader to pass setup and cleanup work provided by the
        batch system, in the form of pickled Python context manager objects,
        that the worker can then run before/after the job on the batch
-        system's behalf."""
+        system's behalf.""",
+    )
 
    return parser.parse_args(args)
 
 
 @contextmanager
-def in_contexts(contexts:
+def in_contexts(contexts: list[str]) -> Iterator[None]:
     """
     Unpickle and enter all the pickled, base64-encoded context managers in the
     given list. Then do the body, then leave them all.
@@ -688,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
    rest = contexts[1:]
 
    try:
-        manager = pickle.loads(base64.b64decode(first.encode(
+        manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
    except:
        exc_info = sys.exc_info()
-        logger.error(
+        logger.error(
+            "Exception while unpickling context manager: ", exc_info=exc_info
+        )
        raise
 
    with manager:
@@ -701,28 +890,22 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
        yield
 
 
-def main(argv: Optional[
+def main(argv: Optional[list[str]] = None) -> None:
    if argv is None:
        argv = sys.argv
-
    # Parse our command line
    options = parse_args(argv)
 
-    # Parse input args
-    jobName = argv[1]
-    jobStoreLocator = argv[2]
-    jobStoreID = argv[3]
-
    ##########################################
-    #Load the jobStore/config file
+    # Load the jobStore/config file
    ##########################################
 
-
-    config =
+    job_store = Toil.resumeJobStore(options.jobStoreLocator)
+    config = job_store.config
 
    with in_contexts(options.context):
        # Call the worker
-        exit_code = workerScript(
+        exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)
 
    # Exit with its return value
    sys.exit(exit_code)
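
The rewritten main() above resumes the job store from the jobStoreLocator argument and hands the store object and its config straight to workerScript, instead of indexing sys.argv by hand. A minimal sketch of driving that entry point programmatically, mirroring main(); the locator, job name, and job store ID below are hypothetical placeholders, not values from this diff:

# Minimal sketch of invoking the toil 8.0.0 worker entry point by hand.
# "file:my-jobstore", "my-job", and "example-job-id" are placeholders.
from toil.common import Toil
from toil.worker import workerScript

job_store = Toil.resumeJobStore("file:my-jobstore")  # resume an existing workflow's store
config = job_store.config

exit_code = workerScript(
    job_store,
    config,
    "my-job",          # user-facing job name
    "example-job-id",  # ID of the job within the job store
    redirect_output_to_log_file=False,  # optional flag added in this diff: log to the console
)

The optional local_worker_temp_dir and debug_flags parameters added in the same change let a caller pin the worker's scratch directory and set per-job debug flags before the body runs.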