toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
|
@@ -25,23 +25,31 @@ import stat
|
|
|
25
25
|
import sys
|
|
26
26
|
import time
|
|
27
27
|
import traceback
|
|
28
|
+
from collections.abc import Iterator
|
|
28
29
|
from contextlib import contextmanager
|
|
29
|
-
from typing import Any, Callable,
|
|
30
|
+
from typing import Any, Callable, Optional
|
|
30
31
|
|
|
31
32
|
from configargparse import ArgParser
|
|
32
33
|
|
|
33
34
|
from toil import logProcessContext
|
|
34
35
|
from toil.common import Config, Toil, safeUnpickleFromStream
|
|
35
|
-
from toil.cwl.utils import (
|
|
36
|
-
|
|
36
|
+
from toil.cwl.utils import (
|
|
37
|
+
CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
|
|
38
|
+
CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
|
|
39
|
+
)
|
|
37
40
|
from toil.deferred import DeferredFunctionManager
|
|
38
41
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
39
|
-
from toil.job import
|
|
42
|
+
from toil.job import (
|
|
43
|
+
CheckpointJobDescription,
|
|
44
|
+
DebugStoppingPointReached,
|
|
45
|
+
Job,
|
|
46
|
+
JobDescription,
|
|
47
|
+
)
|
|
40
48
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
41
49
|
from toil.lib.expando import MagicExpando
|
|
42
50
|
from toil.lib.io import make_public_dir
|
|
43
51
|
from toil.lib.resources import ResourceMonitor
|
|
44
|
-
from toil.statsAndLogging import configure_root_logger,
|
|
52
|
+
from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level
|
|
45
53
|
|
|
46
54
|
logger = logging.getLogger(__name__)
|
|
47
55
|
|
|
@@ -49,10 +57,12 @@ logger = logging.getLogger(__name__)
|
|
|
49
57
|
class StatsDict(MagicExpando):
|
|
50
58
|
"""Subclass of MagicExpando for type-checking purposes."""
|
|
51
59
|
|
|
52
|
-
jobs:
|
|
60
|
+
jobs: list[MagicExpando]
|
|
53
61
|
|
|
54
62
|
|
|
55
|
-
def nextChainable(
|
|
63
|
+
def nextChainable(
|
|
64
|
+
predecessor: JobDescription, job_store: AbstractJobStore, config: Config
|
|
65
|
+
) -> Optional[JobDescription]:
|
|
56
66
|
"""
|
|
57
67
|
Returns the next chainable job's JobDescription after the given predecessor
|
|
58
68
|
JobDescription, if one exists, or None if the chain must terminate.
|
|
@@ -61,24 +71,41 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
|
|
|
61
71
|
:param job_store: The JobStore to fetch JobDescriptions from.
|
|
62
72
|
:param config: The configuration for the current run.
|
|
63
73
|
"""
|
|
64
|
-
#If no more jobs to run or services not finished, quit
|
|
65
|
-
if
|
|
66
|
-
|
|
67
|
-
|
|
74
|
+
# If no more jobs to run or services not finished, quit
|
|
75
|
+
if (
|
|
76
|
+
predecessor.nextSuccessors() is None
|
|
77
|
+
or len(predecessor.services) > 0
|
|
78
|
+
or (
|
|
79
|
+
isinstance(predecessor, CheckpointJobDescription)
|
|
80
|
+
and predecessor.checkpoint is not None
|
|
81
|
+
)
|
|
82
|
+
):
|
|
83
|
+
logger.debug(
|
|
84
|
+
"Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
|
|
85
|
+
predecessor.nextSuccessors() is None,
|
|
86
|
+
len(predecessor.services),
|
|
87
|
+
(
|
|
88
|
+
isinstance(predecessor, CheckpointJobDescription)
|
|
89
|
+
and predecessor.checkpoint is not None
|
|
90
|
+
),
|
|
91
|
+
)
|
|
68
92
|
return None
|
|
69
93
|
|
|
70
|
-
|
|
71
|
-
#Get the next set of jobs to run
|
|
94
|
+
# Get the next set of jobs to run
|
|
72
95
|
jobs = list(predecessor.nextSuccessors() or set())
|
|
73
96
|
if len(jobs) == 0:
|
|
74
97
|
# If there are no jobs, we might just not have any children.
|
|
75
|
-
logger.debug(
|
|
98
|
+
logger.debug(
|
|
99
|
+
"Stopping running chain of jobs because job has no ready children or follow-ons"
|
|
100
|
+
)
|
|
76
101
|
return None
|
|
77
102
|
|
|
78
|
-
#If there are 2 or more jobs to run in parallel we quit
|
|
103
|
+
# If there are 2 or more jobs to run in parallel we quit
|
|
79
104
|
if len(jobs) >= 2:
|
|
80
|
-
logger.debug(
|
|
81
|
-
|
|
105
|
+
logger.debug(
|
|
106
|
+
"No more jobs can run in series by this worker," " it's got %i successors",
|
|
107
|
+
len(jobs),
|
|
108
|
+
)
|
|
82
109
|
logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
|
|
83
110
|
return None
|
|
84
111
|
|
|
@@ -90,8 +117,8 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
|
|
|
90
117
|
# Load the successor JobDescription
|
|
91
118
|
successor = job_store.load_job(successorID)
|
|
92
119
|
|
|
93
|
-
#We check the requirements of the successor to see if we can run it
|
|
94
|
-
#within the current worker
|
|
120
|
+
# We check the requirements of the successor to see if we can run it
|
|
121
|
+
# within the current worker
|
|
95
122
|
if successor.memory > predecessor.memory:
|
|
96
123
|
logger.debug("We need more memory for the next job, so finishing")
|
|
97
124
|
return None
|
|
@@ -102,14 +129,20 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
|
|
|
102
129
|
logger.debug("We need more disk for the next job, so finishing")
|
|
103
130
|
return None
|
|
104
131
|
if successor.preemptible != predecessor.preemptible:
|
|
105
|
-
logger.debug(
|
|
132
|
+
logger.debug(
|
|
133
|
+
"Preemptibility is different for the next job, returning to the leader"
|
|
134
|
+
)
|
|
106
135
|
return None
|
|
107
136
|
if successor.predecessorNumber > 1:
|
|
108
|
-
logger.debug(
|
|
137
|
+
logger.debug(
|
|
138
|
+
"The next job has multiple predecessors; we must return to the leader."
|
|
139
|
+
)
|
|
109
140
|
return None
|
|
110
141
|
|
|
111
142
|
if len(successor.services) > 0:
|
|
112
|
-
logger.debug(
|
|
143
|
+
logger.debug(
|
|
144
|
+
"The next job requires services that will not yet be started; we must return to the leader."
|
|
145
|
+
)
|
|
113
146
|
return None
|
|
114
147
|
|
|
115
148
|
if isinstance(successor, CheckpointJobDescription):
|
|
@@ -117,7 +150,11 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
|
|
|
117
150
|
logger.debug("Next job is checkpoint, so finishing")
|
|
118
151
|
return None
|
|
119
152
|
|
|
120
|
-
if
|
|
153
|
+
if (
|
|
154
|
+
not config.run_local_jobs_on_workers
|
|
155
|
+
and predecessor.local
|
|
156
|
+
and not successor.local
|
|
157
|
+
):
|
|
121
158
|
# This job might be running on the leader, but the next job may not.
|
|
122
159
|
#
|
|
123
160
|
# TODO: Optimize by detecting whether we actually are on the leader,
|
|
@@ -128,6 +165,7 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
|
|
|
128
165
|
# Made it through! This job is chainable.
|
|
129
166
|
return successor
|
|
130
167
|
|
|
168
|
+
|
|
131
169
|
def workerScript(
|
|
132
170
|
job_store: AbstractJobStore,
|
|
133
171
|
config: Config,
|
|
@@ -135,7 +173,7 @@ def workerScript(
|
|
|
135
173
|
job_store_id: str,
|
|
136
174
|
redirect_output_to_log_file: bool = True,
|
|
137
175
|
local_worker_temp_dir: Optional[str] = None,
|
|
138
|
-
debug_flags: Optional[
|
|
176
|
+
debug_flags: Optional[set[str]] = None,
|
|
139
177
|
) -> int:
|
|
140
178
|
"""
|
|
141
179
|
Worker process script, runs a job.
|
|
@@ -162,7 +200,7 @@ def workerScript(
|
|
|
162
200
|
logger.debug("Worker started for job %s...", job_name)
|
|
163
201
|
|
|
164
202
|
##########################################
|
|
165
|
-
#Create the worker killer, if requested
|
|
203
|
+
# Create the worker killer, if requested
|
|
166
204
|
##########################################
|
|
167
205
|
|
|
168
206
|
logFileByteReportLimit = config.maxLogFileSize
|
|
@@ -203,10 +241,10 @@ def workerScript(
|
|
|
203
241
|
# before it does. Either way, init will have to clean it up for us.
|
|
204
242
|
|
|
205
243
|
##########################################
|
|
206
|
-
#Load the environment for the job
|
|
244
|
+
# Load the environment for the job
|
|
207
245
|
##########################################
|
|
208
246
|
|
|
209
|
-
#First load the environment for the job.
|
|
247
|
+
# First load the environment for the job.
|
|
210
248
|
with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
|
|
211
249
|
environment = safeUnpickleFromStream(fileHandle)
|
|
212
250
|
env_reject = {
|
|
@@ -224,15 +262,15 @@ def workerScript(
|
|
|
224
262
|
"XDG_SESSION_ID",
|
|
225
263
|
"XDG_RUNTIME_DIR",
|
|
226
264
|
"XDG_DATA_DIRS",
|
|
227
|
-
"DBUS_SESSION_BUS_ADDRESS"
|
|
265
|
+
"DBUS_SESSION_BUS_ADDRESS",
|
|
228
266
|
}
|
|
229
267
|
for i in environment:
|
|
230
268
|
if i == "PATH":
|
|
231
269
|
# Handle path specially. Sometimes e.g. leader may not include
|
|
232
270
|
# /bin, but the Toil appliance needs it.
|
|
233
|
-
if i in os.environ and os.environ[i] !=
|
|
271
|
+
if i in os.environ and os.environ[i] != "":
|
|
234
272
|
# Use the provided PATH and then the local system's PATH
|
|
235
|
-
os.environ[i] = environment[i] +
|
|
273
|
+
os.environ[i] = environment[i] + ":" + os.environ[i]
|
|
236
274
|
else:
|
|
237
275
|
# Use the provided PATH only
|
|
238
276
|
os.environ[i] = environment[i]
|
|
@@ -240,41 +278,45 @@ def workerScript(
|
|
|
240
278
|
os.environ[i] = environment[i]
|
|
241
279
|
# sys.path is used by __import__ to find modules
|
|
242
280
|
if "PYTHONPATH" in environment:
|
|
243
|
-
for e in environment["PYTHONPATH"].split(
|
|
244
|
-
if e !=
|
|
281
|
+
for e in environment["PYTHONPATH"].split(":"):
|
|
282
|
+
if e != "":
|
|
245
283
|
sys.path.append(e)
|
|
246
284
|
|
|
247
285
|
##########################################
|
|
248
|
-
#Setup the temporary directories.
|
|
286
|
+
# Setup the temporary directories.
|
|
249
287
|
##########################################
|
|
250
288
|
# Dir to put all this worker's temp files in.
|
|
251
289
|
if config.workflowID is None:
|
|
252
290
|
raise RuntimeError("The worker workflow ID was never set.")
|
|
253
291
|
toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
|
|
254
292
|
# Dir to put lock files in, ideally not on NFS.
|
|
255
|
-
toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
293
|
+
toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
294
|
+
config.workflowID, config.workDir, config.coordination_dir
|
|
295
|
+
)
|
|
256
296
|
if local_worker_temp_dir is None:
|
|
257
297
|
# Invent a temp directory to work in
|
|
258
298
|
local_worker_temp_dir = make_public_dir(toilWorkflowDir)
|
|
259
299
|
os.chmod(local_worker_temp_dir, 0o755)
|
|
260
300
|
|
|
261
301
|
##########################################
|
|
262
|
-
#Setup the logging
|
|
302
|
+
# Setup the logging
|
|
263
303
|
##########################################
|
|
264
304
|
|
|
265
|
-
#This is mildly tricky because we don't just want to
|
|
266
|
-
#redirect stdout and stderr for this Python process; we want to redirect it
|
|
267
|
-
#for this process and all children. Consequently, we can't just replace
|
|
268
|
-
#sys.stdout and sys.stderr; we need to mess with the underlying OS-level
|
|
269
|
-
#file descriptors. See <http://stackoverflow.com/a/11632982/402891>
|
|
305
|
+
# This is mildly tricky because we don't just want to
|
|
306
|
+
# redirect stdout and stderr for this Python process; we want to redirect it
|
|
307
|
+
# for this process and all children. Consequently, we can't just replace
|
|
308
|
+
# sys.stdout and sys.stderr; we need to mess with the underlying OS-level
|
|
309
|
+
# file descriptors. See <http://stackoverflow.com/a/11632982/402891>
|
|
270
310
|
|
|
271
|
-
#When we start, standard input is file descriptor 0, standard output is
|
|
272
|
-
#file descriptor 1, and standard error is file descriptor 2.
|
|
311
|
+
# When we start, standard input is file descriptor 0, standard output is
|
|
312
|
+
# file descriptor 1, and standard error is file descriptor 2.
|
|
273
313
|
|
|
274
314
|
# Do we even want to redirect output? Let the config make us not do it.
|
|
275
|
-
redirect_output_to_log_file =
|
|
315
|
+
redirect_output_to_log_file = (
|
|
316
|
+
redirect_output_to_log_file and not config.disableWorkerOutputCapture
|
|
317
|
+
)
|
|
276
318
|
|
|
277
|
-
#What file do we want to point FDs 1 and 2 to?
|
|
319
|
+
# What file do we want to point FDs 1 and 2 to?
|
|
278
320
|
tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
|
|
279
321
|
|
|
280
322
|
if redirect_output_to_log_file:
|
|
@@ -322,6 +364,7 @@ def workerScript(
|
|
|
322
364
|
|
|
323
365
|
def blockFn() -> bool:
|
|
324
366
|
return True
|
|
367
|
+
|
|
325
368
|
job = None
|
|
326
369
|
try:
|
|
327
370
|
|
|
@@ -365,7 +408,10 @@ def workerScript(
|
|
|
365
408
|
# If a checkpoint exists, restart from the checkpoint
|
|
366
409
|
##########################################
|
|
367
410
|
|
|
368
|
-
if
|
|
411
|
+
if (
|
|
412
|
+
isinstance(jobDesc, CheckpointJobDescription)
|
|
413
|
+
and jobDesc.checkpoint is not None
|
|
414
|
+
):
|
|
369
415
|
# The job is a checkpoint, and is being restarted after previously completing
|
|
370
416
|
logger.debug("Job is a checkpoint")
|
|
371
417
|
# If the checkpoint still has extant successors or services, its
|
|
@@ -381,12 +427,23 @@ def workerScript(
|
|
|
381
427
|
# Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
|
|
382
428
|
# because of the job being a checkpoint
|
|
383
429
|
else:
|
|
384
|
-
logger.debug(
|
|
385
|
-
|
|
386
|
-
|
|
430
|
+
logger.debug(
|
|
431
|
+
"The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
|
|
432
|
+
)
|
|
433
|
+
# Delete any remnant files
|
|
434
|
+
list(
|
|
435
|
+
map(
|
|
436
|
+
job_store.delete_file,
|
|
437
|
+
list(
|
|
438
|
+
filter(
|
|
439
|
+
job_store.file_exists, jobDesc.checkpointFilesToDelete
|
|
440
|
+
)
|
|
441
|
+
),
|
|
442
|
+
)
|
|
443
|
+
)
|
|
387
444
|
|
|
388
445
|
##########################################
|
|
389
|
-
#Setup the stats, if requested
|
|
446
|
+
# Setup the stats, if requested
|
|
390
447
|
##########################################
|
|
391
448
|
|
|
392
449
|
if config.stats:
|
|
@@ -397,7 +454,7 @@ def workerScript(
|
|
|
397
454
|
startTime = time.time()
|
|
398
455
|
while True:
|
|
399
456
|
##########################################
|
|
400
|
-
#Run the job body, if there is one
|
|
457
|
+
# Run the job body, if there is one
|
|
401
458
|
##########################################
|
|
402
459
|
|
|
403
460
|
logger.info("Working on job %s", jobDesc)
|
|
@@ -417,33 +474,48 @@ def workerScript(
|
|
|
417
474
|
job.set_debug_flag(flag)
|
|
418
475
|
|
|
419
476
|
# Create a fileStore object for the job
|
|
420
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
477
|
+
fileStore = AbstractFileStore.createFileStore(
|
|
478
|
+
job_store,
|
|
479
|
+
jobDesc,
|
|
480
|
+
local_worker_temp_dir,
|
|
481
|
+
blockFn,
|
|
482
|
+
caching=config.caching,
|
|
483
|
+
)
|
|
484
|
+
try:
|
|
485
|
+
with job._executor(
|
|
486
|
+
stats=statsDict if config.stats else None, fileStore=fileStore
|
|
487
|
+
):
|
|
488
|
+
with deferredFunctionManager.open() as defer:
|
|
489
|
+
with fileStore.open(job):
|
|
490
|
+
# Get the next block function to wait on committing this job
|
|
491
|
+
blockFn = fileStore.waitForCommit
|
|
492
|
+
|
|
493
|
+
# Run the job, save new successors, and set up
|
|
494
|
+
# locally (but don't commit) successor
|
|
495
|
+
# relationships and job completion.
|
|
496
|
+
# Pass everything as name=value because Cactus
|
|
497
|
+
# likes to override _runner when it shouldn't and
|
|
498
|
+
# it needs some hope of finding the arguments it
|
|
499
|
+
# wants across multiple Toil versions. We also
|
|
500
|
+
# still pass a jobGraph argument to placate old
|
|
501
|
+
# versions of Cactus.
|
|
502
|
+
job._runner(
|
|
503
|
+
jobGraph=None,
|
|
504
|
+
jobStore=job_store,
|
|
505
|
+
fileStore=fileStore,
|
|
506
|
+
defer=defer,
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
# When the executor for the job finishes it will
|
|
510
|
+
# kick off a commit with the link to the job body
|
|
511
|
+
# cut.
|
|
512
|
+
finally:
|
|
513
|
+
# Accumulate messages from this job & any subsequent chained jobs.
|
|
514
|
+
# Keep the messages even if the job fails.
|
|
515
|
+
statsDict.workers.logs_to_leader += fileStore.logging_messages
|
|
516
|
+
statsDict.workers.logging_user_streams += (
|
|
517
|
+
fileStore.logging_user_streams
|
|
518
|
+
)
|
|
447
519
|
|
|
448
520
|
logger.info("Completed body for %s", jobDesc)
|
|
449
521
|
|
|
@@ -458,7 +530,7 @@ def workerScript(
|
|
|
458
530
|
raise RuntimeError("The termination flag is set")
|
|
459
531
|
|
|
460
532
|
##########################################
|
|
461
|
-
#Establish if we can run another job within the worker
|
|
533
|
+
# Establish if we can run another job within the worker
|
|
462
534
|
##########################################
|
|
463
535
|
successor = nextChainable(jobDesc, job_store, config)
|
|
464
536
|
if successor is None or config.disableChaining:
|
|
@@ -481,9 +553,13 @@ def workerScript(
|
|
|
481
553
|
|
|
482
554
|
# Make sure nothing has gone wrong and we can really chain
|
|
483
555
|
if jobDesc.memory < successor.memory:
|
|
484
|
-
raise RuntimeError(
|
|
556
|
+
raise RuntimeError(
|
|
557
|
+
"Cannot chain jobs. A job's memory cannot be less than it's successor."
|
|
558
|
+
)
|
|
485
559
|
if jobDesc.cores < successor.cores:
|
|
486
|
-
raise RuntimeError(
|
|
560
|
+
raise RuntimeError(
|
|
561
|
+
"Cannot chain jobs. A job's cores cannot be less than it's successor."
|
|
562
|
+
)
|
|
487
563
|
|
|
488
564
|
# Save the successor's original ID, so we can clean it (and its
|
|
489
565
|
# body) up after we finish executing it.
|
|
@@ -499,8 +575,13 @@ def workerScript(
|
|
|
499
575
|
|
|
500
576
|
# Build a fileStore to update the job and commit the replacement.
|
|
501
577
|
# TODO: can we have a commit operation without an entire FileStore???
|
|
502
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
503
|
-
|
|
578
|
+
fileStore = AbstractFileStore.createFileStore(
|
|
579
|
+
job_store,
|
|
580
|
+
jobDesc,
|
|
581
|
+
local_worker_temp_dir,
|
|
582
|
+
blockFn,
|
|
583
|
+
caching=config.caching,
|
|
584
|
+
)
|
|
504
585
|
|
|
505
586
|
# Update blockFn to wait for that commit operation.
|
|
506
587
|
blockFn = fileStore.waitForCommit
|
|
@@ -511,10 +592,12 @@ def workerScript(
|
|
|
511
592
|
logger.debug("Starting the next job")
|
|
512
593
|
|
|
513
594
|
##########################################
|
|
514
|
-
#Finish up the stats
|
|
595
|
+
# Finish up the stats
|
|
515
596
|
##########################################
|
|
516
597
|
if config.stats:
|
|
517
|
-
totalCPUTime, totalMemoryUsage =
|
|
598
|
+
totalCPUTime, totalMemoryUsage = (
|
|
599
|
+
ResourceMonitor.get_total_cpu_time_and_memory_usage()
|
|
600
|
+
)
|
|
518
601
|
statsDict.workers.time = str(time.time() - startTime)
|
|
519
602
|
statsDict.workers.clock = str(totalCPUTime - startClock)
|
|
520
603
|
statsDict.workers.memory = str(totalMemoryUsage)
|
|
@@ -526,25 +609,37 @@ def workerScript(
|
|
|
526
609
|
statsDict.workers.disk = str(max_bytes)
|
|
527
610
|
# Count the jobs executed.
|
|
528
611
|
# TODO: toil stats could compute this but its parser is too general to hook into simply.
|
|
529
|
-
statsDict.workers.jobs_run
|
|
530
|
-
|
|
612
|
+
statsDict.workers.jobs_run = len(statsDict.jobs)
|
|
531
613
|
|
|
532
614
|
# log the worker log path here so that if the file is truncated the path can still be found
|
|
533
615
|
if redirect_output_to_log_file:
|
|
534
|
-
logger.info(
|
|
535
|
-
|
|
536
|
-
|
|
616
|
+
logger.info(
|
|
617
|
+
"Worker log can be found at %s. Set --cleanWorkDir to retain this log",
|
|
618
|
+
local_worker_temp_dir,
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
logger.info(
|
|
622
|
+
"Finished running the chain of jobs on this node, we ran for a total of %f seconds",
|
|
623
|
+
time.time() - startTime,
|
|
624
|
+
)
|
|
537
625
|
|
|
538
626
|
##########################################
|
|
539
|
-
#Trapping where worker goes wrong
|
|
627
|
+
# Trapping where worker goes wrong
|
|
540
628
|
##########################################
|
|
541
629
|
except DebugStoppingPointReached:
|
|
542
630
|
# Job wants the worker to stop for debugging
|
|
543
631
|
raise
|
|
544
|
-
except
|
|
632
|
+
except (
|
|
633
|
+
BaseException
|
|
634
|
+
) as e: # Case that something goes wrong in worker, or we are asked to stop
|
|
545
635
|
if not isinstance(e, SystemExit):
|
|
546
|
-
logger.critical(
|
|
547
|
-
|
|
636
|
+
logger.critical(
|
|
637
|
+
"Worker crashed with traceback:\n%s", traceback.format_exc()
|
|
638
|
+
)
|
|
639
|
+
logger.error(
|
|
640
|
+
"Exiting the worker because of a failed job on host %s",
|
|
641
|
+
socket.gethostname(),
|
|
642
|
+
)
|
|
548
643
|
if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
|
|
549
644
|
# We need to inform the leader that this is a CWL workflow problem
|
|
550
645
|
# and it needs to inform its caller.
|
|
@@ -552,6 +647,15 @@ def workerScript(
|
|
|
552
647
|
elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
|
|
553
648
|
# We're meant to be exiting with a particular code.
|
|
554
649
|
failure_exit_code = e.code
|
|
650
|
+
else:
|
|
651
|
+
try:
|
|
652
|
+
from WDL.runtime.error import CommandFailed
|
|
653
|
+
|
|
654
|
+
if isinstance(e, CommandFailed):
|
|
655
|
+
failure_exit_code = e.exit_status
|
|
656
|
+
except ImportError:
|
|
657
|
+
# WDL dependency not available
|
|
658
|
+
pass
|
|
555
659
|
AbstractFileStore._terminateEvent.set()
|
|
556
660
|
finally:
|
|
557
661
|
# Get rid of our deferred function manager now so we can't mistake it
|
|
@@ -567,16 +671,15 @@ def workerScript(
|
|
|
567
671
|
logger.debug("cwltool.main._terminate_processess exception: %s", (e))
|
|
568
672
|
raise e
|
|
569
673
|
|
|
570
|
-
|
|
571
674
|
##########################################
|
|
572
|
-
#Wait for the asynchronous chain of writes/updates to finish
|
|
675
|
+
# Wait for the asynchronous chain of writes/updates to finish
|
|
573
676
|
##########################################
|
|
574
677
|
|
|
575
678
|
blockFn()
|
|
576
679
|
|
|
577
680
|
##########################################
|
|
578
|
-
#All the asynchronous worker/update threads must be finished now,
|
|
579
|
-
#so safe to test if they completed okay
|
|
681
|
+
# All the asynchronous worker/update threads must be finished now,
|
|
682
|
+
# so safe to test if they completed okay
|
|
580
683
|
##########################################
|
|
581
684
|
|
|
582
685
|
if AbstractFileStore._terminateEvent.is_set():
|
|
@@ -589,7 +692,7 @@ def workerScript(
|
|
|
589
692
|
jobAttemptFailed = True
|
|
590
693
|
|
|
591
694
|
##########################################
|
|
592
|
-
#Cleanup
|
|
695
|
+
# Cleanup
|
|
593
696
|
##########################################
|
|
594
697
|
|
|
595
698
|
# Close the worker logging
|
|
@@ -628,32 +731,48 @@ def workerScript(
|
|
|
628
731
|
jobDesc.jobStoreID, cleanup=True
|
|
629
732
|
)
|
|
630
733
|
with job_store.update_file_stream(logJobStoreFileID) as w:
|
|
631
|
-
with open(tempWorkerLogPath,
|
|
632
|
-
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
|
|
734
|
+
with open(tempWorkerLogPath, "rb") as f:
|
|
735
|
+
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
|
|
633
736
|
if logFileByteReportLimit > 0:
|
|
634
|
-
f.seek(
|
|
737
|
+
f.seek(
|
|
738
|
+
-logFileByteReportLimit, 2
|
|
739
|
+
) # seek to last tooBig bytes of file
|
|
635
740
|
elif logFileByteReportLimit < 0:
|
|
636
|
-
f.seek(
|
|
741
|
+
f.seek(
|
|
742
|
+
logFileByteReportLimit, 0
|
|
743
|
+
) # seek to first tooBig bytes of file
|
|
637
744
|
# Dump the possibly-invalid-Unicode bytes into the log file
|
|
638
|
-
w.write(f.read())
|
|
745
|
+
w.write(f.read()) # TODO load file using a buffer
|
|
639
746
|
# Commit log file reference back to JobStore
|
|
640
747
|
job_store.update_job(jobDesc)
|
|
641
748
|
|
|
642
|
-
elif (
|
|
643
|
-
|
|
644
|
-
|
|
749
|
+
elif (
|
|
750
|
+
debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
|
|
751
|
+
) and redirect_output_to_log_file: # write log messages
|
|
752
|
+
with open(tempWorkerLogPath, "rb") as logFile:
|
|
645
753
|
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
|
|
646
754
|
if logFileByteReportLimit > 0:
|
|
647
|
-
logFile.seek(
|
|
755
|
+
logFile.seek(
|
|
756
|
+
-logFileByteReportLimit, 2
|
|
757
|
+
) # seek to last tooBig bytes of file
|
|
648
758
|
elif logFileByteReportLimit < 0:
|
|
649
|
-
logFile.seek(
|
|
759
|
+
logFile.seek(
|
|
760
|
+
logFileByteReportLimit, 0
|
|
761
|
+
) # seek to first tooBig bytes of file
|
|
650
762
|
# Make sure lines are Unicode so they can be JSON serialized as part of the dict.
|
|
651
763
|
# We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
|
|
652
|
-
logMessages = [
|
|
764
|
+
logMessages = [
|
|
765
|
+
line.decode("utf-8", "skip") for line in logFile.read().splitlines()
|
|
766
|
+
]
|
|
653
767
|
statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
|
|
654
768
|
statsDict.logs.messages = logMessages
|
|
655
769
|
|
|
656
|
-
if
|
|
770
|
+
if (
|
|
771
|
+
debugging
|
|
772
|
+
or config.stats
|
|
773
|
+
or statsDict.workers.logs_to_leader
|
|
774
|
+
or statsDict.workers.logging_user_streams
|
|
775
|
+
):
|
|
657
776
|
# We have stats/logging to report back.
|
|
658
777
|
# We report even if the job attempt failed.
|
|
659
778
|
# TODO: Will that upset analysis of the stats?
|
|
@@ -661,7 +780,12 @@ def workerScript(
|
|
|
661
780
|
|
|
662
781
|
# Remove the temp dir
|
|
663
782
|
cleanUp = config.cleanWorkDir
|
|
664
|
-
if
|
|
783
|
+
if (
|
|
784
|
+
cleanUp == "always"
|
|
785
|
+
or (cleanUp == "onSuccess" and not jobAttemptFailed)
|
|
786
|
+
or (cleanUp == "onError" and jobAttemptFailed)
|
|
787
|
+
):
|
|
788
|
+
|
|
665
789
|
def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
|
|
666
790
|
"""
|
|
667
791
|
When encountering an error removing a file or directory, make sure
|
|
@@ -672,9 +796,17 @@ def workerScript(
|
|
|
672
796
|
"""
|
|
673
797
|
# Just chmod it for rwx for user. This can't work anyway if it isn't ours.
|
|
674
798
|
try:
|
|
675
|
-
os.chmod(
|
|
799
|
+
os.chmod(
|
|
800
|
+
os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
|
|
801
|
+
)
|
|
676
802
|
except PermissionError as e:
|
|
677
|
-
logger.error(
|
|
803
|
+
logger.error(
|
|
804
|
+
"Could not set permissions on %s to allow cleanup of %s: %s",
|
|
805
|
+
os.path.dirname(path),
|
|
806
|
+
path,
|
|
807
|
+
e,
|
|
808
|
+
)
|
|
809
|
+
|
|
678
810
|
shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
|
|
679
811
|
|
|
680
812
|
# This must happen after the log file is done with, else there is no place to put the log
|
|
@@ -683,13 +815,13 @@ def workerScript(
|
|
|
683
815
|
# We can now safely get rid of the JobDescription, and all jobs it chained up
|
|
684
816
|
job_store.delete_job(merged_in.job_store_id)
|
|
685
817
|
|
|
686
|
-
|
|
687
818
|
if jobAttemptFailed:
|
|
688
819
|
return failure_exit_code
|
|
689
820
|
else:
|
|
690
821
|
return 0
|
|
691
822
|
|
|
692
|
-
|
|
823
|
+
|
|
824
|
+
def parse_args(args: list[str]) -> Any:
|
|
693
825
|
"""
|
|
694
826
|
Parse command-line arguments to the worker.
|
|
695
827
|
"""
|
|
@@ -703,26 +835,33 @@ def parse_args(args: List[str]) -> Any:
|
|
|
703
835
|
# Now add all the options to it
|
|
704
836
|
|
|
705
837
|
# Base required job information
|
|
706
|
-
parser.add_argument("jobName", type=str,
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
838
|
+
parser.add_argument("jobName", type=str, help="Text name of the job being run")
|
|
839
|
+
parser.add_argument(
|
|
840
|
+
"jobStoreLocator",
|
|
841
|
+
type=str,
|
|
842
|
+
help="Information required to connect to the job store",
|
|
843
|
+
)
|
|
844
|
+
parser.add_argument(
|
|
845
|
+
"jobStoreID", type=str, help="ID of the job within the job store"
|
|
846
|
+
)
|
|
712
847
|
|
|
713
848
|
# Additional worker abilities
|
|
714
|
-
parser.add_argument(
|
|
849
|
+
parser.add_argument(
|
|
850
|
+
"--context",
|
|
851
|
+
default=[],
|
|
852
|
+
action="append",
|
|
715
853
|
help="""Pickled, base64-encoded context manager(s) to run job inside of.
|
|
716
854
|
Allows the Toil leader to pass setup and cleanup work provided by the
|
|
717
855
|
batch system, in the form of pickled Python context manager objects,
|
|
718
856
|
that the worker can then run before/after the job on the batch
|
|
719
|
-
system's behalf."""
|
|
857
|
+
system's behalf.""",
|
|
858
|
+
)
|
|
720
859
|
|
|
721
860
|
return parser.parse_args(args)
|
|
722
861
|
|
|
723
862
|
|
|
724
863
|
@contextmanager
|
|
725
|
-
def in_contexts(contexts:
|
|
864
|
+
def in_contexts(contexts: list[str]) -> Iterator[None]:
|
|
726
865
|
"""
|
|
727
866
|
Unpickle and enter all the pickled, base64-encoded context managers in the
|
|
728
867
|
given list. Then do the body, then leave them all.
|
|
@@ -736,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
|
|
|
736
875
|
rest = contexts[1:]
|
|
737
876
|
|
|
738
877
|
try:
|
|
739
|
-
manager = pickle.loads(base64.b64decode(first.encode(
|
|
878
|
+
manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
|
|
740
879
|
except:
|
|
741
880
|
exc_info = sys.exc_info()
|
|
742
|
-
logger.error(
|
|
881
|
+
logger.error(
|
|
882
|
+
"Exception while unpickling context manager: ", exc_info=exc_info
|
|
883
|
+
)
|
|
743
884
|
raise
|
|
744
885
|
|
|
745
886
|
with manager:
|
|
@@ -749,14 +890,14 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
|
|
|
749
890
|
yield
|
|
750
891
|
|
|
751
892
|
|
|
752
|
-
def main(argv: Optional[
|
|
893
|
+
def main(argv: Optional[list[str]] = None) -> None:
|
|
753
894
|
if argv is None:
|
|
754
895
|
argv = sys.argv
|
|
755
896
|
# Parse our command line
|
|
756
897
|
options = parse_args(argv)
|
|
757
898
|
|
|
758
899
|
##########################################
|
|
759
|
-
#Load the jobStore/config file
|
|
900
|
+
# Load the jobStore/config file
|
|
760
901
|
##########################################
|
|
761
902
|
|
|
762
903
|
job_store = Toil.resumeJobStore(options.jobStoreLocator)
|