toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
@@ -25,24 +25,31 @@ import stat
 import sys
 import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import Any, Callable,
+from typing import Any, Callable, Optional
 
 from configargparse import ArgParser
 
 from toil import logProcessContext
 from toil.common import Config, Toil, safeUnpickleFromStream
-from toil.cwl.utils import (
-
+from toil.cwl.utils import (
+    CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+)
 from toil.deferred import DeferredFunctionManager
 from toil.fileStores.abstractFileStore import AbstractFileStore
-from toil.job import
+from toil.job import (
+    CheckpointJobDescription,
+    DebugStoppingPointReached,
+    Job,
+    JobDescription,
+)
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.expando import MagicExpando
 from toil.lib.io import make_public_dir
-from toil.lib.resources import
-
-from toil.statsAndLogging import configure_root_logger, set_log_level
+from toil.lib.resources import ResourceMonitor
+from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level
 
 logger = logging.getLogger(__name__)
 
@@ -50,36 +57,55 @@ logger = logging.getLogger(__name__)
 class StatsDict(MagicExpando):
     """Subclass of MagicExpando for type-checking purposes."""
 
-    jobs:
+    jobs: list[MagicExpando]
 
 
-def nextChainable(
+def nextChainable(
+    predecessor: JobDescription, job_store: AbstractJobStore, config: Config
+) -> Optional[JobDescription]:
     """
     Returns the next chainable job's JobDescription after the given predecessor
     JobDescription, if one exists, or None if the chain must terminate.
 
     :param predecessor: The job to chain from
-    :param
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
     """
-    #If no more jobs to run or services not finished, quit
-    if
-
-
+    # If no more jobs to run or services not finished, quit
+    if (
+        predecessor.nextSuccessors() is None
+        or len(predecessor.services) > 0
+        or (
+            isinstance(predecessor, CheckpointJobDescription)
+            and predecessor.checkpoint is not None
+        )
+    ):
+        logger.debug(
+            "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
+            predecessor.nextSuccessors() is None,
+            len(predecessor.services),
+            (
+                isinstance(predecessor, CheckpointJobDescription)
+                and predecessor.checkpoint is not None
+            ),
+        )
         return None
 
-
-
-    jobs = list(predecessor.nextSuccessors())
+    # Get the next set of jobs to run
+    jobs = list(predecessor.nextSuccessors() or set())
     if len(jobs) == 0:
         # If there are no jobs, we might just not have any children.
-        logger.debug(
+        logger.debug(
+            "Stopping running chain of jobs because job has no ready children or follow-ons"
+        )
         return None
 
-    #If there are 2 or more jobs to run in parallel we quit
+    # If there are 2 or more jobs to run in parallel we quit
     if len(jobs) >= 2:
-        logger.debug(
-
+        logger.debug(
+            "No more jobs can run in series by this worker," " it's got %i successors",
+            len(jobs),
+        )
         logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
         return None
 
@@ -89,10 +115,10 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
     logger.debug("%s would chain to ID %s", predecessor, successorID)
 
     # Load the successor JobDescription
-    successor =
+    successor = job_store.load_job(successorID)
 
-    #We check the requirements of the successor to see if we can run it
-    #within the current worker
+    # We check the requirements of the successor to see if we can run it
+    # within the current worker
     if successor.memory > predecessor.memory:
         logger.debug("We need more memory for the next job, so finishing")
         return None
@@ -103,14 +129,20 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("We need more disk for the next job, so finishing")
         return None
     if successor.preemptible != predecessor.preemptible:
-        logger.debug(
+        logger.debug(
+            "Preemptibility is different for the next job, returning to the leader"
+        )
         return None
     if successor.predecessorNumber > 1:
-        logger.debug(
+        logger.debug(
+            "The next job has multiple predecessors; we must return to the leader."
+        )
         return None
 
     if len(successor.services) > 0:
-        logger.debug(
+        logger.debug(
+            "The next job requires services that will not yet be started; we must return to the leader."
+        )
         return None
 
     if isinstance(successor, CheckpointJobDescription):
@@ -118,17 +150,43 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
         logger.debug("Next job is checkpoint, so finishing")
         return None
 
+    if (
+        not config.run_local_jobs_on_workers
+        and predecessor.local
+        and not successor.local
+    ):
+        # This job might be running on the leader, but the next job may not.
+        #
+        # TODO: Optimize by detecting whether we actually are on the leader,
+        # somehow.
+        logger.debug("Next job is not allowed to run on the leader, so finishing")
+        return None
+
     # Made it through! This job is chainable.
     return successor
 
-
+
+def workerScript(
+    job_store: AbstractJobStore,
+    config: Config,
+    job_name: str,
+    job_store_id: str,
+    redirect_output_to_log_file: bool = True,
+    local_worker_temp_dir: Optional[str] = None,
+    debug_flags: Optional[set[str]] = None,
+) -> int:
     """
     Worker process script, runs a job.
 
-    :param
+    :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
-    :param
-    :param
+    :param job_name: The "job name" (a user friendly name) of the job to be run
+    :param job_store_id: The job store ID of the job to be run
+    :param redirect_output_to_log_file: If False, log directly to the console
+        instead of capturing job output.
+    :param local_worker_temp_dir: The directory for the worker to work in. May
+        be recursively removed after the job runs.
+    :param debug_flags: Flags to set on each job before running it.
 
     :return int: 1 if a job failed, or 0 if all jobs succeeded
     """
@@ -136,8 +194,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     configure_root_logger()
     set_log_level(config.logLevel)
 
+    if config.colored_logs:
+        install_log_color()
+
+    logger.debug("Worker started for job %s...", job_name)
+
     ##########################################
-    #Create the worker killer, if requested
+    # Create the worker killer, if requested
     ##########################################
 
     logFileByteReportLimit = config.maxLogFileSize
@@ -178,11 +241,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # before it does. Either way, init will have to clean it up for us.
 
     ##########################################
-    #Load the environment for the job
+    # Load the environment for the job
     ##########################################
 
-    #First load the environment for the job.
-    with
+    # First load the environment for the job.
+    with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
         environment = safeUnpickleFromStream(fileHandle)
     env_reject = {
         "TMPDIR",
@@ -199,15 +262,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         "XDG_SESSION_ID",
         "XDG_RUNTIME_DIR",
         "XDG_DATA_DIRS",
-        "DBUS_SESSION_BUS_ADDRESS"
+        "DBUS_SESSION_BUS_ADDRESS",
     }
     for i in environment:
         if i == "PATH":
             # Handle path specially. Sometimes e.g. leader may not include
             # /bin, but the Toil appliance needs it.
-            if i in os.environ and os.environ[i] !=
+            if i in os.environ and os.environ[i] != "":
                 # Use the provided PATH and then the local system's PATH
-                os.environ[i] = environment[i] +
+                os.environ[i] = environment[i] + ":" + os.environ[i]
             else:
                 # Use the provided PATH only
                 os.environ[i] = environment[i]
@@ -215,42 +278,48 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             os.environ[i] = environment[i]
     # sys.path is used by __import__ to find modules
     if "PYTHONPATH" in environment:
-        for e in environment["PYTHONPATH"].split(
-            if e !=
+        for e in environment["PYTHONPATH"].split(":"):
+            if e != "":
                 sys.path.append(e)
 
     ##########################################
-    #Setup the temporary directories.
+    # Setup the temporary directories.
     ##########################################
     # Dir to put all this worker's temp files in.
     if config.workflowID is None:
         raise RuntimeError("The worker workflow ID was never set.")
     toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
     # Dir to put lock files in, ideally not on NFS.
-    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
-
-
+    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
+        config.workflowID, config.workDir, config.coordination_dir
+    )
+    if local_worker_temp_dir is None:
+        # Invent a temp directory to work in
+        local_worker_temp_dir = make_public_dir(toilWorkflowDir)
+    os.chmod(local_worker_temp_dir, 0o755)
 
     ##########################################
-    #Setup the logging
+    # Setup the logging
     ##########################################
 
-    #This is mildly tricky because we don't just want to
-    #redirect stdout and stderr for this Python process; we want to redirect it
-    #for this process and all children. Consequently, we can't just replace
-    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
-    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
+    # This is mildly tricky because we don't just want to
+    # redirect stdout and stderr for this Python process; we want to redirect it
+    # for this process and all children. Consequently, we can't just replace
+    # sys.stdout and sys.stderr; we need to mess with the underlying OS-level
+    # file descriptors. See <http://stackoverflow.com/a/11632982/402891>
 
-    #When we start, standard input is file descriptor 0, standard output is
-    #file descriptor 1, and standard error is file descriptor 2.
+    # When we start, standard input is file descriptor 0, standard output is
+    # file descriptor 1, and standard error is file descriptor 2.
 
     # Do we even want to redirect output? Let the config make us not do it.
-
+    redirect_output_to_log_file = (
+        redirect_output_to_log_file and not config.disableWorkerOutputCapture
+    )
 
-    #What file do we want to point FDs 1 and 2 to?
-    tempWorkerLogPath = os.path.join(
+    # What file do we want to point FDs 1 and 2 to?
+    tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
 
-    if
+    if redirect_output_to_log_file:
         # Announce that we are redirecting logging, and where it will now go.
         # This is only important if we are trying to manually trace a faulty worker invocation.
         logger.debug("Redirecting logging to %s", tempWorkerLogPath)
@@ -287,13 +356,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
     jobAttemptFailed = False
     failure_exit_code = 1
+    first_job_cores = None
     statsDict = StatsDict()  # type: ignore[no-untyped-call]
     statsDict.jobs = []
-    statsDict.workers.
+    statsDict.workers.logs_to_leader = []
+    statsDict.workers.logging_user_streams = []
 
     def blockFn() -> bool:
         return True
-
+
     job = None
     try:
 
@@ -312,18 +383,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
         # Load the JobDescription
         ##########################################
 
-        jobDesc =
-        listOfJobs[0] = str(jobDesc)
+        jobDesc = job_store.load_job(job_store_id)
         logger.debug("Parsed job description")
 
         ##########################################
         # Cleanup from any earlier invocation of the job
         ##########################################
 
-        if jobDesc.
+        if not jobDesc.has_body():
             logger.debug("Job description has no body to run.")
             # Cleanup jobs already finished
-            jobDesc.clear_nonexistent_dependents(
+            jobDesc.clear_nonexistent_dependents(job_store)
             logger.debug("Cleaned up any references to completed successor jobs")
 
             # This cleans the old log file which may
@@ -331,14 +401,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             oldLogFile = jobDesc.logJobStoreFileID
             if oldLogFile is not None:
                 jobDesc.logJobStoreFileID = None
-
-
+                job_store.update_job(jobDesc)  # Update first, before deleting any files
+                job_store.delete_file(oldLogFile)
 
         ##########################################
         # If a checkpoint exists, restart from the checkpoint
         ##########################################
 
-        if
+        if (
+            isinstance(jobDesc, CheckpointJobDescription)
+            and jobDesc.checkpoint is not None
+        ):
             # The job is a checkpoint, and is being restarted after previously completing
             logger.debug("Job is a checkpoint")
             # If the checkpoint still has extant successors or services, its
@@ -350,75 +423,106 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 if jobDesc.remainingTryCount < 0:
                     raise RuntimeError("The try count of the job cannot be negative.")
                 jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
-                jobDesc.restartCheckpoint(
+                jobDesc.restartCheckpoint(job_store)
             # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
             # because of the job being a checkpoint
             else:
-                logger.debug(
-
-
+                logger.debug(
+                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
+                )
+                # Delete any remnant files
+                list(
+                    map(
+                        job_store.delete_file,
+                        list(
+                            filter(
+                                job_store.file_exists, jobDesc.checkpointFilesToDelete
+                            )
+                        ),
+                    )
+                )
 
         ##########################################
-        #Setup the stats, if requested
+        # Setup the stats, if requested
         ##########################################
 
         if config.stats:
-
+            # Remember the cores from the first job, which is how many we have reserved for us.
+            statsDict.workers.requested_cores = jobDesc.cores
+            startClock = ResourceMonitor.get_total_cpu_time()
 
         startTime = time.time()
         while True:
             ##########################################
-            #Run the job body, if there is one
+            # Run the job body, if there is one
             ##########################################
 
             logger.info("Working on job %s", jobDesc)
 
-            if jobDesc.
-                if not jobDesc.command.startswith("_toil "):
-                    raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
-                logger.debug("Got a command to run: %s" % jobDesc.command)
+            if jobDesc.has_body():
                 # Load the job. It will use the same JobDescription we have been using.
-                job = Job.loadJob(
+                job = Job.loadJob(job_store, jobDesc)
                 if isinstance(jobDesc, CheckpointJobDescription):
-                    # If it is a checkpoint job,
-                    jobDesc.
+                    # If it is a checkpoint job, set the checkpoint
+                    jobDesc.set_checkpoint()
 
                 logger.info("Loaded body %s from description %s", job, jobDesc)
 
+                if debug_flags:
+                    for flag in debug_flags:
+                        logger.debug("Turning on debug flag %s on job", flag)
+                        job.set_debug_flag(flag)
+
                 # Create a fileStore object for the job
-                fileStore = AbstractFileStore.createFileStore(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                fileStore = AbstractFileStore.createFileStore(
+                    job_store,
+                    jobDesc,
+                    local_worker_temp_dir,
+                    blockFn,
+                    caching=config.caching,
+                )
+                try:
+                    with job._executor(
+                        stats=statsDict if config.stats else None, fileStore=fileStore
+                    ):
+                        with deferredFunctionManager.open() as defer:
+                            with fileStore.open(job):
+                                # Get the next block function to wait on committing this job
+                                blockFn = fileStore.waitForCommit
+
+                                # Run the job, save new successors, and set up
+                                # locally (but don't commit) successor
+                                # relationships and job completion.
+                                # Pass everything as name=value because Cactus
+                                # likes to override _runner when it shouldn't and
+                                # it needs some hope of finding the arguments it
+                                # wants across multiple Toil versions. We also
+                                # still pass a jobGraph argument to placate old
+                                # versions of Cactus.
+                                job._runner(
+                                    jobGraph=None,
+                                    jobStore=job_store,
+                                    fileStore=fileStore,
+                                    defer=defer,
+                                )
+
+                                # When the executor for the job finishes it will
+                                # kick off a commit with the link to the job body
+                                # cut.
+                finally:
+                    # Accumulate messages from this job & any subsequent chained jobs.
+                    # Keep the messages even if the job fails.
+                    statsDict.workers.logs_to_leader += fileStore.logging_messages
+                    statsDict.workers.logging_user_streams += (
+                        fileStore.logging_user_streams
+                    )
 
                 logger.info("Completed body for %s", jobDesc)
 
             else:
-                #The
-                #
-                #been scheduled after a failure to cleanup
+                # The body may not be attached, in which case the
+                # JobDescription is either a shell ready to be deleted or has
+                # been scheduled after a failure to cleanup
                 logger.debug("No user job to run, so finishing")
                 break
 
@@ -426,9 +530,9 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 raise RuntimeError("The termination flag is set")
 
             ##########################################
-            #Establish if we can run another job within the worker
+            # Establish if we can run another job within the worker
             ##########################################
-            successor = nextChainable(jobDesc,
+            successor = nextChainable(jobDesc, job_store, config)
             if successor is None or config.disableChaining:
                 # Can't chain any more jobs. We are going to stop.
 
@@ -449,17 +553,18 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
             # Make sure nothing has gone wrong and we can really chain
             if jobDesc.memory < successor.memory:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's memory cannot be less than it's successor."
+                )
             if jobDesc.cores < successor.cores:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's cores cannot be less than it's successor."
+                )
 
             # Save the successor's original ID, so we can clean it (and its
             # body) up after we finish executing it.
             successorID = successor.jobStoreID
 
-            # add the successor to the list of jobs run
-            listOfJobs.append(str(successor))
-
             # Now we need to become that successor, under the original ID.
             successor.replace(jobDesc)
             jobDesc = successor
@@ -470,8 +575,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
             # Build a fileStore to update the job and commit the replacement.
             # TODO: can we have a commit operation without an entire FileStore???
-            fileStore = AbstractFileStore.createFileStore(
-
+            fileStore = AbstractFileStore.createFileStore(
+                job_store,
+                jobDesc,
+                local_worker_temp_dir,
+                blockFn,
+                caching=config.caching,
+            )
 
             # Update blockFn to wait for that commit operation.
             blockFn = fileStore.waitForCommit
@@ -482,30 +592,70 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
             logger.debug("Starting the next job")
 
         ##########################################
-        #Finish up the stats
+        # Finish up the stats
         ##########################################
         if config.stats:
-            totalCPUTime, totalMemoryUsage =
+            totalCPUTime, totalMemoryUsage = (
+                ResourceMonitor.get_total_cpu_time_and_memory_usage()
+            )
             statsDict.workers.time = str(time.time() - startTime)
             statsDict.workers.clock = str(totalCPUTime - startClock)
             statsDict.workers.memory = str(totalMemoryUsage)
+            # Say the worker used the max disk we saw from any job
+            max_bytes = 0
+            for job_stats in statsDict.jobs:
+                if "disk" in job_stats:
+                    max_bytes = max(max_bytes, int(job_stats.disk))
+            statsDict.workers.disk = str(max_bytes)
+            # Count the jobs executed.
+            # TODO: toil stats could compute this but its parser is too general to hook into simply.
+            statsDict.workers.jobs_run = len(statsDict.jobs)
 
         # log the worker log path here so that if the file is truncated the path can still be found
-        if
-            logger.info(
-
-
+        if redirect_output_to_log_file:
+            logger.info(
+                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
+                local_worker_temp_dir,
+            )
+
+        logger.info(
+            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
+            time.time() - startTime,
+        )
 
     ##########################################
-    #Trapping where worker goes wrong
+    # Trapping where worker goes wrong
     ##########################################
-    except
-
-
+    except DebugStoppingPointReached:
+        # Job wants the worker to stop for debugging
+        raise
+    except (
+        BaseException
+    ) as e:  # Case that something goes wrong in worker, or we are asked to stop
+        if not isinstance(e, SystemExit):
+            logger.critical(
+                "Worker crashed with traceback:\n%s", traceback.format_exc()
+            )
+        logger.error(
+            "Exiting the worker because of a failed job on host %s",
+            socket.gethostname(),
+        )
        if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
            # We need to inform the leader that this is a CWL workflow problem
            # and it needs to inform its caller.
            failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+        elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
+            # We're meant to be exiting with a particular code.
+            failure_exit_code = e.code
+        else:
+            try:
+                from WDL.runtime.error import CommandFailed
+
+                if isinstance(e, CommandFailed):
+                    failure_exit_code = e.exit_status
+            except ImportError:
+                # WDL dependency not available
+                pass
         AbstractFileStore._terminateEvent.set()
     finally:
         # Get rid of our deferred function manager now so we can't mistake it
@@ -521,16 +671,15 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
                 logger.debug("cwltool.main._terminate_processess exception: %s", (e))
                 raise e
 
-
     ##########################################
-    #Wait for the asynchronous chain of writes/updates to finish
+    # Wait for the asynchronous chain of writes/updates to finish
     ##########################################
 
     blockFn()
 
     ##########################################
-    #All the asynchronous worker/update threads must be finished now,
-    #so safe to test if they completed okay
+    # All the asynchronous worker/update threads must be finished now,
+    # so safe to test if they completed okay
     ##########################################
 
     if AbstractFileStore._terminateEvent.is_set():
@@ -538,19 +687,19 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
 
         # Clobber any garbage state we have for this job from failing with
         # whatever good state is still stored in the JobStore
-        jobDesc =
+        jobDesc = job_store.load_job(job_store_id)
         # Remember that we failed
         jobAttemptFailed = True
 
     ##########################################
-    #Cleanup
+    # Cleanup
     ##########################################
 
     # Close the worker logging
     # Flush at the Python level
     sys.stdout.flush()
     sys.stderr.flush()
-    if
+    if redirect_output_to_log_file:
         # Flush at the OS level
         os.fsync(1)
         os.fsync(2)
@@ -577,43 +726,66 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
     # relative to the end (since Python won't decode Unicode backward, or even
     # interpret seek offsets in characters for us). TODO: We may get invalid or
     # just different Unicode by breaking up a character at the boundary!
-    if jobAttemptFailed and
-        jobDesc.logJobStoreFileID = logJobStoreFileID =
+    if jobAttemptFailed and redirect_output_to_log_file:
+        jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
             jobDesc.jobStoreID, cleanup=True
         )
-
-
-
-            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
+        with job_store.update_file_stream(logJobStoreFileID) as w:
+            with open(tempWorkerLogPath, "rb") as f:
+                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                     if logFileByteReportLimit > 0:
-                        f.seek(
+                        f.seek(
+                            -logFileByteReportLimit, 2
+                        )  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
-                        f.seek(
+                        f.seek(
+                            logFileByteReportLimit, 0
+                        )  # seek to first tooBig bytes of file
                # Dump the possibly-invalid-Unicode bytes into the log file
-                w.write(f.read())
+                w.write(f.read())  # TODO load file using a buffer
         # Commit log file reference back to JobStore
-
+        job_store.update_job(jobDesc)
 
-    elif (
-
-
+    elif (
+        debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
+    ) and redirect_output_to_log_file:  # write log messages
+        with open(tempWorkerLogPath, "rb") as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
-                    logFile.seek(
+                    logFile.seek(
+                        -logFileByteReportLimit, 2
+                    )  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
-                    logFile.seek(
+                    logFile.seek(
+                        logFileByteReportLimit, 0
+                    )  # seek to first tooBig bytes of file
            # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
            # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
-            logMessages = [
-
+            logMessages = [
+                line.decode("utf-8", "skip") for line in logFile.read().splitlines()
+            ]
+        statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
        statsDict.logs.messages = logMessages
 
-    if (
-
+    if (
+        debugging
+        or config.stats
+        or statsDict.workers.logs_to_leader
+        or statsDict.workers.logging_user_streams
+    ):
+        # We have stats/logging to report back.
+        # We report even if the job attempt failed.
+        # TODO: Will that upset analysis of the stats?
+        job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
 
     # Remove the temp dir
     cleanUp = config.cleanWorkDir
-    if
+    if (
+        cleanUp == "always"
+        or (cleanUp == "onSuccess" and not jobAttemptFailed)
+        or (cleanUp == "onError" and jobAttemptFailed)
+    ):
+
        def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
            """
            When encountering an error removing a file or directory, make sure
@@ -624,24 +796,32 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
            """
            # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
            try:
-                os.chmod(
+                os.chmod(
+                    os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+                )
            except PermissionError as e:
-                logger.error(
-
+                logger.error(
+                    "Could not set permissions on %s to allow cleanup of %s: %s",
+                    os.path.dirname(path),
+                    path,
+                    e,
+                )
+
+        shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
 
     # This must happen after the log file is done with, else there is no place to put the log
     if (not jobAttemptFailed) and jobDesc.is_subtree_done():
-
-
-
-        jobStore.delete_job(str(jobDesc.jobStoreID))
+        for merged_in in jobDesc.get_chain():
+            # We can now safely get rid of the JobDescription, and all jobs it chained up
+            job_store.delete_job(merged_in.job_store_id)
 
     if jobAttemptFailed:
        return failure_exit_code
    else:
        return 0
 
-
+
+def parse_args(args: list[str]) -> Any:
     """
     Parse command-line arguments to the worker.
     """
@@ -655,26 +835,33 @@ def parse_args(args: List[str]) -> Any:
    # Now add all the options to it
 
    # Base required job information
-    parser.add_argument("jobName", type=str,
-
-
-
-
-
+    parser.add_argument("jobName", type=str, help="Text name of the job being run")
+    parser.add_argument(
+        "jobStoreLocator",
+        type=str,
+        help="Information required to connect to the job store",
+    )
+    parser.add_argument(
+        "jobStoreID", type=str, help="ID of the job within the job store"
+    )
 
    # Additional worker abilities
-    parser.add_argument(
+    parser.add_argument(
+        "--context",
+        default=[],
+        action="append",
        help="""Pickled, base64-encoded context manager(s) to run job inside of.
        Allows the Toil leader to pass setup and cleanup work provided by the
        batch system, in the form of pickled Python context manager objects,
        that the worker can then run before/after the job on the batch
-        system's behalf."""
+        system's behalf.""",
+    )
 
    return parser.parse_args(args)
 
 
 @contextmanager
-def in_contexts(contexts:
+def in_contexts(contexts: list[str]) -> Iterator[None]:
     """
     Unpickle and enter all the pickled, base64-encoded context managers in the
     given list. Then do the body, then leave them all.
@@ -688,10 +875,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
    rest = contexts[1:]
 
    try:
-        manager = pickle.loads(base64.b64decode(first.encode(
+        manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
    except:
        exc_info = sys.exc_info()
-        logger.error(
+        logger.error(
+            "Exception while unpickling context manager: ", exc_info=exc_info
+        )
        raise
 
    with manager:
@@ -701,28 +890,22 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
        yield
 
 
-def main(argv: Optional[
+def main(argv: Optional[list[str]] = None) -> None:
    if argv is None:
        argv = sys.argv
-
    # Parse our command line
    options = parse_args(argv)
 
-    # Parse input args
-    jobName = argv[1]
-    jobStoreLocator = argv[2]
-    jobStoreID = argv[3]
-
    ##########################################
-    #Load the jobStore/config file
+    # Load the jobStore/config file
    ##########################################
 
-
-    config =
+    job_store = Toil.resumeJobStore(options.jobStoreLocator)
+    config = job_store.config
 
    with in_contexts(options.context):
        # Call the worker
-        exit_code = workerScript(
+        exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)
 
    # Exit with its return value
    sys.exit(exit_code)
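
The rewritten main() above resumes the job store from the jobStoreLocator argument and hands the store object and its config straight to workerScript, instead of indexing sys.argv by hand. A minimal sketch of driving that entry point programmatically, mirroring main(); the locator, job name, and job store ID below are hypothetical placeholders, not values from this diff:

# Minimal sketch of invoking the toil 8.0.0 worker entry point by hand.
# "file:my-jobstore", "my-job", and "example-job-id" are placeholders.
from toil.common import Toil
from toil.worker import workerScript

job_store = Toil.resumeJobStore("file:my-jobstore")  # resume an existing workflow's store
config = job_store.config

exit_code = workerScript(
    job_store,
    config,
    "my-job",          # user-facing job name
    "example-job-id",  # ID of the job within the job store
    redirect_output_to_log_file=False,  # optional flag added in this diff: log to the console
)

The optional local_worker_temp_dir and debug_flags parameters added in the same change let a caller pin the worker's scratch directory and set per-job debug flags before the body runs.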