toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import atexit
 import base64
 import copy
 import json
@@ -22,26 +23,36 @@ import shutil
 import signal
 import socket
 import stat
+import subprocess
 import sys
+import threading
 import time
 import traceback
+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import Any, Callable,
+from typing import Any, Callable, Optional

 from configargparse import ArgParser

 from toil import logProcessContext
 from toil.common import Config, Toil, safeUnpickleFromStream
-from toil.cwl.utils import (
-
+from toil.cwl.utils import (
+    CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+)
 from toil.deferred import DeferredFunctionManager
 from toil.fileStores.abstractFileStore import AbstractFileStore
-from toil.job import
+from toil.job import (
+    CheckpointJobDescription,
+    DebugStoppingPointReached,
+    Job,
+    JobDescription,
+)
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.expando import MagicExpando
 from toil.lib.io import make_public_dir
 from toil.lib.resources import ResourceMonitor
-from toil.statsAndLogging import configure_root_logger,
+from toil.statsAndLogging import configure_root_logger, install_log_color, set_log_level

 logger = logging.getLogger(__name__)

@@ -49,10 +60,11 @@ logger = logging.getLogger(__name__)
 class StatsDict(MagicExpando):
     """Subclass of MagicExpando for type-checking purposes."""

-    jobs:
+    jobs: list[MagicExpando]

-
-
+def nextChainable(
+    predecessor: JobDescription, job_store: AbstractJobStore, config: Config
+) -> Optional[JobDescription]:
     """
     Returns the next chainable job's JobDescription after the given predecessor
     JobDescription, if one exists, or None if the chain must terminate.
@@ -61,24 +73,41 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
     :param job_store: The JobStore to fetch JobDescriptions from.
     :param config: The configuration for the current run.
     """
-    #If no more jobs to run or services not finished, quit
-    if
-
-
+    # If no more jobs to run or services not finished, quit
+    if (
+        predecessor.nextSuccessors() is None
+        or len(predecessor.services) > 0
+        or (
+            isinstance(predecessor, CheckpointJobDescription)
+            and predecessor.checkpoint is not None
+        )
+    ):
+        logger.debug(
+            "Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
+            predecessor.nextSuccessors() is None,
+            len(predecessor.services),
+            (
+                isinstance(predecessor, CheckpointJobDescription)
+                and predecessor.checkpoint is not None
+            ),
+        )
         return None

-
-    #Get the next set of jobs to run
+    # Get the next set of jobs to run
     jobs = list(predecessor.nextSuccessors() or set())
     if len(jobs) == 0:
         # If there are no jobs, we might just not have any children.
-        logger.debug(
+        logger.debug(
+            "Stopping running chain of jobs because job has no ready children or follow-ons"
+        )
         return None

-    #If there are 2 or more jobs to run in parallel we quit
+    # If there are 2 or more jobs to run in parallel we quit
     if len(jobs) >= 2:
-        logger.debug(
-
+        logger.debug(
+            "No more jobs can run in series by this worker," " it's got %i successors",
+            len(jobs),
+        )
         logger.debug("Two distinct successors are %s and %s", jobs[0], jobs[1])
         return None

@@ -90,8 +119,8 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
     # Load the successor JobDescription
     successor = job_store.load_job(successorID)

-    #We check the requirements of the successor to see if we can run it
-    #within the current worker
+    # We check the requirements of the successor to see if we can run it
+    # within the current worker
     if successor.memory > predecessor.memory:
         logger.debug("We need more memory for the next job, so finishing")
         return None
@@ -102,14 +131,20 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
         logger.debug("We need more disk for the next job, so finishing")
         return None
     if successor.preemptible != predecessor.preemptible:
-        logger.debug(
+        logger.debug(
+            "Preemptibility is different for the next job, returning to the leader"
+        )
         return None
     if successor.predecessorNumber > 1:
-        logger.debug(
+        logger.debug(
+            "The next job has multiple predecessors; we must return to the leader."
+        )
         return None

     if len(successor.services) > 0:
-        logger.debug(
+        logger.debug(
+            "The next job requires services that will not yet be started; we must return to the leader."
+        )
         return None

     if isinstance(successor, CheckpointJobDescription):
@@ -117,7 +152,11 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
         logger.debug("Next job is checkpoint, so finishing")
         return None

-    if
+    if (
+        not config.run_local_jobs_on_workers
+        and predecessor.local
+        and not successor.local
+    ):
         # This job might be running on the leader, but the next job may not.
         #
         # TODO: Optimize by detecting whether we actually are on the leader,
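
Reviewer note: every check in the hunks above enforces the same invariant — a successor is chained into the current worker only if it demands nothing beyond what the predecessor's slot already provides. Condensed into a single predicate (an illustrative sketch, not a Toil API; the attribute names come from the diff, and the real function performs additional checks not repeated here):

# Sketch of the chaining gate in nextChainable(); can_chain is a
# hypothetical helper name used only for this illustration.
def can_chain(predecessor, successor) -> bool:
    if successor.memory > predecessor.memory:
        return False  # would need a bigger memory reservation
    if successor.disk > predecessor.disk:
        return False  # would need more scratch disk
    if successor.preemptible != predecessor.preemptible:
        return False  # must run on a different class of node
    if successor.predecessorNumber > 1:
        return False  # leader must collect the other predecessors first
    if len(successor.services) > 0:
        return False  # services are only started by the leader
    return True
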
@@ -128,6 +167,86 @@ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, conf
     # Made it through! This job is chainable.
     return successor

+def unstick_worker(interval: float = 120, timeout: float = 120) -> None:
+    """
+    Thread function that tries to prevent the process from getting stuck.
+
+    Meant to be used as a daemon thread: does not have a shutdown signal but
+    cleans up on exit.
+
+    :param interval: Try to unstick the process at intervals of this many
+        seconds.
+    :param timeout: Stop child processes that take longer than this many
+        seconds to finish.
+    """
+
+    # We've observed Toil getting stuck reading the job from the job store,
+    # either due to a problem with the FileJobStore or with local temp storage,
+    # but then get unstuck as soon as someone logged in and ran lsof on the
+    # Toil process. So we make sure to do that to ourselves every once in a
+    # while as long as the worker is running.
+
+    # Figure out our process ID
+    pid = os.getpid()
+
+    child: Optional[subprocess.Popen[bytes]] = None
+
+    def clean_up_child() -> None:
+        """
+        Cleanup function to run at daemon thread shutdown when the main thread
+        terminates without shutting us down.
+
+        Also used to kill the child process if it takes too long.
+        """
+        if child is not None:
+            # Kill the child immediately if it is running
+            child.kill()
+            try:
+                # Wait one last time to try and reap the child process
+                child.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+
+    atexit.register(clean_up_child)
+
+    # TODO: If we handle daemon thread shutdown just fine, why do we bother
+    # with all the event stuff? Why not cut it?
+
+    # Wait the interval before trying the first unstick
+    time.sleep(interval)
+
+    while True:
+        # Run an lsof on our PID, which has been observed to unstick reads.
+        #
+        # We rely on the thread being able to go away and atexit() hooks
+        # happening in the middle of a wait with a timeout.
+        #
+        # We also want to handle the case where the child process gets so
+        # gummed up that it can't exit when killed.
+
+        # Preserve errors form child process but not output
+        child = subprocess.Popen(
+            ["lsof", "-p", str(pid)],
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.DEVNULL,
+        )
+        try:
+            child.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            logger.warning("Running lsof took too long!")
+            clean_up_child()
+            if child.returncode is None:
+                # Kill didn't take
+                logger.warning("Could not promptly kill child process: %s", child.pid)
+
+        if child.returncode != 0:
+            # Something went wrong, which is suspicious. Either it failed or it
+            # timed out and could not be killed promptly.
+            logger.warning("Could not list open files on ourselves. Return code: %s", child.returncode)
+
+        # Wait the interval.
+        time.sleep(interval)
+
 def workerScript(
     job_store: AbstractJobStore,
     config: Config,
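
Reviewer note: unstick_worker() is a daemon-watchdog pattern — a side thread that periodically probes the process from the outside (here with lsof, which the authors observed to unwedge stuck job-store reads) and relies on daemon-thread semantics plus an atexit hook for cleanup instead of an explicit shutdown signal. A minimal standalone version of the same pattern (a sketch under those assumptions; the probe command and names are illustrative, not Toil's API):

import atexit
import os
import subprocess
import threading
import time

def watchdog(interval: float = 120.0, timeout: float = 120.0) -> None:
    """Periodically run an external probe (lsof) against our own PID."""
    pid = os.getpid()
    child = None

    def reap() -> None:
        # If the process exits mid-probe, kill and reap the probe child.
        if child is not None and child.poll() is None:
            child.kill()
            try:
                child.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass

    atexit.register(reap)
    while True:
        time.sleep(interval)
        child = subprocess.Popen(
            ["lsof", "-p", str(pid)],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )
        try:
            child.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            reap()  # the probe itself got stuck; kill it and carry on

# Started exactly as in the diff: as a daemon thread, so it can never
# keep the worker process alive on its own.
threading.Thread(target=watchdog, daemon=True).start()
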
@@ -135,7 +254,7 @@ def workerScript(
     job_store_id: str,
     redirect_output_to_log_file: bool = True,
     local_worker_temp_dir: Optional[str] = None,
-    debug_flags: Optional[
+    debug_flags: Optional[set[str]] = None,
 ) -> int:
     """
     Worker process script, runs a job.
@@ -162,7 +281,7 @@ def workerScript(
     logger.debug("Worker started for job %s...", job_name)

     ##########################################
-    #Create the worker killer, if requested
+    # Create the worker killer, if requested
     ##########################################

     logFileByteReportLimit = config.maxLogFileSize
@@ -203,10 +322,17 @@ def workerScript(
     # before it does. Either way, init will have to clean it up for us.

     ##########################################
-    #
+    # Create the worker unsticker
     ##########################################
+    unstick_thread = threading.Thread(target=unstick_worker, args=())
+    unstick_thread.daemon = True
+    unstick_thread.start()

-
+    ##########################################
+    # Load the environment for the job
+    ##########################################
+
+    # First load the environment for the job.
     with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
         environment = safeUnpickleFromStream(fileHandle)
     env_reject = {
@@ -224,15 +350,15 @@ def workerScript(
         "XDG_SESSION_ID",
         "XDG_RUNTIME_DIR",
         "XDG_DATA_DIRS",
-        "DBUS_SESSION_BUS_ADDRESS"
+        "DBUS_SESSION_BUS_ADDRESS",
     }
     for i in environment:
         if i == "PATH":
             # Handle path specially. Sometimes e.g. leader may not include
             # /bin, but the Toil appliance needs it.
-            if i in os.environ and os.environ[i] !=
+            if i in os.environ and os.environ[i] != "":
                 # Use the provided PATH and then the local system's PATH
-                os.environ[i] = environment[i] +
+                os.environ[i] = environment[i] + ":" + os.environ[i]
             else:
                 # Use the provided PATH only
                 os.environ[i] = environment[i]
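
Reviewer note: the PATH handling restored above prepends the leader-supplied PATH to the worker's own, so appliance-local directories such as /bin stay reachable while the leader's entries win on conflicts. The merge in isolation (a sketch; merge_path is an illustrative helper, not part of Toil):

import os

def merge_path(leader_path: str) -> str:
    """Combine the leader's PATH with the local one, leader entries first."""
    local_path = os.environ.get("PATH", "")
    if local_path != "":
        return leader_path + ":" + local_path
    return leader_path

# merge_path("/opt/toil/bin") on a host where PATH=/usr/bin:/bin
# yields "/opt/toil/bin:/usr/bin:/bin".
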
@@ -240,41 +366,45 @@ def workerScript(
             os.environ[i] = environment[i]
     # sys.path is used by __import__ to find modules
     if "PYTHONPATH" in environment:
-        for e in environment["PYTHONPATH"].split(
-            if e !=
+        for e in environment["PYTHONPATH"].split(":"):
+            if e != "":
                 sys.path.append(e)

     ##########################################
-    #Setup the temporary directories.
+    # Setup the temporary directories.
     ##########################################
     # Dir to put all this worker's temp files in.
     if config.workflowID is None:
         raise RuntimeError("The worker workflow ID was never set.")
     toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
     # Dir to put lock files in, ideally not on NFS.
-    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
+    toil_coordination_dir = Toil.get_local_workflow_coordination_dir(
+        config.workflowID, config.workDir, config.coordination_dir
+    )
     if local_worker_temp_dir is None:
         # Invent a temp directory to work in
         local_worker_temp_dir = make_public_dir(toilWorkflowDir)
     os.chmod(local_worker_temp_dir, 0o755)

     ##########################################
-    #Setup the logging
+    # Setup the logging
     ##########################################

-    #This is mildly tricky because we don't just want to
-    #redirect stdout and stderr for this Python process; we want to redirect it
-    #for this process and all children. Consequently, we can't just replace
-    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
-    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
+    # This is mildly tricky because we don't just want to
+    # redirect stdout and stderr for this Python process; we want to redirect it
+    # for this process and all children. Consequently, we can't just replace
+    # sys.stdout and sys.stderr; we need to mess with the underlying OS-level
+    # file descriptors. See <http://stackoverflow.com/a/11632982/402891>

-    #When we start, standard input is file descriptor 0, standard output is
-    #file descriptor 1, and standard error is file descriptor 2.
+    # When we start, standard input is file descriptor 0, standard output is
+    # file descriptor 1, and standard error is file descriptor 2.

     # Do we even want to redirect output? Let the config make us not do it.
-    redirect_output_to_log_file =
+    redirect_output_to_log_file = (
+        redirect_output_to_log_file and not config.disableWorkerOutputCapture
+    )

-    #What file do we want to point FDs 1 and 2 to?
+    # What file do we want to point FDs 1 and 2 to?
     tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")

     if redirect_output_to_log_file:
@@ -322,6 +452,7 @@ def workerScript(

     def blockFn() -> bool:
         return True
+
     job = None
     try:

@@ -365,7 +496,10 @@ def workerScript(
         # If a checkpoint exists, restart from the checkpoint
         ##########################################

-        if
+        if (
+            isinstance(jobDesc, CheckpointJobDescription)
+            and jobDesc.checkpoint is not None
+        ):
             # The job is a checkpoint, and is being restarted after previously completing
             logger.debug("Job is a checkpoint")
             # If the checkpoint still has extant successors or services, its
@@ -381,23 +515,33 @@ def workerScript(
             # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
             # because of the job being a checkpoint
             else:
-                logger.debug(
-
-
+                logger.debug(
+                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
+                )
+                # Delete any remnant files
+                list(
+                    map(
+                        job_store.delete_file,
+                        list(
+                            filter(
+                                job_store.file_exists, jobDesc.checkpointFilesToDelete
+                            )
+                        ),
+                    )
+                )

         ##########################################
-        #Setup the stats
+        # Setup the stats
         ##########################################

-
-
-
-        startClock = ResourceMonitor.get_total_cpu_time()
+        # Remember the cores from the first job, which is how many we have reserved for us.
+        statsDict.workers.requested_cores = jobDesc.cores
+        startClock = ResourceMonitor.get_total_cpu_time()

         startTime = time.time()
         while True:
             ##########################################
-            #Run the job body, if there is one
+            # Run the job body, if there is one
             ##########################################

             logger.info("Working on job %s", jobDesc)
@@ -417,33 +561,48 @@ def workerScript(
                     job.set_debug_flag(flag)

             # Create a fileStore object for the job
-            fileStore = AbstractFileStore.createFileStore(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            fileStore = AbstractFileStore.createFileStore(
+                job_store,
+                jobDesc,
+                local_worker_temp_dir,
+                blockFn,
+                caching=config.caching,
+            )
+            try:
+                with job._executor(
+                    stats=statsDict, fileStore=fileStore
+                ):
+                    with deferredFunctionManager.open() as defer:
+                        with fileStore.open(job):
+                            # Get the next block function to wait on committing this job
+                            blockFn = fileStore.waitForCommit
+
+                            # Run the job, save new successors, and set up
+                            # locally (but don't commit) successor
+                            # relationships and job completion.
+                            # Pass everything as name=value because Cactus
+                            # likes to override _runner when it shouldn't and
+                            # it needs some hope of finding the arguments it
+                            # wants across multiple Toil versions. We also
+                            # still pass a jobGraph argument to placate old
+                            # versions of Cactus.
+                            job._runner(
+                                jobGraph=None,
+                                jobStore=job_store,
+                                fileStore=fileStore,
+                                defer=defer,
+                            )
+
+                            # When the executor for the job finishes it will
+                            # kick off a commit with the link to the job body
+                            # cut.
+            finally:
+                # Accumulate messages from this job & any subsequent chained jobs.
+                # Keep the messages even if the job fails.
+                statsDict.workers.logs_to_leader += fileStore.logging_messages
+                statsDict.workers.logging_user_streams += (
+                    fileStore.logging_user_streams
+                )

             logger.info("Completed body for %s", jobDesc)

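
Reviewer note: the reformatted execution block nests three context managers — the job's executor (stats), the deferred-function manager, and the file store — so teardown runs innermost-first even when the job body raises, while the surrounding try/finally still collects the file store's logs afterwards. The control flow in skeleton form (illustrative names; only the nesting order comes from the diff):

from contextlib import contextmanager

@contextmanager
def scope(name: str):
    print(f"enter {name}")
    try:
        yield
    finally:
        # Runs even if the body raises, innermost scope first.
        print(f"exit {name}")

try:
    with scope("executor"):          # collects per-job stats
        with scope("deferred"):      # registers cleanup callbacks
            with scope("filestore"): # stages files, gathers logs
                raise RuntimeError("job body failed")
except RuntimeError:
    # Mirrors the diff's finally: block, which accumulates
    # fileStore.logging_messages whether or not the job succeeded.
    print("accumulate logs for the leader")
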
@@ -458,7 +617,7 @@ def workerScript(
                 raise RuntimeError("The termination flag is set")

             ##########################################
-            #Establish if we can run another job within the worker
+            # Establish if we can run another job within the worker
             ##########################################
             successor = nextChainable(jobDesc, job_store, config)
             if successor is None or config.disableChaining:
@@ -481,9 +640,13 @@ def workerScript(

             # Make sure nothing has gone wrong and we can really chain
             if jobDesc.memory < successor.memory:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's memory cannot be less than it's successor."
+                )
             if jobDesc.cores < successor.cores:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Cannot chain jobs. A job's cores cannot be less than it's successor."
+                )

             # Save the successor's original ID, so we can clean it (and its
             # body) up after we finish executing it.
@@ -499,8 +662,13 @@ def workerScript(

             # Build a fileStore to update the job and commit the replacement.
             # TODO: can we have a commit operation without an entire FileStore???
-            fileStore = AbstractFileStore.createFileStore(
-
+            fileStore = AbstractFileStore.createFileStore(
+                job_store,
+                jobDesc,
+                local_worker_temp_dir,
+                blockFn,
+                caching=config.caching,
+            )

             # Update blockFn to wait for that commit operation.
             blockFn = fileStore.waitForCommit
@@ -511,40 +679,53 @@ def workerScript(
         logger.debug("Starting the next job")

         ##########################################
-        #Finish up the stats
+        # Finish up the stats
         ##########################################
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        totalCPUTime, totalMemoryUsage = (
+            ResourceMonitor.get_total_cpu_time_and_memory_usage()
+        )
+        statsDict.workers.time = str(time.time() - startTime)
+        statsDict.workers.clock = str(totalCPUTime - startClock)
+        statsDict.workers.memory = str(totalMemoryUsage)
+        # Say the worker used the max disk we saw from any job
+        max_bytes = 0
+        for job_stats in statsDict.jobs:
+            if "disk" in job_stats:
+                max_bytes = max(max_bytes, int(job_stats.disk))
+        statsDict.workers.disk = str(max_bytes)
+        # Count the jobs executed.
+        # TODO: toil stats could compute this but its parser is too general to hook into simply.
+        statsDict.workers.jobs_run = len(statsDict.jobs)

         # log the worker log path here so that if the file is truncated the path can still be found
         if redirect_output_to_log_file:
-            logger.info(
-
-
+            logger.info(
+                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
+                local_worker_temp_dir,
+            )
+
+        logger.info(
+            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
+            time.time() - startTime,
+        )

     ##########################################
-    #Trapping where worker goes wrong
+    # Trapping where worker goes wrong
     ##########################################
     except DebugStoppingPointReached:
         # Job wants the worker to stop for debugging
         raise
-    except
+    except (
+        BaseException
+    ) as e:  # Case that something goes wrong in worker, or we are asked to stop
         if not isinstance(e, SystemExit):
-            logger.critical(
-
+            logger.critical(
+                "Worker crashed with traceback:\n%s", traceback.format_exc()
+            )
+        logger.error(
+            "Exiting the worker because of a failed job on host %s",
+            socket.gethostname(),
+        )
         if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
             # We need to inform the leader that this is a CWL workflow problem
             # and it needs to inform its caller.
@@ -552,6 +733,15 @@ def workerScript(
         elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
             # We're meant to be exiting with a particular code.
             failure_exit_code = e.code
+        else:
+            try:
+                from WDL.runtime.error import CommandFailed
+
+                if isinstance(e, CommandFailed):
+                    failure_exit_code = e.exit_status
+            except ImportError:
+                # WDL dependency not available
+                pass
         AbstractFileStore._terminateEvent.set()
     finally:
         # Get rid of our deferred function manager now so we can't mistake it
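
Reviewer note: the new else branch extends the exit-code plumbing — where SystemExit already propagated its code, a WDL CommandFailed now propagates the failed command's status, and the guarded import keeps workers without the optional WDL dependency working. The mapping as a standalone function (a sketch; exit_code_for and the default of 1 are illustrative assumptions, not Toil's code):

def exit_code_for(e: BaseException) -> int:
    """Map a caught exception to the process exit code to report."""
    if isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
        return e.code
    try:
        # Optional dependency: present only when the WDL extra is installed.
        from WDL.runtime.error import CommandFailed

        if isinstance(e, CommandFailed):
            return e.exit_status
    except ImportError:
        pass
    return 1  # generic failure
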
@@ -567,16 +757,15 @@ def workerScript(
             logger.debug("cwltool.main._terminate_processess exception: %s", (e))
             raise e

-
     ##########################################
-    #Wait for the asynchronous chain of writes/updates to finish
+    # Wait for the asynchronous chain of writes/updates to finish
     ##########################################

     blockFn()

     ##########################################
-    #All the asynchronous worker/update threads must be finished now,
-    #so safe to test if they completed okay
+    # All the asynchronous worker/update threads must be finished now,
+    # so safe to test if they completed okay
     ##########################################

     if AbstractFileStore._terminateEvent.is_set():
@@ -589,7 +778,7 @@ def workerScript(
         jobAttemptFailed = True

     ##########################################
-    #Cleanup
+    # Cleanup
     ##########################################

     # Close the worker logging
@@ -628,40 +817,55 @@ def workerScript(
             jobDesc.jobStoreID, cleanup=True
         )
         with job_store.update_file_stream(logJobStoreFileID) as w:
-            with open(tempWorkerLogPath,
-                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
+            with open(tempWorkerLogPath, "rb") as f:
+                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                     if logFileByteReportLimit > 0:
-                        f.seek(
+                        f.seek(
+                            -logFileByteReportLimit, 2
+                        )  # seek to last tooBig bytes of file
                     elif logFileByteReportLimit < 0:
-                        f.seek(
+                        f.seek(
+                            logFileByteReportLimit, 0
+                        )  # seek to first tooBig bytes of file
                 # Dump the possibly-invalid-Unicode bytes into the log file
-                w.write(f.read())
+                w.write(f.read())  # TODO load file using a buffer
         # Commit log file reference back to JobStore
         job_store.update_job(jobDesc)

-    elif (
-
-
+    elif (
+        debugging or (config.writeLogsFromAllJobs and not jobDesc.local)
+    ) and redirect_output_to_log_file:  # write log messages
+        with open(tempWorkerLogPath, "rb") as logFile:
             if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                 if logFileByteReportLimit > 0:
-                    logFile.seek(
+                    logFile.seek(
+                        -logFileByteReportLimit, 2
+                    )  # seek to last tooBig bytes of file
                 elif logFileByteReportLimit < 0:
-                    logFile.seek(
+                    logFile.seek(
+                        logFileByteReportLimit, 0
+                    )  # seek to first tooBig bytes of file
             # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
             # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
-            logMessages = [
+            logMessages = [
+                line.decode("utf-8", "skip") for line in logFile.read().splitlines()
+            ]
         statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
         statsDict.logs.messages = logMessages

-
-
-
-
-        job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
+    # We have stats/logging to report back.
+    # We report even if the job attempt failed.
+    # TODO: Will that upset analysis of the stats?
+    job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))

     # Remove the temp dir
     cleanUp = config.cleanWorkDir
-    if
+    if (
+        cleanUp == "always"
+        or (cleanUp == "onSuccess" and not jobAttemptFailed)
+        or (cleanUp == "onError" and jobAttemptFailed)
+    ):
+
         def make_parent_writable(func: Callable[[str], Any], path: str, _: Any) -> None:
             """
             When encountering an error removing a file or directory, make sure
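
Reviewer note: both log-shipping branches above cap the log with a single seek() before reading: a positive limit keeps the tail of the file (seek relative to the end, whence=2), a negative limit keeps the head, and the chained comparison size > limit != 0 makes 0 mean "no limit". The same policy in isolation (a sketch; read_capped is an illustrative helper, and it bounds the head case with a sized read() rather than a seek):

import os

def read_capped(path: str, limit: int) -> bytes:
    """Return the tail (limit > 0), head (limit < 0), or all (limit == 0) of a file."""
    with open(path, "rb") as f:
        size = os.path.getsize(path)
        if limit > 0 and size > limit:
            f.seek(-limit, 2)       # keep only the last `limit` bytes
            return f.read()
        if limit < 0 and size > -limit:
            return f.read(-limit)   # keep only the first |limit| bytes
        return f.read()             # no cap
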
@@ -672,9 +876,17 @@ def workerScript(
             """
             # Just chmod it for rwx for user. This can't work anyway if it isn't ours.
             try:
-                os.chmod(
+                os.chmod(
+                    os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+                )
             except PermissionError as e:
-                logger.error(
+                logger.error(
+                    "Could not set permissions on %s to allow cleanup of %s: %s",
+                    os.path.dirname(path),
+                    path,
+                    e,
+                )
+
         shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)

     # This must happen after the log file is done with, else there is no place to put the log
@@ -683,13 +895,13 @@ def workerScript(
     # We can now safely get rid of the JobDescription, and all jobs it chained up
     job_store.delete_job(merged_in.job_store_id)

-
     if jobAttemptFailed:
         return failure_exit_code
     else:
         return 0

-
+
+def parse_args(args: list[str]) -> Any:
     """
     Parse command-line arguments to the worker.
     """
@@ -703,26 +915,33 @@ def parse_args(args: List[str]) -> Any:
     # Now add all the options to it

     # Base required job information
-    parser.add_argument("jobName", type=str,
-
-
-
-
-
+    parser.add_argument("jobName", type=str, help="Text name of the job being run")
+    parser.add_argument(
+        "jobStoreLocator",
+        type=str,
+        help="Information required to connect to the job store",
+    )
+    parser.add_argument(
+        "jobStoreID", type=str, help="ID of the job within the job store"
+    )

     # Additional worker abilities
-    parser.add_argument(
+    parser.add_argument(
+        "--context",
+        default=[],
+        action="append",
         help="""Pickled, base64-encoded context manager(s) to run job inside of.
         Allows the Toil leader to pass setup and cleanup work provided by the
         batch system, in the form of pickled Python context manager objects,
         that the worker can then run before/after the job on the batch
-        system's behalf."""
+        system's behalf.""",
+    )

     return parser.parse_args(args)


 @contextmanager
-def in_contexts(contexts:
+def in_contexts(contexts: list[str]) -> Iterator[None]:
     """
     Unpickle and enter all the pickled, base64-encoded context managers in the
     given list. Then do the body, then leave them all.
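
Reviewer note: --context lets a batch system hand the worker arbitrary setup/teardown as pickled, base64-encoded context managers, and in_contexts() decodes and enters each one around the job. How such a string is produced and consumed end to end (a round-trip sketch; Timer is an illustrative context manager, not something Toil ships):

import base64
import pickle
import time

class Timer:
    """Toy context manager: times whatever runs inside it."""

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc):
        print(f"took {time.time() - self.start:.3f}s")
        return False  # never swallow exceptions

# Leader/batch-system side: pickle, then base64-encode for the command line.
encoded = base64.b64encode(pickle.dumps(Timer())).decode("utf-8")

# Worker side, exactly as in in_contexts(): decode, unpickle, enter.
manager = pickle.loads(base64.b64decode(encoded.encode("utf-8")))
with manager:
    time.sleep(0.1)  # stands in for the job body
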
@@ -736,10 +955,12 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
         rest = contexts[1:]

         try:
-            manager = pickle.loads(base64.b64decode(first.encode(
+            manager = pickle.loads(base64.b64decode(first.encode("utf-8")))
         except:
             exc_info = sys.exc_info()
-            logger.error(
+            logger.error(
+                "Exception while unpickling context manager: ", exc_info=exc_info
+            )
             raise

         with manager:
@@ -749,14 +970,14 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
         yield


-def main(argv: Optional[
+def main(argv: Optional[list[str]] = None) -> None:
     if argv is None:
         argv = sys.argv
     # Parse our command line
     options = parse_args(argv)

     ##########################################
-    #Load the jobStore/config file
+    # Load the jobStore/config file
     ##########################################

     job_store = Toil.resumeJobStore(options.jobStoreLocator)