toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
|
@@ -26,7 +26,7 @@ import sys
|
|
|
26
26
|
import time
|
|
27
27
|
import traceback
|
|
28
28
|
from contextlib import contextmanager
|
|
29
|
-
from typing import Any, Callable, Iterator, List, Optional
|
|
29
|
+
from typing import Any, Callable, Iterator, List, Set, Optional
|
|
30
30
|
|
|
31
31
|
from configargparse import ArgParser
|
|
32
32
|
|
|
@@ -36,13 +36,12 @@ from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
|
|
|
36
36
|
CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
|
|
37
37
|
from toil.deferred import DeferredFunctionManager
|
|
38
38
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
39
|
-
from toil.job import CheckpointJobDescription, Job, JobDescription
|
|
39
|
+
from toil.job import CheckpointJobDescription, Job, JobDescription, DebugStoppingPointReached
|
|
40
40
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
41
41
|
from toil.lib.expando import MagicExpando
|
|
42
42
|
from toil.lib.io import make_public_dir
|
|
43
|
-
from toil.lib.resources import
|
|
44
|
-
|
|
45
|
-
from toil.statsAndLogging import configure_root_logger, set_log_level
|
|
43
|
+
from toil.lib.resources import ResourceMonitor
|
|
44
|
+
from toil.statsAndLogging import configure_root_logger, set_log_level, install_log_color
|
|
46
45
|
|
|
47
46
|
logger = logging.getLogger(__name__)
|
|
48
47
|
|
|
@@ -50,27 +49,27 @@ logger = logging.getLogger(__name__)
|
|
|
50
49
|
class StatsDict(MagicExpando):
|
|
51
50
|
"""Subclass of MagicExpando for type-checking purposes."""
|
|
52
51
|
|
|
53
|
-
jobs: List[
|
|
52
|
+
jobs: List[MagicExpando]
|
|
54
53
|
|
|
55
54
|
|
|
56
|
-
def nextChainable(predecessor: JobDescription,
|
|
55
|
+
def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, config: Config) -> Optional[JobDescription]:
|
|
57
56
|
"""
|
|
58
57
|
Returns the next chainable job's JobDescription after the given predecessor
|
|
59
58
|
JobDescription, if one exists, or None if the chain must terminate.
|
|
60
59
|
|
|
61
60
|
:param predecessor: The job to chain from
|
|
62
|
-
:param
|
|
61
|
+
:param job_store: The JobStore to fetch JobDescriptions from.
|
|
63
62
|
:param config: The configuration for the current run.
|
|
64
63
|
"""
|
|
65
64
|
#If no more jobs to run or services not finished, quit
|
|
66
|
-
if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint
|
|
65
|
+
if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None):
|
|
67
66
|
logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
|
|
68
|
-
predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint
|
|
67
|
+
predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None))
|
|
69
68
|
return None
|
|
70
69
|
|
|
71
70
|
|
|
72
71
|
#Get the next set of jobs to run
|
|
73
|
-
jobs = list(predecessor.nextSuccessors())
|
|
72
|
+
jobs = list(predecessor.nextSuccessors() or set())
|
|
74
73
|
if len(jobs) == 0:
|
|
75
74
|
# If there are no jobs, we might just not have any children.
|
|
76
75
|
logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
|
|
@@ -89,7 +88,7 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
|
|
|
89
88
|
logger.debug("%s would chain to ID %s", predecessor, successorID)
|
|
90
89
|
|
|
91
90
|
# Load the successor JobDescription
|
|
92
|
-
successor =
|
|
91
|
+
successor = job_store.load_job(successorID)
|
|
93
92
|
|
|
94
93
|
#We check the requirements of the successor to see if we can run it
|
|
95
94
|
#within the current worker
|
|
@@ -118,17 +117,38 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
|
|
|
118
117
|
logger.debug("Next job is checkpoint, so finishing")
|
|
119
118
|
return None
|
|
120
119
|
|
|
120
|
+
if not config.run_local_jobs_on_workers and predecessor.local and not successor.local:
|
|
121
|
+
# This job might be running on the leader, but the next job may not.
|
|
122
|
+
#
|
|
123
|
+
# TODO: Optimize by detecting whether we actually are on the leader,
|
|
124
|
+
# somehow.
|
|
125
|
+
logger.debug("Next job is not allowed to run on the leader, so finishing")
|
|
126
|
+
return None
|
|
127
|
+
|
|
121
128
|
# Made it through! This job is chainable.
|
|
122
129
|
return successor
|
|
123
130
|
|
|
124
|
-
def workerScript(
|
|
131
|
+
def workerScript(
|
|
132
|
+
job_store: AbstractJobStore,
|
|
133
|
+
config: Config,
|
|
134
|
+
job_name: str,
|
|
135
|
+
job_store_id: str,
|
|
136
|
+
redirect_output_to_log_file: bool = True,
|
|
137
|
+
local_worker_temp_dir: Optional[str] = None,
|
|
138
|
+
debug_flags: Optional[Set[str]] = None
|
|
139
|
+
) -> int:
|
|
125
140
|
"""
|
|
126
141
|
Worker process script, runs a job.
|
|
127
142
|
|
|
128
|
-
:param
|
|
143
|
+
:param job_store: The JobStore to fetch JobDescriptions from.
|
|
129
144
|
:param config: The configuration for the current run.
|
|
130
|
-
:param
|
|
131
|
-
:param
|
|
145
|
+
:param job_name: The "job name" (a user friendly name) of the job to be run
|
|
146
|
+
:param job_store_id: The job store ID of the job to be run
|
|
147
|
+
:param redirect_output_to_log_file: If False, log directly to the console
|
|
148
|
+
instead of capturing job output.
|
|
149
|
+
:param local_worker_temp_dir: The directory for the worker to work in. May
|
|
150
|
+
be recursively removed after the job runs.
|
|
151
|
+
:param debug_flags: Flags to set on each job before running it.
|
|
132
152
|
|
|
133
153
|
:return int: 1 if a job failed, or 0 if all jobs succeeded
|
|
134
154
|
"""
|
|
@@ -136,6 +156,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
136
156
|
configure_root_logger()
|
|
137
157
|
set_log_level(config.logLevel)
|
|
138
158
|
|
|
159
|
+
if config.colored_logs:
|
|
160
|
+
install_log_color()
|
|
161
|
+
|
|
162
|
+
logger.debug("Worker started for job %s...", job_name)
|
|
163
|
+
|
|
139
164
|
##########################################
|
|
140
165
|
#Create the worker killer, if requested
|
|
141
166
|
##########################################
|
|
@@ -182,7 +207,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
182
207
|
##########################################
|
|
183
208
|
|
|
184
209
|
#First load the environment for the job.
|
|
185
|
-
with
|
|
210
|
+
with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
|
|
186
211
|
environment = safeUnpickleFromStream(fileHandle)
|
|
187
212
|
env_reject = {
|
|
188
213
|
"TMPDIR",
|
|
@@ -228,8 +253,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
228
253
|
toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
|
|
229
254
|
# Dir to put lock files in, ideally not on NFS.
|
|
230
255
|
toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
|
|
231
|
-
|
|
232
|
-
|
|
256
|
+
if local_worker_temp_dir is None:
|
|
257
|
+
# Invent a temp directory to work in
|
|
258
|
+
local_worker_temp_dir = make_public_dir(toilWorkflowDir)
|
|
259
|
+
os.chmod(local_worker_temp_dir, 0o755)
|
|
233
260
|
|
|
234
261
|
##########################################
|
|
235
262
|
#Setup the logging
|
|
@@ -245,12 +272,12 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
245
272
|
#file descriptor 1, and standard error is file descriptor 2.
|
|
246
273
|
|
|
247
274
|
# Do we even want to redirect output? Let the config make us not do it.
|
|
248
|
-
|
|
275
|
+
redirect_output_to_log_file = redirect_output_to_log_file and not config.disableWorkerOutputCapture
|
|
249
276
|
|
|
250
277
|
#What file do we want to point FDs 1 and 2 to?
|
|
251
|
-
tempWorkerLogPath = os.path.join(
|
|
278
|
+
tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
|
|
252
279
|
|
|
253
|
-
if
|
|
280
|
+
if redirect_output_to_log_file:
|
|
254
281
|
# Announce that we are redirecting logging, and where it will now go.
|
|
255
282
|
# This is only important if we are trying to manually trace a faulty worker invocation.
|
|
256
283
|
logger.debug("Redirecting logging to %s", tempWorkerLogPath)
|
|
@@ -287,13 +314,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
287
314
|
|
|
288
315
|
jobAttemptFailed = False
|
|
289
316
|
failure_exit_code = 1
|
|
317
|
+
first_job_cores = None
|
|
290
318
|
statsDict = StatsDict() # type: ignore[no-untyped-call]
|
|
291
319
|
statsDict.jobs = []
|
|
292
|
-
statsDict.workers.
|
|
320
|
+
statsDict.workers.logs_to_leader = []
|
|
321
|
+
statsDict.workers.logging_user_streams = []
|
|
293
322
|
|
|
294
323
|
def blockFn() -> bool:
|
|
295
324
|
return True
|
|
296
|
-
listOfJobs = [jobName]
|
|
297
325
|
job = None
|
|
298
326
|
try:
|
|
299
327
|
|
|
@@ -312,18 +340,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
312
340
|
# Load the JobDescription
|
|
313
341
|
##########################################
|
|
314
342
|
|
|
315
|
-
jobDesc =
|
|
316
|
-
listOfJobs[0] = str(jobDesc)
|
|
343
|
+
jobDesc = job_store.load_job(job_store_id)
|
|
317
344
|
logger.debug("Parsed job description")
|
|
318
345
|
|
|
319
346
|
##########################################
|
|
320
347
|
# Cleanup from any earlier invocation of the job
|
|
321
348
|
##########################################
|
|
322
349
|
|
|
323
|
-
if jobDesc.
|
|
350
|
+
if not jobDesc.has_body():
|
|
324
351
|
logger.debug("Job description has no body to run.")
|
|
325
352
|
# Cleanup jobs already finished
|
|
326
|
-
jobDesc.clear_nonexistent_dependents(
|
|
353
|
+
jobDesc.clear_nonexistent_dependents(job_store)
|
|
327
354
|
logger.debug("Cleaned up any references to completed successor jobs")
|
|
328
355
|
|
|
329
356
|
# This cleans the old log file which may
|
|
@@ -331,8 +358,8 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
331
358
|
oldLogFile = jobDesc.logJobStoreFileID
|
|
332
359
|
if oldLogFile is not None:
|
|
333
360
|
jobDesc.logJobStoreFileID = None
|
|
334
|
-
|
|
335
|
-
|
|
361
|
+
job_store.update_job(jobDesc) # Update first, before deleting any files
|
|
362
|
+
job_store.delete_file(oldLogFile)
|
|
336
363
|
|
|
337
364
|
##########################################
|
|
338
365
|
# If a checkpoint exists, restart from the checkpoint
|
|
@@ -350,20 +377,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
350
377
|
if jobDesc.remainingTryCount < 0:
|
|
351
378
|
raise RuntimeError("The try count of the job cannot be negative.")
|
|
352
379
|
jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
|
|
353
|
-
jobDesc.restartCheckpoint(
|
|
380
|
+
jobDesc.restartCheckpoint(job_store)
|
|
354
381
|
# Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
|
|
355
382
|
# because of the job being a checkpoint
|
|
356
383
|
else:
|
|
357
384
|
logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
|
|
358
385
|
#Delete any remnant files
|
|
359
|
-
list(map(
|
|
386
|
+
list(map(job_store.delete_file, list(filter(job_store.file_exists, jobDesc.checkpointFilesToDelete))))
|
|
360
387
|
|
|
361
388
|
##########################################
|
|
362
389
|
#Setup the stats, if requested
|
|
363
390
|
##########################################
|
|
364
391
|
|
|
365
392
|
if config.stats:
|
|
366
|
-
|
|
393
|
+
# Remember the cores from the first job, which is how many we have reserved for us.
|
|
394
|
+
statsDict.workers.requested_cores = jobDesc.cores
|
|
395
|
+
startClock = ResourceMonitor.get_total_cpu_time()
|
|
367
396
|
|
|
368
397
|
startTime = time.time()
|
|
369
398
|
while True:
|
|
@@ -373,20 +402,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
373
402
|
|
|
374
403
|
logger.info("Working on job %s", jobDesc)
|
|
375
404
|
|
|
376
|
-
if jobDesc.
|
|
377
|
-
if not jobDesc.command.startswith("_toil "):
|
|
378
|
-
raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
|
|
379
|
-
logger.debug("Got a command to run: %s" % jobDesc.command)
|
|
405
|
+
if jobDesc.has_body():
|
|
380
406
|
# Load the job. It will use the same JobDescription we have been using.
|
|
381
|
-
job = Job.loadJob(
|
|
407
|
+
job = Job.loadJob(job_store, jobDesc)
|
|
382
408
|
if isinstance(jobDesc, CheckpointJobDescription):
|
|
383
|
-
# If it is a checkpoint job,
|
|
384
|
-
jobDesc.
|
|
409
|
+
# If it is a checkpoint job, set the checkpoint
|
|
410
|
+
jobDesc.set_checkpoint()
|
|
385
411
|
|
|
386
412
|
logger.info("Loaded body %s from description %s", job, jobDesc)
|
|
387
413
|
|
|
414
|
+
if debug_flags:
|
|
415
|
+
for flag in debug_flags:
|
|
416
|
+
logger.debug("Turning on debug flag %s on job", flag)
|
|
417
|
+
job.set_debug_flag(flag)
|
|
418
|
+
|
|
388
419
|
# Create a fileStore object for the job
|
|
389
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
420
|
+
fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
|
|
390
421
|
caching=config.caching)
|
|
391
422
|
with job._executor(stats=statsDict if config.stats else None,
|
|
392
423
|
fileStore=fileStore):
|
|
@@ -404,21 +435,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
404
435
|
# wants across multiple Toil versions. We also
|
|
405
436
|
# still pass a jobGraph argument to placate old
|
|
406
437
|
# versions of Cactus.
|
|
407
|
-
job._runner(jobGraph=None, jobStore=
|
|
438
|
+
job._runner(jobGraph=None, jobStore=job_store, fileStore=fileStore, defer=defer)
|
|
408
439
|
|
|
409
440
|
# When the executor for the job finishes it will
|
|
410
|
-
# kick off a commit with the
|
|
411
|
-
#
|
|
441
|
+
# kick off a commit with the link to the job body
|
|
442
|
+
# cut.
|
|
412
443
|
|
|
413
444
|
# Accumulate messages from this job & any subsequent chained jobs
|
|
414
|
-
statsDict.workers.
|
|
445
|
+
statsDict.workers.logs_to_leader += fileStore.logging_messages
|
|
446
|
+
statsDict.workers.logging_user_streams += fileStore.logging_user_streams
|
|
415
447
|
|
|
416
448
|
logger.info("Completed body for %s", jobDesc)
|
|
417
449
|
|
|
418
450
|
else:
|
|
419
|
-
#The
|
|
420
|
-
#
|
|
421
|
-
#been scheduled after a failure to cleanup
|
|
451
|
+
# The body may not be attached, in which case the
|
|
452
|
+
# JobDescription is either a shell ready to be deleted or has
|
|
453
|
+
# been scheduled after a failure to cleanup
|
|
422
454
|
logger.debug("No user job to run, so finishing")
|
|
423
455
|
break
|
|
424
456
|
|
|
@@ -428,7 +460,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
428
460
|
##########################################
|
|
429
461
|
#Establish if we can run another job within the worker
|
|
430
462
|
##########################################
|
|
431
|
-
successor = nextChainable(jobDesc,
|
|
463
|
+
successor = nextChainable(jobDesc, job_store, config)
|
|
432
464
|
if successor is None or config.disableChaining:
|
|
433
465
|
# Can't chain any more jobs. We are going to stop.
|
|
434
466
|
|
|
@@ -457,9 +489,6 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
457
489
|
# body) up after we finish executing it.
|
|
458
490
|
successorID = successor.jobStoreID
|
|
459
491
|
|
|
460
|
-
# add the successor to the list of jobs run
|
|
461
|
-
listOfJobs.append(str(successor))
|
|
462
|
-
|
|
463
492
|
# Now we need to become that successor, under the original ID.
|
|
464
493
|
successor.replace(jobDesc)
|
|
465
494
|
jobDesc = successor
|
|
@@ -470,7 +499,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
470
499
|
|
|
471
500
|
# Build a fileStore to update the job and commit the replacement.
|
|
472
501
|
# TODO: can we have a commit operation without an entire FileStore???
|
|
473
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
502
|
+
fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
|
|
474
503
|
caching=config.caching)
|
|
475
504
|
|
|
476
505
|
# Update blockFn to wait for that commit operation.
|
|
@@ -485,27 +514,44 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
485
514
|
#Finish up the stats
|
|
486
515
|
##########################################
|
|
487
516
|
if config.stats:
|
|
488
|
-
totalCPUTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
|
|
517
|
+
totalCPUTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
|
|
489
518
|
statsDict.workers.time = str(time.time() - startTime)
|
|
490
519
|
statsDict.workers.clock = str(totalCPUTime - startClock)
|
|
491
520
|
statsDict.workers.memory = str(totalMemoryUsage)
|
|
521
|
+
# Say the worker used the max disk we saw from any job
|
|
522
|
+
max_bytes = 0
|
|
523
|
+
for job_stats in statsDict.jobs:
|
|
524
|
+
if "disk" in job_stats:
|
|
525
|
+
max_bytes = max(max_bytes, int(job_stats.disk))
|
|
526
|
+
statsDict.workers.disk = str(max_bytes)
|
|
527
|
+
# Count the jobs executed.
|
|
528
|
+
# TODO: toil stats could compute this but its parser is too general to hook into simply.
|
|
529
|
+
statsDict.workers.jobs_run = len(statsDict.jobs)
|
|
530
|
+
|
|
492
531
|
|
|
493
532
|
# log the worker log path here so that if the file is truncated the path can still be found
|
|
494
|
-
if
|
|
495
|
-
logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log",
|
|
533
|
+
if redirect_output_to_log_file:
|
|
534
|
+
logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", local_worker_temp_dir)
|
|
496
535
|
|
|
497
536
|
logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
|
|
498
537
|
|
|
499
538
|
##########################################
|
|
500
539
|
#Trapping where worker goes wrong
|
|
501
540
|
##########################################
|
|
502
|
-
except
|
|
503
|
-
|
|
541
|
+
except DebugStoppingPointReached:
|
|
542
|
+
# Job wants the worker to stop for debugging
|
|
543
|
+
raise
|
|
544
|
+
except BaseException as e: #Case that something goes wrong in worker, or we are asked to stop
|
|
545
|
+
if not isinstance(e, SystemExit):
|
|
546
|
+
logger.critical("Worker crashed with traceback:\n%s", traceback.format_exc())
|
|
504
547
|
logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
|
|
505
548
|
if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
|
|
506
549
|
# We need to inform the leader that this is a CWL workflow problem
|
|
507
550
|
# and it needs to inform its caller.
|
|
508
551
|
failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
552
|
+
elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
|
|
553
|
+
# We're meant to be exiting with a particular code.
|
|
554
|
+
failure_exit_code = e.code
|
|
509
555
|
AbstractFileStore._terminateEvent.set()
|
|
510
556
|
finally:
|
|
511
557
|
# Get rid of our deferred function manager now so we can't mistake it
|
|
@@ -538,7 +584,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
538
584
|
|
|
539
585
|
# Clobber any garbage state we have for this job from failing with
|
|
540
586
|
# whatever good state is still stored in the JobStore
|
|
541
|
-
jobDesc =
|
|
587
|
+
jobDesc = job_store.load_job(job_store_id)
|
|
542
588
|
# Remember that we failed
|
|
543
589
|
jobAttemptFailed = True
|
|
544
590
|
|
|
@@ -550,7 +596,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
550
596
|
# Flush at the Python level
|
|
551
597
|
sys.stdout.flush()
|
|
552
598
|
sys.stderr.flush()
|
|
553
|
-
if
|
|
599
|
+
if redirect_output_to_log_file:
|
|
554
600
|
# Flush at the OS level
|
|
555
601
|
os.fsync(1)
|
|
556
602
|
os.fsync(2)
|
|
@@ -577,12 +623,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
577
623
|
# relative to the end (since Python won't decode Unicode backward, or even
|
|
578
624
|
# interpret seek offsets in characters for us). TODO: We may get invalid or
|
|
579
625
|
# just different Unicode by breaking up a character at the boundary!
|
|
580
|
-
if jobAttemptFailed and
|
|
581
|
-
jobDesc.logJobStoreFileID = logJobStoreFileID =
|
|
626
|
+
if jobAttemptFailed and redirect_output_to_log_file:
|
|
627
|
+
jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
|
|
582
628
|
jobDesc.jobStoreID, cleanup=True
|
|
583
629
|
)
|
|
584
|
-
|
|
585
|
-
with jobStore.update_file_stream(logJobStoreFileID) as w:
|
|
630
|
+
with job_store.update_file_stream(logJobStoreFileID) as w:
|
|
586
631
|
with open(tempWorkerLogPath, 'rb') as f:
|
|
587
632
|
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
|
|
588
633
|
if logFileByteReportLimit > 0:
|
|
@@ -592,10 +637,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
592
637
|
# Dump the possibly-invalid-Unicode bytes into the log file
|
|
593
638
|
w.write(f.read()) # TODO load file using a buffer
|
|
594
639
|
# Commit log file reference back to JobStore
|
|
595
|
-
|
|
640
|
+
job_store.update_job(jobDesc)
|
|
596
641
|
|
|
597
642
|
elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
|
|
598
|
-
and
|
|
643
|
+
and redirect_output_to_log_file): # write log messages
|
|
599
644
|
with open(tempWorkerLogPath, 'rb') as logFile:
|
|
600
645
|
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
|
|
601
646
|
if logFileByteReportLimit > 0:
|
|
@@ -605,11 +650,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
605
650
|
# Make sure lines are Unicode so they can be JSON serialized as part of the dict.
|
|
606
651
|
# We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
|
|
607
652
|
logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()]
|
|
608
|
-
statsDict.logs.names =
|
|
653
|
+
statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
|
|
609
654
|
statsDict.logs.messages = logMessages
|
|
610
655
|
|
|
611
|
-
if
|
|
612
|
-
|
|
656
|
+
if debugging or config.stats or statsDict.workers.logs_to_leader or statsDict.workers.logging_user_streams:
|
|
657
|
+
# We have stats/logging to report back.
|
|
658
|
+
# We report even if the job attempt failed.
|
|
659
|
+
# TODO: Will that upset analysis of the stats?
|
|
660
|
+
job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
|
|
613
661
|
|
|
614
662
|
# Remove the temp dir
|
|
615
663
|
cleanUp = config.cleanWorkDir
|
|
@@ -627,14 +675,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
627
675
|
os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
|
|
628
676
|
except PermissionError as e:
|
|
629
677
|
logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
|
|
630
|
-
shutil.rmtree(
|
|
678
|
+
shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
|
|
631
679
|
|
|
632
680
|
# This must happen after the log file is done with, else there is no place to put the log
|
|
633
681
|
if (not jobAttemptFailed) and jobDesc.is_subtree_done():
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
682
|
+
for merged_in in jobDesc.get_chain():
|
|
683
|
+
# We can now safely get rid of the JobDescription, and all jobs it chained up
|
|
684
|
+
job_store.delete_job(merged_in.job_store_id)
|
|
685
|
+
|
|
638
686
|
|
|
639
687
|
if jobAttemptFailed:
|
|
640
688
|
return failure_exit_code
|
|
@@ -704,25 +752,19 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
|
|
|
704
752
|
def main(argv: Optional[List[str]] = None) -> None:
|
|
705
753
|
if argv is None:
|
|
706
754
|
argv = sys.argv
|
|
707
|
-
|
|
708
755
|
# Parse our command line
|
|
709
756
|
options = parse_args(argv)
|
|
710
757
|
|
|
711
|
-
# Parse input args
|
|
712
|
-
jobName = argv[1]
|
|
713
|
-
jobStoreLocator = argv[2]
|
|
714
|
-
jobStoreID = argv[3]
|
|
715
|
-
|
|
716
758
|
##########################################
|
|
717
759
|
#Load the jobStore/config file
|
|
718
760
|
##########################################
|
|
719
761
|
|
|
720
|
-
|
|
721
|
-
config =
|
|
762
|
+
job_store = Toil.resumeJobStore(options.jobStoreLocator)
|
|
763
|
+
config = job_store.config
|
|
722
764
|
|
|
723
765
|
with in_contexts(options.context):
|
|
724
766
|
# Call the worker
|
|
725
|
-
exit_code = workerScript(
|
|
767
|
+
exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)
|
|
726
768
|
|
|
727
769
|
# Exit with its return value
|
|
728
770
|
sys.exit(exit_code)
|
|
@@ -202,3 +202,28 @@
|
|
|
202
202
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
203
203
|
See the License for the specific language governing permissions and
|
|
204
204
|
limitations under the License.
|
|
205
|
+
|
|
206
|
+
All code in this repository excluding src/toil/statsAndLogging.py::install_log_color is under the Apache License as outlined directly above.
|
|
207
|
+
Some code in src/toil/statsAndLogging.py::install_log_color is under the MiniWDL MIT License as outlined directly below.
|
|
208
|
+
|
|
209
|
+
MIT License
|
|
210
|
+
|
|
211
|
+
Copyright (c) 2018 Chan Zuckerberg Initiative
|
|
212
|
+
|
|
213
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
214
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
215
|
+
in the Software without restriction, including without limitation the rights
|
|
216
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
217
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
218
|
+
furnished to do so, subject to the following conditions:
|
|
219
|
+
|
|
220
|
+
The above copyright notice and this permission notice shall be included in all
|
|
221
|
+
copies or substantial portions of the Software.
|
|
222
|
+
|
|
223
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
224
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
225
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
226
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
227
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
228
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
229
|
+
SOFTWARE.
|