toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py
CHANGED
|
@@ -26,7 +26,7 @@ import sys
|
|
|
26
26
|
import time
|
|
27
27
|
import traceback
|
|
28
28
|
from contextlib import contextmanager
|
|
29
|
-
from typing import Any, Callable, Iterator, List, Optional
|
|
29
|
+
from typing import Any, Callable, Iterator, List, Set, Optional
|
|
30
30
|
|
|
31
31
|
from configargparse import ArgParser
|
|
32
32
|
|
|
@@ -36,13 +36,12 @@ from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
|
|
|
36
36
|
CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
|
|
37
37
|
from toil.deferred import DeferredFunctionManager
|
|
38
38
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
39
|
-
from toil.job import CheckpointJobDescription, Job, JobDescription
|
|
39
|
+
from toil.job import CheckpointJobDescription, Job, JobDescription, DebugStoppingPointReached
|
|
40
40
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
41
41
|
from toil.lib.expando import MagicExpando
|
|
42
42
|
from toil.lib.io import make_public_dir
|
|
43
|
-
from toil.lib.resources import
|
|
44
|
-
|
|
45
|
-
from toil.statsAndLogging import configure_root_logger, set_log_level
|
|
43
|
+
from toil.lib.resources import ResourceMonitor
|
|
44
|
+
from toil.statsAndLogging import configure_root_logger, set_log_level, install_log_color
|
|
46
45
|
|
|
47
46
|
logger = logging.getLogger(__name__)
|
|
48
47
|
|
|
@@ -53,24 +52,24 @@ class StatsDict(MagicExpando):
|
|
|
53
52
|
jobs: List[MagicExpando]
|
|
54
53
|
|
|
55
54
|
|
|
56
|
-
def nextChainable(predecessor: JobDescription,
|
|
55
|
+
def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, config: Config) -> Optional[JobDescription]:
|
|
57
56
|
"""
|
|
58
57
|
Returns the next chainable job's JobDescription after the given predecessor
|
|
59
58
|
JobDescription, if one exists, or None if the chain must terminate.
|
|
60
59
|
|
|
61
60
|
:param predecessor: The job to chain from
|
|
62
|
-
:param
|
|
61
|
+
:param job_store: The JobStore to fetch JobDescriptions from.
|
|
63
62
|
:param config: The configuration for the current run.
|
|
64
63
|
"""
|
|
65
64
|
#If no more jobs to run or services not finished, quit
|
|
66
|
-
if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint
|
|
65
|
+
if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None):
|
|
67
66
|
logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
|
|
68
|
-
predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint
|
|
67
|
+
predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None))
|
|
69
68
|
return None
|
|
70
69
|
|
|
71
70
|
|
|
72
71
|
#Get the next set of jobs to run
|
|
73
|
-
jobs = list(predecessor.nextSuccessors())
|
|
72
|
+
jobs = list(predecessor.nextSuccessors() or set())
|
|
74
73
|
if len(jobs) == 0:
|
|
75
74
|
# If there are no jobs, we might just not have any children.
|
|
76
75
|
logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
|
|
@@ -89,7 +88,7 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
|
|
|
89
88
|
logger.debug("%s would chain to ID %s", predecessor, successorID)
|
|
90
89
|
|
|
91
90
|
# Load the successor JobDescription
|
|
92
|
-
successor =
|
|
91
|
+
successor = job_store.load_job(successorID)
|
|
93
92
|
|
|
94
93
|
#We check the requirements of the successor to see if we can run it
|
|
95
94
|
#within the current worker
|
|
@@ -118,17 +117,38 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
|
|
|
118
117
|
logger.debug("Next job is checkpoint, so finishing")
|
|
119
118
|
return None
|
|
120
119
|
|
|
120
|
+
if not config.run_local_jobs_on_workers and predecessor.local and not successor.local:
|
|
121
|
+
# This job might be running on the leader, but the next job may not.
|
|
122
|
+
#
|
|
123
|
+
# TODO: Optimize by detecting whether we actually are on the leader,
|
|
124
|
+
# somehow.
|
|
125
|
+
logger.debug("Next job is not allowed to run on the leader, so finishing")
|
|
126
|
+
return None
|
|
127
|
+
|
|
121
128
|
# Made it through! This job is chainable.
|
|
122
129
|
return successor
|
|
123
130
|
|
|
124
|
-
def workerScript(
|
|
131
|
+
def workerScript(
|
|
132
|
+
job_store: AbstractJobStore,
|
|
133
|
+
config: Config,
|
|
134
|
+
job_name: str,
|
|
135
|
+
job_store_id: str,
|
|
136
|
+
redirect_output_to_log_file: bool = True,
|
|
137
|
+
local_worker_temp_dir: Optional[str] = None,
|
|
138
|
+
debug_flags: Optional[Set[str]] = None
|
|
139
|
+
) -> int:
|
|
125
140
|
"""
|
|
126
141
|
Worker process script, runs a job.
|
|
127
142
|
|
|
128
|
-
:param
|
|
143
|
+
:param job_store: The JobStore to fetch JobDescriptions from.
|
|
129
144
|
:param config: The configuration for the current run.
|
|
130
|
-
:param
|
|
131
|
-
:param
|
|
145
|
+
:param job_name: The "job name" (a user friendly name) of the job to be run
|
|
146
|
+
:param job_store_id: The job store ID of the job to be run
|
|
147
|
+
:param redirect_output_to_log_file: If False, log directly to the console
|
|
148
|
+
instead of capturing job output.
|
|
149
|
+
:param local_worker_temp_dir: The directory for the worker to work in. May
|
|
150
|
+
be recursively removed after the job runs.
|
|
151
|
+
:param debug_flags: Flags to set on each job before running it.
|
|
132
152
|
|
|
133
153
|
:return int: 1 if a job failed, or 0 if all jobs succeeded
|
|
134
154
|
"""
|
|
@@ -136,6 +156,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
136
156
|
configure_root_logger()
|
|
137
157
|
set_log_level(config.logLevel)
|
|
138
158
|
|
|
159
|
+
if config.colored_logs:
|
|
160
|
+
install_log_color()
|
|
161
|
+
|
|
162
|
+
logger.debug("Worker started for job %s...", job_name)
|
|
163
|
+
|
|
139
164
|
##########################################
|
|
140
165
|
#Create the worker killer, if requested
|
|
141
166
|
##########################################
|
|
@@ -182,7 +207,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
182
207
|
##########################################
|
|
183
208
|
|
|
184
209
|
#First load the environment for the job.
|
|
185
|
-
with
|
|
210
|
+
with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
|
|
186
211
|
environment = safeUnpickleFromStream(fileHandle)
|
|
187
212
|
env_reject = {
|
|
188
213
|
"TMPDIR",
|
|
@@ -228,8 +253,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
228
253
|
toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
|
|
229
254
|
# Dir to put lock files in, ideally not on NFS.
|
|
230
255
|
toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
|
|
231
|
-
|
|
232
|
-
|
|
256
|
+
if local_worker_temp_dir is None:
|
|
257
|
+
# Invent a temp directory to work in
|
|
258
|
+
local_worker_temp_dir = make_public_dir(toilWorkflowDir)
|
|
259
|
+
os.chmod(local_worker_temp_dir, 0o755)
|
|
233
260
|
|
|
234
261
|
##########################################
|
|
235
262
|
#Setup the logging
|
|
@@ -245,12 +272,12 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
245
272
|
#file descriptor 1, and standard error is file descriptor 2.
|
|
246
273
|
|
|
247
274
|
# Do we even want to redirect output? Let the config make us not do it.
|
|
248
|
-
|
|
275
|
+
redirect_output_to_log_file = redirect_output_to_log_file and not config.disableWorkerOutputCapture
|
|
249
276
|
|
|
250
277
|
#What file do we want to point FDs 1 and 2 to?
|
|
251
|
-
tempWorkerLogPath = os.path.join(
|
|
278
|
+
tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
|
|
252
279
|
|
|
253
|
-
if
|
|
280
|
+
if redirect_output_to_log_file:
|
|
254
281
|
# Announce that we are redirecting logging, and where it will now go.
|
|
255
282
|
# This is only important if we are trying to manually trace a faulty worker invocation.
|
|
256
283
|
logger.debug("Redirecting logging to %s", tempWorkerLogPath)
|
|
@@ -313,17 +340,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
313
340
|
# Load the JobDescription
|
|
314
341
|
##########################################
|
|
315
342
|
|
|
316
|
-
jobDesc =
|
|
343
|
+
jobDesc = job_store.load_job(job_store_id)
|
|
317
344
|
logger.debug("Parsed job description")
|
|
318
345
|
|
|
319
346
|
##########################################
|
|
320
347
|
# Cleanup from any earlier invocation of the job
|
|
321
348
|
##########################################
|
|
322
349
|
|
|
323
|
-
if jobDesc.
|
|
350
|
+
if not jobDesc.has_body():
|
|
324
351
|
logger.debug("Job description has no body to run.")
|
|
325
352
|
# Cleanup jobs already finished
|
|
326
|
-
jobDesc.clear_nonexistent_dependents(
|
|
353
|
+
jobDesc.clear_nonexistent_dependents(job_store)
|
|
327
354
|
logger.debug("Cleaned up any references to completed successor jobs")
|
|
328
355
|
|
|
329
356
|
# This cleans the old log file which may
|
|
@@ -331,8 +358,8 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
331
358
|
oldLogFile = jobDesc.logJobStoreFileID
|
|
332
359
|
if oldLogFile is not None:
|
|
333
360
|
jobDesc.logJobStoreFileID = None
|
|
334
|
-
|
|
335
|
-
|
|
361
|
+
job_store.update_job(jobDesc) # Update first, before deleting any files
|
|
362
|
+
job_store.delete_file(oldLogFile)
|
|
336
363
|
|
|
337
364
|
##########################################
|
|
338
365
|
# If a checkpoint exists, restart from the checkpoint
|
|
@@ -350,13 +377,13 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
350
377
|
if jobDesc.remainingTryCount < 0:
|
|
351
378
|
raise RuntimeError("The try count of the job cannot be negative.")
|
|
352
379
|
jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
|
|
353
|
-
jobDesc.restartCheckpoint(
|
|
380
|
+
jobDesc.restartCheckpoint(job_store)
|
|
354
381
|
# Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
|
|
355
382
|
# because of the job being a checkpoint
|
|
356
383
|
else:
|
|
357
384
|
logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
|
|
358
385
|
#Delete any remnant files
|
|
359
|
-
list(map(
|
|
386
|
+
list(map(job_store.delete_file, list(filter(job_store.file_exists, jobDesc.checkpointFilesToDelete))))
|
|
360
387
|
|
|
361
388
|
##########################################
|
|
362
389
|
#Setup the stats, if requested
|
|
@@ -365,7 +392,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
365
392
|
if config.stats:
|
|
366
393
|
# Remember the cores from the first job, which is how many we have reserved for us.
|
|
367
394
|
statsDict.workers.requested_cores = jobDesc.cores
|
|
368
|
-
startClock = get_total_cpu_time()
|
|
395
|
+
startClock = ResourceMonitor.get_total_cpu_time()
|
|
369
396
|
|
|
370
397
|
startTime = time.time()
|
|
371
398
|
while True:
|
|
@@ -375,20 +402,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
375
402
|
|
|
376
403
|
logger.info("Working on job %s", jobDesc)
|
|
377
404
|
|
|
378
|
-
if jobDesc.
|
|
379
|
-
if not jobDesc.command.startswith("_toil "):
|
|
380
|
-
raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
|
|
381
|
-
logger.debug("Got a command to run: %s" % jobDesc.command)
|
|
405
|
+
if jobDesc.has_body():
|
|
382
406
|
# Load the job. It will use the same JobDescription we have been using.
|
|
383
|
-
job = Job.loadJob(
|
|
407
|
+
job = Job.loadJob(job_store, jobDesc)
|
|
384
408
|
if isinstance(jobDesc, CheckpointJobDescription):
|
|
385
|
-
# If it is a checkpoint job,
|
|
386
|
-
jobDesc.
|
|
409
|
+
# If it is a checkpoint job, set the checkpoint
|
|
410
|
+
jobDesc.set_checkpoint()
|
|
387
411
|
|
|
388
412
|
logger.info("Loaded body %s from description %s", job, jobDesc)
|
|
389
413
|
|
|
414
|
+
if debug_flags:
|
|
415
|
+
for flag in debug_flags:
|
|
416
|
+
logger.debug("Turning on debug flag %s on job", flag)
|
|
417
|
+
job.set_debug_flag(flag)
|
|
418
|
+
|
|
390
419
|
# Create a fileStore object for the job
|
|
391
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
420
|
+
fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
|
|
392
421
|
caching=config.caching)
|
|
393
422
|
with job._executor(stats=statsDict if config.stats else None,
|
|
394
423
|
fileStore=fileStore):
|
|
@@ -406,11 +435,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
406
435
|
# wants across multiple Toil versions. We also
|
|
407
436
|
# still pass a jobGraph argument to placate old
|
|
408
437
|
# versions of Cactus.
|
|
409
|
-
job._runner(jobGraph=None, jobStore=
|
|
438
|
+
job._runner(jobGraph=None, jobStore=job_store, fileStore=fileStore, defer=defer)
|
|
410
439
|
|
|
411
440
|
# When the executor for the job finishes it will
|
|
412
|
-
# kick off a commit with the
|
|
413
|
-
#
|
|
441
|
+
# kick off a commit with the link to the job body
|
|
442
|
+
# cut.
|
|
414
443
|
|
|
415
444
|
# Accumulate messages from this job & any subsequent chained jobs
|
|
416
445
|
statsDict.workers.logs_to_leader += fileStore.logging_messages
|
|
@@ -419,9 +448,9 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
419
448
|
logger.info("Completed body for %s", jobDesc)
|
|
420
449
|
|
|
421
450
|
else:
|
|
422
|
-
#The
|
|
423
|
-
#
|
|
424
|
-
#been scheduled after a failure to cleanup
|
|
451
|
+
# The body may not be attached, in which case the
|
|
452
|
+
# JobDescription is either a shell ready to be deleted or has
|
|
453
|
+
# been scheduled after a failure to cleanup
|
|
425
454
|
logger.debug("No user job to run, so finishing")
|
|
426
455
|
break
|
|
427
456
|
|
|
@@ -431,7 +460,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
431
460
|
##########################################
|
|
432
461
|
#Establish if we can run another job within the worker
|
|
433
462
|
##########################################
|
|
434
|
-
successor = nextChainable(jobDesc,
|
|
463
|
+
successor = nextChainable(jobDesc, job_store, config)
|
|
435
464
|
if successor is None or config.disableChaining:
|
|
436
465
|
# Can't chain any more jobs. We are going to stop.
|
|
437
466
|
|
|
@@ -470,7 +499,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
470
499
|
|
|
471
500
|
# Build a fileStore to update the job and commit the replacement.
|
|
472
501
|
# TODO: can we have a commit operation without an entire FileStore???
|
|
473
|
-
fileStore = AbstractFileStore.createFileStore(
|
|
502
|
+
fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
|
|
474
503
|
caching=config.caching)
|
|
475
504
|
|
|
476
505
|
# Update blockFn to wait for that commit operation.
|
|
@@ -485,7 +514,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
485
514
|
#Finish up the stats
|
|
486
515
|
##########################################
|
|
487
516
|
if config.stats:
|
|
488
|
-
totalCPUTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
|
|
517
|
+
totalCPUTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
|
|
489
518
|
statsDict.workers.time = str(time.time() - startTime)
|
|
490
519
|
statsDict.workers.clock = str(totalCPUTime - startClock)
|
|
491
520
|
statsDict.workers.memory = str(totalMemoryUsage)
|
|
@@ -501,16 +530,20 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
501
530
|
|
|
502
531
|
|
|
503
532
|
# log the worker log path here so that if the file is truncated the path can still be found
|
|
504
|
-
if
|
|
505
|
-
logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log",
|
|
533
|
+
if redirect_output_to_log_file:
|
|
534
|
+
logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", local_worker_temp_dir)
|
|
506
535
|
|
|
507
536
|
logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
|
|
508
537
|
|
|
509
538
|
##########################################
|
|
510
539
|
#Trapping where worker goes wrong
|
|
511
540
|
##########################################
|
|
541
|
+
except DebugStoppingPointReached:
|
|
542
|
+
# Job wants the worker to stop for debugging
|
|
543
|
+
raise
|
|
512
544
|
except BaseException as e: #Case that something goes wrong in worker, or we are asked to stop
|
|
513
|
-
|
|
545
|
+
if not isinstance(e, SystemExit):
|
|
546
|
+
logger.critical("Worker crashed with traceback:\n%s", traceback.format_exc())
|
|
514
547
|
logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
|
|
515
548
|
if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
|
|
516
549
|
# We need to inform the leader that this is a CWL workflow problem
|
|
@@ -551,7 +584,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
551
584
|
|
|
552
585
|
# Clobber any garbage state we have for this job from failing with
|
|
553
586
|
# whatever good state is still stored in the JobStore
|
|
554
|
-
jobDesc =
|
|
587
|
+
jobDesc = job_store.load_job(job_store_id)
|
|
555
588
|
# Remember that we failed
|
|
556
589
|
jobAttemptFailed = True
|
|
557
590
|
|
|
@@ -563,7 +596,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
563
596
|
# Flush at the Python level
|
|
564
597
|
sys.stdout.flush()
|
|
565
598
|
sys.stderr.flush()
|
|
566
|
-
if
|
|
599
|
+
if redirect_output_to_log_file:
|
|
567
600
|
# Flush at the OS level
|
|
568
601
|
os.fsync(1)
|
|
569
602
|
os.fsync(2)
|
|
@@ -590,11 +623,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
590
623
|
# relative to the end (since Python won't decode Unicode backward, or even
|
|
591
624
|
# interpret seek offsets in characters for us). TODO: We may get invalid or
|
|
592
625
|
# just different Unicode by breaking up a character at the boundary!
|
|
593
|
-
if jobAttemptFailed and
|
|
594
|
-
jobDesc.logJobStoreFileID = logJobStoreFileID =
|
|
626
|
+
if jobAttemptFailed and redirect_output_to_log_file:
|
|
627
|
+
jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
|
|
595
628
|
jobDesc.jobStoreID, cleanup=True
|
|
596
629
|
)
|
|
597
|
-
with
|
|
630
|
+
with job_store.update_file_stream(logJobStoreFileID) as w:
|
|
598
631
|
with open(tempWorkerLogPath, 'rb') as f:
|
|
599
632
|
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
|
|
600
633
|
if logFileByteReportLimit > 0:
|
|
@@ -604,10 +637,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
604
637
|
# Dump the possibly-invalid-Unicode bytes into the log file
|
|
605
638
|
w.write(f.read()) # TODO load file using a buffer
|
|
606
639
|
# Commit log file reference back to JobStore
|
|
607
|
-
|
|
640
|
+
job_store.update_job(jobDesc)
|
|
608
641
|
|
|
609
642
|
elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
|
|
610
|
-
and
|
|
643
|
+
and redirect_output_to_log_file): # write log messages
|
|
611
644
|
with open(tempWorkerLogPath, 'rb') as logFile:
|
|
612
645
|
if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
|
|
613
646
|
if logFileByteReportLimit > 0:
|
|
@@ -624,7 +657,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
624
657
|
# We have stats/logging to report back.
|
|
625
658
|
# We report even if the job attempt failed.
|
|
626
659
|
# TODO: Will that upset analysis of the stats?
|
|
627
|
-
|
|
660
|
+
job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
|
|
628
661
|
|
|
629
662
|
# Remove the temp dir
|
|
630
663
|
cleanUp = config.cleanWorkDir
|
|
@@ -642,14 +675,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
|
|
|
642
675
|
os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
|
|
643
676
|
except PermissionError as e:
|
|
644
677
|
logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
|
|
645
|
-
shutil.rmtree(
|
|
678
|
+
shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
|
|
646
679
|
|
|
647
680
|
# This must happen after the log file is done with, else there is no place to put the log
|
|
648
681
|
if (not jobAttemptFailed) and jobDesc.is_subtree_done():
|
|
649
682
|
for merged_in in jobDesc.get_chain():
|
|
650
683
|
# We can now safely get rid of the JobDescription, and all jobs it chained up
|
|
651
|
-
|
|
652
|
-
|
|
684
|
+
job_store.delete_job(merged_in.job_store_id)
|
|
685
|
+
|
|
653
686
|
|
|
654
687
|
if jobAttemptFailed:
|
|
655
688
|
return failure_exit_code
|
|
@@ -719,25 +752,19 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
|
|
|
719
752
|
def main(argv: Optional[List[str]] = None) -> None:
|
|
720
753
|
if argv is None:
|
|
721
754
|
argv = sys.argv
|
|
722
|
-
|
|
723
755
|
# Parse our command line
|
|
724
756
|
options = parse_args(argv)
|
|
725
757
|
|
|
726
|
-
# Parse input args
|
|
727
|
-
jobName = argv[1]
|
|
728
|
-
jobStoreLocator = argv[2]
|
|
729
|
-
jobStoreID = argv[3]
|
|
730
|
-
|
|
731
758
|
##########################################
|
|
732
759
|
#Load the jobStore/config file
|
|
733
760
|
##########################################
|
|
734
761
|
|
|
735
|
-
|
|
736
|
-
config =
|
|
762
|
+
job_store = Toil.resumeJobStore(options.jobStoreLocator)
|
|
763
|
+
config = job_store.config
|
|
737
764
|
|
|
738
765
|
with in_contexts(options.context):
|
|
739
766
|
# Call the worker
|
|
740
|
-
exit_code = workerScript(
|
|
767
|
+
exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)
|
|
741
768
|
|
|
742
769
|
# Exit with its return value
|
|
743
770
|
sys.exit(exit_code)
|
|
@@ -202,3 +202,28 @@
|
|
|
202
202
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
203
203
|
See the License for the specific language governing permissions and
|
|
204
204
|
limitations under the License.
|
|
205
|
+
|
|
206
|
+
All code in this repository excluding src/toil/statsAndLogging.py::install_log_color is under the Apache License as outlined directly above.
|
|
207
|
+
Some code in src/toil/statsAndLogging.py::install_log_color is under the MiniWDL MIT License as outlined directly below.
|
|
208
|
+
|
|
209
|
+
MIT License
|
|
210
|
+
|
|
211
|
+
Copyright (c) 2018 Chan Zuckerberg Initiative
|
|
212
|
+
|
|
213
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
214
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
215
|
+
in the Software without restriction, including without limitation the rights
|
|
216
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
217
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
218
|
+
furnished to do so, subject to the following conditions:
|
|
219
|
+
|
|
220
|
+
The above copyright notice and this permission notice shall be included in all
|
|
221
|
+
copies or substantial portions of the Software.
|
|
222
|
+
|
|
223
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
224
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
225
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
226
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
227
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
228
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
229
|
+
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toil
|
|
3
|
-
Version:
|
|
3
|
+
Version: 7.0.0
|
|
4
4
|
Summary: Pipeline management software for clusters.
|
|
5
5
|
Home-page: https://github.com/DataBiosphere/toil
|
|
6
6
|
Author: Benedict Paten and the Toil community
|
|
@@ -29,30 +29,32 @@ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
|
29
29
|
Classifier: Topic :: System :: Distributed Computing
|
|
30
30
|
Classifier: Topic :: Utilities
|
|
31
31
|
Requires-Python: >=3.8
|
|
32
|
+
Description-Content-Type: text/x-rst
|
|
32
33
|
License-File: LICENSE
|
|
33
34
|
Requires-Dist: dill <0.4,>=0.3.2
|
|
34
|
-
Requires-Dist: requests
|
|
35
|
+
Requires-Dist: requests <=2.31.0
|
|
35
36
|
Requires-Dist: docker <8,>=6.1.0
|
|
36
37
|
Requires-Dist: urllib3 <3,>=1.26.0
|
|
37
38
|
Requires-Dist: python-dateutil
|
|
38
39
|
Requires-Dist: psutil <6,>=3.0.1
|
|
39
40
|
Requires-Dist: PyPubSub <5,>=4.0.3
|
|
40
41
|
Requires-Dist: addict <2.5,>=2.2.1
|
|
41
|
-
Requires-Dist: pytz >=2012
|
|
42
42
|
Requires-Dist: enlighten <2,>=1.5.2
|
|
43
43
|
Requires-Dist: configargparse <2,>=1.7
|
|
44
44
|
Requires-Dist: ruamel.yaml >=0.15
|
|
45
45
|
Requires-Dist: pyyaml <7,>=6
|
|
46
46
|
Requires-Dist: typing-extensions <5,>=4.6.2
|
|
47
|
+
Requires-Dist: coloredlogs <16,>=15
|
|
47
48
|
Provides-Extra: all
|
|
48
|
-
Requires-Dist:
|
|
49
|
-
Requires-Dist: boto3-stubs[boto3,iam,s3,sdb,sts] <2,>=1.28.3.post2 ; extra == 'all'
|
|
49
|
+
Requires-Dist: boto3-stubs[autoscaling,boto3,ec2,iam,s3,sdb,sts] <2,>=1.28.3.post2 ; extra == 'all'
|
|
50
50
|
Requires-Dist: mypy-boto3-iam <2,>=1.28.3.post2 ; extra == 'all'
|
|
51
|
-
Requires-Dist:
|
|
52
|
-
Requires-Dist:
|
|
51
|
+
Requires-Dist: mypy-boto3-s3 <2,>=1.28.3.post2 ; extra == 'all'
|
|
52
|
+
Requires-Dist: moto <6,>=5.0.3 ; extra == 'all'
|
|
53
|
+
Requires-Dist: ec2-metadata <3 ; extra == 'all'
|
|
54
|
+
Requires-Dist: cwltool ==3.1.20240508115724 ; extra == 'all'
|
|
53
55
|
Requires-Dist: schema-salad <9,>=8.4.20230128170514 ; extra == 'all'
|
|
54
|
-
Requires-Dist: galaxy-tool-util <
|
|
55
|
-
Requires-Dist: galaxy-util <
|
|
56
|
+
Requires-Dist: galaxy-tool-util <25 ; extra == 'all'
|
|
57
|
+
Requires-Dist: galaxy-util <25 ; extra == 'all'
|
|
56
58
|
Requires-Dist: ruamel.yaml <=0.19,>=0.15 ; extra == 'all'
|
|
57
59
|
Requires-Dist: ruamel.yaml.clib >=0.2.6 ; extra == 'all'
|
|
58
60
|
Requires-Dist: networkx !=2.8.1,<4 ; extra == 'all'
|
|
@@ -66,28 +68,29 @@ Requires-Dist: kubernetes-stubs ==v22.6.0post1 ; extra == 'all'
|
|
|
66
68
|
Requires-Dist: types-urllib3 ; extra == 'all'
|
|
67
69
|
Requires-Dist: types-PyYAML ; extra == 'all'
|
|
68
70
|
Requires-Dist: idna >=2 ; extra == 'all'
|
|
69
|
-
Requires-Dist: miniwdl ==1.
|
|
71
|
+
Requires-Dist: miniwdl ==1.12.0 ; extra == 'all'
|
|
70
72
|
Requires-Dist: wdlparse ==0.1.0 ; extra == 'all'
|
|
71
73
|
Requires-Dist: connexion[swagger-ui] <3,>=2.10.0 ; extra == 'all'
|
|
72
74
|
Requires-Dist: flask <3,>=2.0 ; extra == 'all'
|
|
73
75
|
Requires-Dist: werkzeug <3,>=2.0 ; extra == 'all'
|
|
74
|
-
Requires-Dist: flask-cors ==4.0.
|
|
75
|
-
Requires-Dist: gunicorn ==
|
|
76
|
+
Requires-Dist: flask-cors ==4.0.1 ; extra == 'all'
|
|
77
|
+
Requires-Dist: gunicorn ==22.0.0 ; extra == 'all'
|
|
76
78
|
Requires-Dist: celery <6,>=5.1.0 ; extra == 'all'
|
|
77
79
|
Requires-Dist: wes-service <5,>=4.0.0 ; extra == 'all'
|
|
78
80
|
Requires-Dist: ruamel.yaml <0.19,>=0.15 ; extra == 'all'
|
|
79
81
|
Requires-Dist: pymesos <0.4,>=0.3.15 ; (python_version < "3.11") and extra == 'all'
|
|
80
82
|
Requires-Dist: graphlib-backport ==1.0 ; (python_version < "3.9") and extra == 'all'
|
|
81
83
|
Provides-Extra: aws
|
|
82
|
-
Requires-Dist:
|
|
83
|
-
Requires-Dist: boto3-stubs[boto3,iam,s3,sdb,sts] <2,>=1.28.3.post2 ; extra == 'aws'
|
|
84
|
+
Requires-Dist: boto3-stubs[autoscaling,boto3,ec2,iam,s3,sdb,sts] <2,>=1.28.3.post2 ; extra == 'aws'
|
|
84
85
|
Requires-Dist: mypy-boto3-iam <2,>=1.28.3.post2 ; extra == 'aws'
|
|
85
|
-
Requires-Dist:
|
|
86
|
+
Requires-Dist: mypy-boto3-s3 <2,>=1.28.3.post2 ; extra == 'aws'
|
|
87
|
+
Requires-Dist: moto <6,>=5.0.3 ; extra == 'aws'
|
|
88
|
+
Requires-Dist: ec2-metadata <3 ; extra == 'aws'
|
|
86
89
|
Provides-Extra: cwl
|
|
87
|
-
Requires-Dist: cwltool ==3.1.
|
|
90
|
+
Requires-Dist: cwltool ==3.1.20240508115724 ; extra == 'cwl'
|
|
88
91
|
Requires-Dist: schema-salad <9,>=8.4.20230128170514 ; extra == 'cwl'
|
|
89
|
-
Requires-Dist: galaxy-tool-util <
|
|
90
|
-
Requires-Dist: galaxy-util <
|
|
92
|
+
Requires-Dist: galaxy-tool-util <25 ; extra == 'cwl'
|
|
93
|
+
Requires-Dist: galaxy-util <25 ; extra == 'cwl'
|
|
91
94
|
Requires-Dist: ruamel.yaml <=0.19,>=0.15 ; extra == 'cwl'
|
|
92
95
|
Requires-Dist: ruamel.yaml.clib >=0.2.6 ; extra == 'cwl'
|
|
93
96
|
Requires-Dist: networkx !=2.8.1,<4 ; extra == 'cwl'
|
|
@@ -112,13 +115,44 @@ Provides-Extra: server
|
|
|
112
115
|
Requires-Dist: connexion[swagger-ui] <3,>=2.10.0 ; extra == 'server'
|
|
113
116
|
Requires-Dist: flask <3,>=2.0 ; extra == 'server'
|
|
114
117
|
Requires-Dist: werkzeug <3,>=2.0 ; extra == 'server'
|
|
115
|
-
Requires-Dist: flask-cors ==4.0.
|
|
116
|
-
Requires-Dist: gunicorn ==
|
|
118
|
+
Requires-Dist: flask-cors ==4.0.1 ; extra == 'server'
|
|
119
|
+
Requires-Dist: gunicorn ==22.0.0 ; extra == 'server'
|
|
117
120
|
Requires-Dist: celery <6,>=5.1.0 ; extra == 'server'
|
|
118
121
|
Requires-Dist: wes-service <5,>=4.0.0 ; extra == 'server'
|
|
119
122
|
Requires-Dist: ruamel.yaml <0.19,>=0.15 ; extra == 'server'
|
|
120
123
|
Provides-Extra: wdl
|
|
121
|
-
Requires-Dist: miniwdl ==1.
|
|
124
|
+
Requires-Dist: miniwdl ==1.12.0 ; extra == 'wdl'
|
|
122
125
|
Requires-Dist: wdlparse ==0.1.0 ; extra == 'wdl'
|
|
123
126
|
Requires-Dist: graphlib-backport ==1.0 ; (python_version < "3.9") and extra == 'wdl'
|
|
124
127
|
|
|
128
|
+
.. image:: https://badges.gitter.im/bd2k-genomics-toil/Lobby.svg
|
|
129
|
+
:alt: Join the chat at https://gitter.im/bd2k-genomics-toil/Lobby
|
|
130
|
+
:target: https://gitter.im/bd2k-genomics-toil/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
|
|
131
|
+
|
|
132
|
+
Toil is a scalable, efficient, cross-platform (Linux & macOS) pipeline management system,
|
|
133
|
+
written entirely in Python, and designed around the principles of functional
|
|
134
|
+
programming. It supports running workflows written in either Common Workflow Language (`CWL`_) 1.0-1.2 or
|
|
135
|
+
Workflow Description Language (`WDL`_) 1.0-1.1, as well as having its own rich Python API for writing workflows against.
|
|
136
|
+
It supports running workflows locally on your system (e.g. a laptop), on an HPC cluster, or in the cloud.
|
|
137
|
+
|
|
138
|
+
* Check the `website`_ for a description of Toil and its features.
|
|
139
|
+
* Full documentation for the latest stable release can be found at
|
|
140
|
+
`Read the Docs`_.
|
|
141
|
+
* Please subscribe to the low-volume `announce`_ mailing list so we can keep you informed.
|
|
142
|
+
* Google Groups discussion `forum`_
|
|
143
|
+
* See our occasional `blog`_ for tutorials.
|
|
144
|
+
* Use the `biostars`_ channel for discussion.
|
|
145
|
+
|
|
146
|
+
.. _website: http://toil.ucsc-cgl.org/
|
|
147
|
+
.. _Read the Docs: https://toil.readthedocs.io/en/latest
|
|
148
|
+
.. _announce: https://groups.google.com/forum/#!forum/toil-announce
|
|
149
|
+
.. _forum: https://groups.google.com/forum/#!forum/toil-community
|
|
150
|
+
.. _blog: https://toilpipelines.wordpress.com/
|
|
151
|
+
.. _biostars: https://www.biostars.org/t/toil/
|
|
152
|
+
.. _CWL: https://www.commonwl.org/
|
|
153
|
+
.. _WDL: https://openwdl.org/
|
|
154
|
+
|
|
155
|
+
Notes:
|
|
156
|
+
|
|
157
|
+
* Toil moved from https://github.com/BD2KGenomics/toil to https://github.com/DataBiosphere/toil on July 5th, 2018.
|
|
158
|
+
* Toil dropped Python 2.7 support on February 13, 2020 (the last working py2.7 version is 3.24.0).
|