toil 6.0.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +19 -4
- toil/batchSystems/abstractGridEngineBatchSystem.py +22 -22
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/slurm.py +85 -14
- toil/bus.py +38 -0
- toil/common.py +20 -18
- toil/cwl/cwltoil.py +81 -63
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +53 -4
- toil/fileStores/cachingFileStore.py +4 -20
- toil/fileStores/nonCachingFileStore.py +5 -14
- toil/job.py +46 -30
- toil/jobStores/abstractJobStore.py +21 -23
- toil/jobStores/aws/utils.py +5 -4
- toil/jobStores/fileJobStore.py +1 -1
- toil/leader.py +17 -14
- toil/lib/conversions.py +19 -0
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +28 -2
- toil/lib/resources.py +8 -1
- toil/lib/threading.py +27 -12
- toil/options/common.py +5 -7
- toil/options/wdl.py +5 -0
- toil/provisioners/abstractProvisioner.py +8 -0
- toil/statsAndLogging.py +36 -8
- toil/test/batchSystems/test_slurm.py +21 -6
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +243 -151
- toil/test/docs/scriptsTest.py +2 -2
- toil/test/jobStores/jobStoreTest.py +7 -5
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/clusterTest.py +9 -8
- toil/test/utils/toilDebugTest.py +1 -1
- toil/test/utils/utilsTest.py +3 -3
- toil/test/wdl/wdltoil_test.py +91 -16
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilStats.py +309 -266
- toil/utils/toilStatus.py +1 -1
- toil/version.py +9 -9
- toil/wdl/wdltoil.py +341 -189
- toil/worker.py +31 -16
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/METADATA +6 -7
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/RECORD +51 -47
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/WHEEL +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/cwl/cwltoil.py
CHANGED
@@ -34,27 +34,25 @@ import stat
 import sys
 import textwrap
 import uuid
-from tempfile import NamedTemporaryFile, gettempdir
+from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir
 from threading import Thread
-from typing import (
-    Sequence,
-)
+from typing import (IO,
+                    Any,
+                    Callable,
+                    Dict,
+                    Iterator,
+                    List,
+                    Mapping,
+                    MutableMapping,
+                    MutableSequence,
+                    Optional,
+                    Sequence,
+                    TextIO,
+                    Tuple,
+                    Type,
+                    TypeVar,
+                    Union,
+                    cast)
 from urllib.parse import quote, unquote, urlparse, urlsplit
 
 import cwl_utils.errors
@@ -68,36 +66,30 @@ import cwltool.load_tool
 import cwltool.main
 import cwltool.resolver
 import schema_salad.ref_resolver
-from configargparse import
+from configargparse import SUPPRESS, ArgParser, Namespace
 from cwltool.loghandler import _logger as cwllogger
 from cwltool.loghandler import defaultStreamHandler
 from cwltool.mpi import MpiConfig
 from cwltool.mutation import MutationManager
 from cwltool.pathmapper import MapperEnt, PathMapper
-from cwltool.process import (
-    shortname,
-)
+from cwltool.process import (Process,
+                             add_sizes,
+                             compute_checksums,
+                             fill_in_defaults,
+                             shortname)
 from cwltool.secrets import SecretStore
-from cwltool.software_requirements import (
-    get_container_from_software_requirements,
-)
+from cwltool.software_requirements import (DependenciesConfiguration,
+                                           get_container_from_software_requirements)
 from cwltool.stdfsaccess import StdFsAccess, abspath
-from cwltool.utils import (
-    visit_class,
-)
+from cwltool.utils import (CWLObjectType,
+                           CWLOutputType,
+                           DirectoryType,
+                           adjustDirObjs,
+                           aslist,
+                           downloadHttpFile,
+                           get_listing,
+                           normalizeFilesDirs,
+                           visit_class)
 from ruamel.yaml.comments import CommentedMap, CommentedSeq
 from schema_salad.avro.schema import Names
 from schema_salad.exceptions import ValidationException
@@ -110,18 +102,17 @@ from toil.common import Toil, addOptions
 from toil.cwl import check_cwltool_version
 
 check_cwltool_version()
-from toil.cwl.utils import (
-    visit_cwl_class_and_reduce,
-)
+from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+                            CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+                            download_structure,
+                            get_from_structure,
+                            visit_cwl_class_and_reduce)
 from toil.exceptions import FailedJobsException
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import AcceleratorRequirement, Job, Promise, Promised, unwrap
-from toil.jobStores.abstractJobStore import AbstractJobStore,
+from toil.jobStores.abstractJobStore import (AbstractJobStore,
+                                             NoSuchFileException)
 from toil.jobStores.fileJobStore import FileJobStore
 from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
 from toil.lib.io import mkdtemp
@@ -1987,7 +1978,7 @@ def upload_file(
 
     Uploads local files to the Toil file store, and sets their location to a
     reference to the toil file store.
-
+
     Unless skip_remote is set, downloads remote files into the file store and
     sets their locations to references into the file store as well.
     """
@@ -2614,6 +2605,13 @@ class CWLJob(CWLNamedJob):
             streaming_allowed=runtime_context.streaming_allowed,
         )
 
+        # Collect standard output and standard error somewhere if they don't go to files.
+        # We need to keep two FDs to these because cwltool will close what we give it.
+        default_stdout = TemporaryFile()
+        runtime_context.default_stdout = os.fdopen(os.dup(default_stdout.fileno()), 'wb')
+        default_stderr = TemporaryFile()
+        runtime_context.default_stderr = os.fdopen(os.dup(default_stderr.fileno()), 'wb')
+
         process_uuid = uuid.uuid4()  # noqa F841
         started_at = datetime.datetime.now()  # noqa F841
 
@@ -2622,13 +2620,34 @@
         logger.debug("Running tool %s with order: %s", self.cwltool, self.cwljob)
 
         runtime_context.name = self.description.unitName
+
+        status = "did_not_run"
+        try:
+            output, status = ToilSingleJobExecutor().execute(
+                process=self.cwltool,
+                job_order_object=cwljob,
+                runtime_context=runtime_context,
+                logger=cwllogger,
+            )
+        finally:
+            ended_at = datetime.datetime.now()  # noqa F841
+
+            # Log any output/error data
+            default_stdout.seek(0, os.SEEK_END)
+            if default_stdout.tell() > 0:
+                default_stdout.seek(0)
+                file_store.log_user_stream(self.description.unitName + '.stdout', default_stdout)
+                if status != "success":
+                    default_stdout.seek(0)
+                    logger.error("Failed command standard output:\n%s", default_stdout.read().decode("utf-8", errors="replace"))
+            default_stderr.seek(0, os.SEEK_END)
+            if default_stderr.tell():
+                default_stderr.seek(0)
+                file_store.log_user_stream(self.description.unitName + '.stderr', default_stderr)
+                if status != "success":
+                    default_stderr.seek(0)
+                    logger.error("Failed command standard error:\n%s", default_stderr.read().decode("utf-8", errors="replace"))
+
 
         if status != "success":
             raise cwl_utils.errors.WorkflowException(status)
 
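The os.dup() calls in the two hunks above are load-bearing: cwltool closes the file objects it is handed, so Toil hands it duplicated descriptors and keeps the original TemporaryFile objects to read back afterwards. A minimal standalone sketch of that pattern (the names here are illustrative, not from the diff):

import os
from tempfile import TemporaryFile

capture = TemporaryFile()  # stays open so the captured output can be read back
# Hand off a second descriptor to the same file; the callee may close it freely.
handed_off = os.fdopen(os.dup(capture.fileno()), 'wb')

handed_off.write(b"tool output\n")
handed_off.close()  # simulates cwltool closing the stream it was given

capture.seek(0)
print(capture.read())  # b'tool output\n' -- still readable via the original handle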
@@ -3352,12 +3371,12 @@ def determine_load_listing(
 
     1. no_listing: DIRECTORY_NAME.listing will be undefined.
         e.g.
-
+
             inputs.DIRECTORY_NAME.listing == unspecified
 
     2. shallow_listing: DIRECTORY_NAME.listing will return a list one level
        deep of DIRECTORY_NAME's contents.
-        e.g.
+        e.g.
 
            inputs.DIRECTORY_NAME.listing == [items in directory]
            inputs.DIRECTORY_NAME.listing[0].listing == undefined
@@ -3576,7 +3595,6 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int:
         dependencies_configuration = DependenciesConfiguration(options)
         job_script_provider = dependencies_configuration
 
-    options.default_container = None
     runtime_context = cwltool.context.RuntimeContext(vars(options))
     runtime_context.toplevel = True  # enable discovery of secondaryFiles
     runtime_context.find_default_container = functools.partial(
@@ -3789,7 +3807,7 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int:
             Callable[[str], FileID],
             functools.partial(toil.import_file, symlink=True),
         )
-
+
         # Import all the input files, some of which may be missing optional
         # files.
         logger.info("Importing input files...")
toil/exceptions.py
CHANGED
@@ -36,7 +36,7 @@ class FailedJobsException(Exception):
                 for job_desc in failed_jobs:
                     if job_desc.logJobStoreFileID:
                         with job_desc.getLogFileHandle(job_store) as f:
-                            self.msg += "\n" + StatsAndLogging.formatLogStream(f, job_desc)
+                            self.msg += "\n" + StatsAndLogging.formatLogStream(f, f'Log from job "{job_desc}"')
             # catch failures to prepare more complex details and only return the basics
             except Exception:
                 logger.exception("Exception when compiling information about failed jobs")
toil/fileStores/abstractFileStore.py
CHANGED

@@ -37,11 +37,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import Toil, cacheDirName
+from toil.common import Toil, cacheDirName, getDirSizeRecursively
 from toil.fileStores import FileID
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
+from toil.lib.conversions import bytes2human
 from toil.lib.io import WriteWatchingStream, mkdtemp
 
 logger = logging.getLogger(__name__)
@@ -116,7 +117,8 @@ class AbstractFileStore(ABC):
             self.jobDesc.command.split()[1] if self.jobDesc.command else ""
         )
         self.waitForPreviousCommit = waitForPreviousCommit
-        self.
+        self.logging_messages: List[Dict[str, Union[int, str]]] = []
+        self.logging_user_streams: List[dict[str, str]] = []
         # Records file IDs of files deleted during the current job. Doesn't get
         # committed back until the job is completely successful, because if the
         # job is re-run it will need to be able to re-delete these files.
@@ -125,6 +127,8 @@ class AbstractFileStore(ABC):
         # Holds records of file ID, or file ID and local path, for reporting
         # the accessed files of failed jobs.
         self._accessLog: List[Tuple[str, ...]] = []
+        # Holds total bytes of observed disk usage for the last job run under open()
+        self._job_disk_used: Optional[int] = None
 
     @staticmethod
     def createFileStore(
@@ -188,6 +192,7 @@ class AbstractFileStore(ABC):
         :param job: The job instance of the toil job to run.
         """
         failed = True
+        job_requested_disk = job.disk
         try:
             yield
             failed = False
@@ -197,6 +202,33 @@ class AbstractFileStore(ABC):
             if failed:
                 self._dumpAccessLogs()
 
+            # See how much disk space is used at the end of the job.
+            # Not a real peak disk usage, but close enough to be useful for warning the user.
+            self._job_disk_used = getDirSizeRecursively(self.localTempDir)
+
+            # Report disk usage
+            percent: float = 0.0
+            if job_requested_disk and job_requested_disk > 0:
+                percent = float(self._job_disk_used) / job_requested_disk * 100
+            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(self._job_disk_used)}B [{self._job_disk_used}B] used, "
+                               f"{bytes2human(job_requested_disk)}B [{job_requested_disk}B] requested).")
+            if self._job_disk_used > job_requested_disk:
+                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
+                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
+                                   level=logging.WARNING)
+            else:
+                self.log_to_leader(disk_usage, level=logging.DEBUG)
+
+    def get_disk_usage(self) -> Optional[int]:
+        """
+        Get the number of bytes of disk used by the last job run under open().
+
+        Disk usage is measured at the end of the job.
+        TODO: Sample periodically and record peak usage.
+        """
+        return self._job_disk_used
+
+
     # Functions related to temp files and directories
     def getLocalTempDir(self) -> str:
         """
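To make the reporting above concrete, here is the arithmetic with made-up numbers (only the percent formula and the comparison come from the diff; the byte values are invented for illustration):

used = 3221225472       # bytes found under localTempDir when the job ends
requested = 2147483648  # job.disk, the job's disk requirement

percent = float(used) / requested * 100  # 150.00

# used > requested, so open() takes the log_to_leader(..., level=logging.WARNING)
# branch; otherwise the same message is logged at DEBUG.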
@@ -611,13 +643,30 @@ class AbstractFileStore(ABC):
         :param level: The logging level.
         """
         logger.log(level=level, msg=("LOG-TO-MASTER: " + text))
-        self.
+        self.logging_messages.append(dict(text=text, level=level))
 
 
     @deprecated(new_function_name='export_file')
     def logToMaster(self, text: str, level: int = logging.INFO) -> None:
         self.log_to_leader(text, level)
-
+
+    def log_user_stream(self, name: str, stream: IO[bytes]) -> None:
+        """
+        Send a stream of UTF-8 text to the leader as a named log stream.
+
+        Useful for things like the error logs of Docker containers. The leader
+        will show it to the user or organize it appropriately for user-level
+        log information.
+
+        :param name: A hierarchical, .-delimited string.
+        :param stream: A stream of encoded text. Encoding errors will be
+               tolerated.
+        """
+
+        # Read the whole stream into memory
+        steam_data = stream.read().decode('utf-8', errors='replace')
+        # And remember it for the worker to fish out
+        self.logging_user_streams.append(dict(name=name, text=steam_data))
 
     # Functions run after the completion of the job.
     @abstractmethod
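A sketch of how job code might feed the new log_user_stream() API; the function and stream contents here are invented for illustration, and file_store stands for whichever AbstractFileStore subclass the job runs under:

import io

def forward_container_log(file_store):
    # Any binary stream of roughly-UTF-8 text can be handed over; decoding
    # errors are replaced rather than raised, and the whole stream is read
    # into memory for the worker to ship to the leader.
    container_log = io.BytesIO(b"starting tool...\ndone.\n")
    file_store.log_user_stream("my_workflow.my_step.docker", container_log)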
toil/fileStores/cachingFileStore.py
CHANGED

@@ -32,13 +32,12 @@ from typing import (Any,
                     Sequence,
                     Tuple)
 
-from toil.common import cacheDirName,
+from toil.common import cacheDirName, getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import (atomic_copy,
                          atomic_copyobj,
                          make_public_dir,
@@ -1041,7 +1040,8 @@ class CachingFileStore(AbstractFileStore):
         # Check the status of all jobs on this node. If there are jobs that started and died before
         # cleaning up their presence from the database, clean them up ourselves.
         self._removeDeadJobs(self.coordination_dir, self.con)
-        # Get the
+        # Get the disk requirement for the job, which we will use to know if we
+        # have filled the cache or not.
         self.jobDiskBytes = job.disk
 
         logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.',
@@ -1055,22 +1055,6 @@ class CachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            # See how much disk space is used at the end of the job.
-            # Not a real peak disk usage, but close enough to be useful for warning the user.
-            # TODO: Push this logic into the abstract file store
-            disk: int = getDirSizeRecursively(self.localTempDir)
-            percent: float = 0.0
-            if self.jobDiskBytes and self.jobDiskBytes > 0:
-                percent = float(disk) / self.jobDiskBytes * 100
-            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                               f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
-            if disk > self.jobDiskBytes:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
-
             # Go back up to the per-worker local temp directory.
             os.chdir(startingDir)
             self.cleanupInProgress = True
@@ -1095,7 +1079,7 @@ class CachingFileStore(AbstractFileStore):
         # Create an empty file to get an ID.
         # Make sure to pass along the file basename.
         # TODO: this empty file could leak if we die now...
-        fileID = self.jobStore.
+        fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName))
         # Work out who we are
         with self.as_process() as me:
 
toil/fileStores/nonCachingFileStore.py
CHANGED

@@ -35,13 +35,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import
+from toil.common import getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import make_public_dir, robust_rmtree
 from toil.lib.retry import ErrorCondition, retry
 from toil.lib.threading import get_process_name, process_name_exists
@@ -102,7 +101,6 @@ class NonCachingFileStore(AbstractFileStore):
 
     @contextmanager
     def open(self, job: Job) -> Generator[None, None, None]:
-        jobReqs = job.disk
         startingDir = os.getcwd()
         self.localTempDir: str = make_public_dir(in_directory=self.localTempDir)
         self._removeDeadJobs(self.coordination_dir)
@@ -116,16 +114,6 @@ class NonCachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            disk = getDirSizeRecursively(self.localTempDir)
-            percent = float(disk) / jobReqs * 100 if jobReqs > 0 else 0.0
-            disk_usage = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                          f"{bytes2human(jobReqs)}B [{jobReqs}B] requested).")
-            if disk > jobReqs:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
             os.chdir(startingDir)
             # Finally delete the job from the worker
             self.check_for_state_corruption()
@@ -362,7 +350,10 @@ class NonCachingFileStore(AbstractFileStore):
         jobState = {'jobProcessName': get_process_name(self.coordination_dir),
                     'jobName': self.jobName,
                     'jobDir': self.localTempDir}
+        try:
+            (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir)
+        except Exception as e:
+            raise RuntimeError("Could not make state file in " + self.coordination_dir) from e
         with open(fd, 'wb') as fH:
             # Write data
             dill.dump(jobState, fH)
toil/job.py
CHANGED
@@ -45,6 +45,7 @@ from typing import (TYPE_CHECKING,
 
 from configargparse import ArgParser
 
+from toil.bus import Names
 from toil.lib.compatibility import deprecated
 
 if sys.version_info >= (3, 8):
@@ -710,7 +711,6 @@ class Requirer:
             parts = ['no requirements']
         return ', '.join(parts)
 
-
 class JobDescription(Requirer):
     """
     Stores all the information that the Toil Leader ever needs to know about a Job.
@@ -814,11 +814,14 @@ class JobDescription(Requirer):
         # in the process of being committed.
         self.filesToDelete = []
 
-        # Holds
+        # Holds job names and IDs of the jobs that have been chained into this
         # job, and which should be deleted when this job finally is deleted
         # (but not before). The successor relationships with them will have
-        # been cut, so we need to hold onto them somehow.
+        # been cut, so we need to hold onto them somehow. Includes each
+        # chained-in job with its original ID, and also this job's ID with its
+        # original names, or is empty if no chaining has happened.
+        # The first job in the chain comes first in the list.
+        self._merged_job_names: List[Names] = []
 
         # The number of direct predecessors of the job. Needs to be stored at
         # the JobDescription to support dynamically-created jobs with multiple
@@ -867,9 +870,26 @@ class JobDescription(Requirer):
         # And we log who made the version (by PID)
         self._job_version_writer = 0
 
+    def get_names(self) -> Names:
+        """
+        Get the names and ID of this job as a named tuple.
+        """
+        return Names(self.jobName, self.unitName, self.displayName, self.displayName, str(self.jobStoreID))
+
+    def get_chain(self) -> List[Names]:
+        """
+        Get all the jobs that executed in this job's chain, in order.
+
+        For each job, produces a named tuple with its various names and its
+        original job store ID. The jobs in the chain are in execution order.
+
+        If the job hasn't run yet or it didn't chain, produces a one-item list.
+        """
+        if len(self._merged_job_names) == 0:
+            # We haven't merged so we're just ourselves.
+            return [self.get_names()]
+        else:
+            return list(self._merged_job_names)
 
     def serviceHostIDsInBatches(self) -> Iterator[List[str]]:
         """
@@ -1045,8 +1065,23 @@ class JobDescription(Requirer):
         self.successor_phases = old_phases + self.successor_phases
 
         # When deleting, we need to delete the files for our old ID, and also
-        # anything that needed to be deleted for the job we are replacing.
+        # anything that needed to be deleted for the job we are replacing. And
+        # we need to keep track of all the names of jobs involved for logging.
+
+        # We need first the job we are merging into if nothing has merged into
+        # it yet, then anything that already merged into it (including it),
+        # then us if nothing has yet merged into us, then anything that merged
+        # into us (including us)
+        _merged_job_names = []
+        if len(other._merged_job_names) == 0:
+            _merged_job_names.append(other.get_names())
+        _merged_job_names += other._merged_job_names
+        if len(self._merged_job_names) == 0:
+            _merged_job_names.append(self.get_names())
+        _merged_job_names += self._merged_job_names
+        self._merged_job_names = _merged_job_names
+
+        # Now steal its ID.
         self.jobStoreID = other.jobStoreID
 
         if len(other.filesToDelete) > 0:
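The ordering rule in replace() above is easier to see with a toy model; this stand-in uses plain strings where the real code stores toil.bus.Names tuples:

def merge_chain_names(self_chain, self_names, other_chain, other_names):
    # Predecessor (other) first, then us; a job whose chain is still empty
    # contributes just its own names.
    merged = []
    if len(other_chain) == 0:
        merged.append(other_names)
    merged += other_chain
    if len(self_chain) == 0:
        merged.append(self_names)
    merged += self_chain
    return merged

# "B" chains after (replaces) "A":
chain = merge_chain_names([], "B", [], "A")           # -> ['A', 'B']
# ...and then "C" chains after the merged job. Its chain is now non-empty,
# so the fourth argument is ignored:
chain = merge_chain_names([], "C", chain, "ignored")  # -> ['A', 'B', 'C']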
@@ -1263,26 +1298,6 @@ class JobDescription(Requirer):
         self._job_version_writer = os.getpid()
         logger.debug("New job version: %s", self)
 
-    def get_job_kind(self) -> str:
-        """
-        Return an identifying string for the job.
-
-        The result may contain spaces.
-
-        Returns: Either the unit name, job name, or display name, which identifies
-                 the kind of job it is to toil.
-                 Otherwise "Unknown Job" in case no identifier is available
-        """
-        if self.unitName:
-            return self.unitName
-        elif self.jobName:
-            return self.jobName
-        elif self.displayName:
-            return self.displayName
-        else:
-            return "Unknown Job"
-
-
 class ServiceJobDescription(JobDescription):
     """A description of a job that hosts a service."""
 
@@ -2787,7 +2802,8 @@ class Job:
                     clock=str(totalCpuTime - startClock),
                     class_name=self._jobName(),
                     memory=str(totalMemoryUsage),
-                    requested_cores=str(self.cores)
+                    requested_cores=str(self.cores),
+                    disk=str(fileStore.get_disk_usage())
                 )
             )
 
toil/jobStores/abstractJobStore.py
CHANGED

@@ -835,16 +835,17 @@ class AbstractJobStore(ABC):
         root_job_description = self.load_root_job()
         reachable_from_root: Set[str] = set()
 
-        for service_jobstore_id in root_job_description.services:
-            if haveJob(service_jobstore_id):
-                reachable_from_root.add(service_jobstore_id)
-        for merged_jobstore_id in root_job_description.merged_jobs:
+
+        for merged_in in root_job_description.get_chain():
+            # Add the job itself and any other jobs that chained with it.
             # Keep merged-in jobs around themselves, but don't bother
             # exploring them, since we took their successors.
-            reachable_from_root.add(
+            reachable_from_root.add(merged_in.job_store_id)
+        # add all of root's linked service jobs as well
+        for service_job_store_id in root_job_description.services:
+            if haveJob(service_job_store_id):
+                reachable_from_root.add(service_job_store_id)
+
 
         # Unprocessed means it might have successor jobs we need to add.
         unprocessed_job_descriptions = [root_job_description]
@@ -852,24 +853,21 @@ class AbstractJobStore(ABC):
         while unprocessed_job_descriptions:
             new_job_descriptions_to_process = []  # Reset.
             for job_description in unprocessed_job_descriptions:
-                for
-                )
+                for merged_in in job_description.get_chain():
+                    # Add the job and anything chained with it.
+                    # Keep merged-in jobs around themselves, but don't bother
+                    # exploring them, since we took their successors.
+                    reachable_from_root.add(merged_in.job_store_id)
+                for successor_job_store_id in job_description.allSuccessors():
+                    if successor_job_store_id not in reachable_from_root and haveJob(successor_job_store_id):
+                        successor_job_description = getJobDescription(successor_job_store_id)
 
                         # Add all of the successor's linked service jobs as well.
-                        for
-                        if haveJob(
-                        reachable_from_root.add(
+                        for service_job_store_id in successor_job_description.services:
+                            if haveJob(service_job_store_id):
+                                reachable_from_root.add(service_job_store_id)
 
                         new_job_descriptions_to_process.append(successor_job_description)
-                for merged_jobstore_id in job_description.merged_jobs:
-                    # Keep merged-in jobs around themselves, but don't bother
-                    # exploring them, since we took their successors.
-                    reachable_from_root.add(merged_jobstore_id)
             unprocessed_job_descriptions = new_job_descriptions_to_process
 
         logger.debug(f"{len(reachable_from_root)} jobs reachable from root.")