toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/lib/io.py
CHANGED
@@ -16,7 +16,7 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt
 
     The permissions on the directory will be 711 instead of 700, allowing the
     group and all other users to traverse the directory. This is necessary if
-    the
+    the directory is on NFS and the Docker daemon would like to mount it or a
     file inside it into a container, because on NFS even the Docker daemon
     appears bound by the file permissions.
 
@@ -159,14 +159,26 @@ def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executa
         os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)
 
 
-def make_public_dir(in_directory: Optional[str] = None) -> str:
+def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
     """
+    Make a publicly-accessible directory in the given directory.
+
+    :param suggested_name: Use this directory name first if possible.
+
     Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
     Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
     This function's purpose is mostly to avoid having long file names when generating directories.
     If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
     our old default.
     """
+    if suggested_name is not None:
+        generated_dir_path: str = os.path.join(in_directory, suggested_name)
+        try:
+            os.mkdir(generated_dir_path)
+            os.chmod(generated_dir_path, 0o777)
+            return generated_dir_path
+        except FileExistsError:
+            pass
     for i in range(4, 32 + 1):  # make random uuids and truncate to lengths starting at 4 and working up to max 32
         for _ in range(10):  # make 10 attempts for each length
             truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i]
@@ -182,17 +194,43 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
     os.chmod(this_should_never_happen, 0o777)
     return this_should_never_happen
 
-def try_path(path: str) -> Optional[str]:
+def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
     """
     Try to use the given path. Return it if it exists or can be made,
     and we can make things within it, or None otherwise.
+
+    :param min_size: Reject paths on filesystems smaller than this many bytes.
     """
+
     try:
         os.makedirs(path, exist_ok=True)
     except OSError:
         # Maybe we lack permissions
         return None
-
+
+    if not os.path.exists(path):
+        # We didn't manage to make it
+        return None
+
+    if not os.access(path, os.W_OK):
+        # It doesn't look writable
+        return None
+
+    try:
+        stats = os.statvfs(path)
+    except OSError:
+        # Maybe we lack permissions
+        return None
+
+    # Is the filesystem big enough?
+    # We need to look at the FS size and not the free space so we don't change
+    # over to a different filesystem when this one fills up.
+    fs_size = stats.f_frsize * stats.f_blocks
+    if fs_size < min_size:
+        # Too small
+        return None
+
+    return path
 
 
 class WriteWatchingStream:
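
The reworked helpers compose: try_path() now rejects writable-but-too-small filesystems, and make_public_dir() can be asked for a stable name before falling back to random ones. A minimal sketch of a caller, assuming nothing beyond the signatures above; the /var/tmp/workdir path and the job-1 name are invented for illustration:

import tempfile

from toil.lib.io import make_public_dir, try_path

# try_path() returns the path only if it exists (or can be created), is
# writable, and sits on a filesystem of at least min_size bytes (100 MiB by
# default), so callers can fall back through candidate scratch locations.
scratch = try_path("/var/tmp/workdir") or try_path(tempfile.gettempdir())

if scratch is not None:
    # Ask for a stable name first; if it is already taken, make_public_dir()
    # falls back to progressively longer random names, chmodded to 0o777.
    job_dir = make_public_dir(scratch, suggested_name="job-1")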
toil/lib/misc.py
CHANGED
@@ -11,8 +11,6 @@ import typing
 from contextlib import closing
 from typing import Iterator, List, Optional
 
-import pytz
-
 logger = logging.getLogger(__name__)
 
 
@@ -56,7 +54,7 @@ def get_user_name() -> str:
 
 def utc_now() -> datetime.datetime:
     """Return a datetime in the UTC timezone corresponding to right now."""
-    return datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
+    return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
 
 def unix_now_ms() -> float:
     """Return the current time in milliseconds since the Unix epoch."""
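
This drops the pytz dependency: for attaching the fixed UTC offset to a naive utcnow() timestamp, the standard library's datetime.timezone.utc is a drop-in replacement for pytz.UTC. A quick check of the equivalence:

import datetime

# The stdlib replacement produces an aware datetime with a zero UTC offset,
# just like the old pytz-based version did.
now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
assert now.tzinfo is not None
assert now.utcoffset() == datetime.timedelta(0)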
toil/lib/resources.py
CHANGED
@@ -13,27 +13,68 @@
 # limitations under the License.
 import fnmatch
 import os
+import math
+import sys
 import resource
 from typing import List, Tuple
 
-
-def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
+class ResourceMonitor:
     """
-    Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
-    itself and its single largest child.
+    Global resource monitoring widget.
+
+    Presents class methods to get the resource usage of this process and child
+    processes, and other class methods to adjust the statistics so they can
+    account for e.g. resources used inside containers, or other resource usage
+    that *should* be billable to the current process.
     """
-    me = resource.getrusage(resource.RUSAGE_SELF)
-    children = resource.getrusage(resource.RUSAGE_CHILDREN)
-    total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime
-    total_memory_usage = me.ru_maxrss + children.ru_maxrss
-    return total_cpu_time, total_memory_usage
-
-
-def get_total_cpu_time() -> float:
-    """Gives the total cpu time, including the children."""
-    me = resource.getrusage(resource.RUSAGE_SELF)
-    childs = resource.getrusage(resource.RUSAGE_CHILDREN)
-    return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime
+
+    # Store some extra usage to tack onto the stats as module-level globals
+    _extra_cpu_seconds: float = 0
+    _extra_memory_ki: int = 0
+
+    @classmethod
+    def record_extra_memory(cls, peak_ki: int) -> None:
+        """
+        Become responsible for the given peak memory usage, in kibibytes.
+
+        The memory will be treated as if it was used by a child process at the time
+        our real child processes were also using their peak memory.
+        """
+        cls._extra_memory_ki = max(cls._extra_memory_ki, peak_ki)
+
+    @classmethod
+    def record_extra_cpu(cls, seconds: float) -> None:
+        """
+        Become responsible for the given CPU time.
+
+        The CPU time will be treated as if it had been used by a child process.
+        """
+        cls._extra_cpu_seconds += seconds
+
+    @classmethod
+    def get_total_cpu_time_and_memory_usage(cls) -> Tuple[float, int]:
+        """
+        Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
+        itself and its single largest child (in kibibytes).
+        """
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        children = resource.getrusage(resource.RUSAGE_CHILDREN)
+        total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime + cls._extra_cpu_seconds
+        total_memory_usage = me.ru_maxrss + children.ru_maxrss
+        if sys.platform == "darwin":
+            # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
+            # Mac it works in bytes. See
+            # <https://github.com/python/cpython/issues/74698>
+            total_memory_usage = int(math.ceil(total_memory_usage / 1024))
+        total_memory_usage += cls._extra_memory_ki
+        return total_cpu_time, total_memory_usage
+
+    @classmethod
+    def get_total_cpu_time(cls) -> float:
+        """Gives the total cpu time, including the children."""
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        childs = resource.getrusage(resource.RUSAGE_CHILDREN)
+        return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime + cls._extra_cpu_seconds
 
 
 def glob(glob_pattern: str, directoryname: str) -> List[str]:
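
The module's free functions become classmethods on ResourceMonitor so that externally measured usage can be billed to the current process: record_extra_cpu() accumulates seconds, while record_extra_memory() keeps only the largest reported peak. A short sketch against the API shown above; the numbers are invented for illustration:

from toil.lib.resources import ResourceMonitor

# Suppose a containerized task reported its own usage to us.
ResourceMonitor.record_extra_cpu(12.5)       # CPU seconds; added to the totals
ResourceMonitor.record_extra_memory(524288)  # peak RSS in KiB (512 MiB); the max is kept

# The combined figures now cover this process, its children, and the extras.
cpu_seconds, peak_memory_ki = ResourceMonitor.get_total_cpu_time_and_memory_usage()
print(f"CPU: {cpu_seconds:.1f}s, peak memory: {peak_memory_ki} KiB")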
toil/lib/retry.py
CHANGED
@@ -142,7 +142,7 @@ from typing import (Any,
                     Sequence,
                     Tuple,
                     Type,
-                    Union)
+                    Union, TypeVar)
 
 import requests.exceptions
 import urllib3.exceptions
@@ -224,13 +224,16 @@ class ErrorCondition:
         )
 
 
+# There is a better way to type hint this with python 3.10
+# https://stackoverflow.com/a/68290080
+RT = TypeVar("RT")
 def retry(
     intervals: Optional[List] = None,
     infinite_retries: bool = False,
     errors: Optional[Sequence[Union[ErrorCondition, Type[Exception]]]] = None,
     log_message: Optional[Tuple[Callable, str]] = None,
     prepare: Optional[List[Callable]] = None,
-) -> Callable[[Any], Any]:
+) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
     """
     Retry a function if it fails with any Exception defined in "errors".
 
@@ -281,9 +284,9 @@ def retry(
             if error_condition.retry_on_this_condition:
                 retriable_errors.add(error_condition.error)
 
-    def decorate(func):
+    def decorate(func: Callable[..., RT]) -> Callable[..., RT]:
         @functools.wraps(func)
-        def call(*args, **kwargs):
+        def call(*args, **kwargs) -> RT:
             intervals_remaining = copy.deepcopy(intervals)
             while True:
                 try:
@@ -488,13 +491,15 @@ def error_meets_conditions(e, error_conditions):
 DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64)
 DEFAULT_TIMEOUT = 300
 
+E = TypeVar("E", bound=Exception)  # so mypy understands passed through types
+
 # TODO: Replace the use of this with retry()
 # The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is
 # still used there to avoid the duplication of future work
 def old_retry(
     delays: Iterable[float] = DEFAULT_DELAYS,
     timeout: float = DEFAULT_TIMEOUT,
-    predicate: Callable[[Exception], bool] = lambda e: False,
+    predicate: Callable[[E], bool] = lambda e: False,
 ) -> Generator[ContextManager, None, None]:
     """
     Deprecated.
@@ -567,6 +572,8 @@ def old_retry(
     >>> i
     1
     """
+    if timeout is None:
+        timeout = DEFAULT_TIMEOUT
     if timeout > 0:
         go = [ None ]
 
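
With RT threaded through the decorator's annotations, mypy now sees a decorated function keep its own return type instead of degrading to Any. For example, using only the signature shown above (the intervals and manifest path are illustrative):

from toil.lib.retry import retry

# errors accepts plain exception types as well as ErrorCondition instances.
@retry(intervals=[1, 2, 4], errors=[OSError])
def read_manifest(path: str) -> str:
    with open(path) as f:
        return f.read()

text: str = read_manifest("manifest.txt")  # type-checks as str, not Any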
toil/lib/threading.py
CHANGED
@@ -28,7 +28,7 @@ import traceback
 from contextlib import contextmanager
 from typing import Dict, Iterator, Optional, Union, cast
 
-import psutil
+import psutil
 
 from toil.lib.exceptions import raise_
 from toil.lib.io import robust_rmtree
@@ -41,7 +41,7 @@ class ExceptionalThread(threading.Thread):
     A thread whose join() method re-raises exceptions raised during run(). While join() is
     idempotent, the exception is only during the first invocation of join() that successfully
     joined the thread. If join() times out, no exception will be re reraised even though an
-    exception might already have
+    exception might already have occurred in run().
 
     When subclassing this thread, override tryRun() instead of run().
 
@@ -109,9 +109,12 @@ def cpu_count() -> int:
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-
+    psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True))
+    if psutil_cpu_count is None:
+        logger.debug('Could not retrieve the logical CPU count.')
 
-
+    total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf')
+    logger.debug('Total machine size: %s core(s)', total_machine_size)
 
     # cgroups may limit the size
     cgroup_size: Union[float, int] = float('inf')
@@ -151,13 +154,13 @@ def cpu_count() -> int:
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-
-
-
-
+                cgroup_size = float('inf')
+            else:
+                # The thread count is how many multiples of a wall clock period we
+                # can burn in that period.
+                cgroup_size = int(math.ceil(float(quota)/float(period)))
 
-            logger.debug('Control group size in cores: %d', cgroup_size)
+            logger.debug('Control group size in cores: %s', cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
         logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
@@ -175,9 +178,16 @@ def cpu_count() -> int:
     else:
         logger.debug('CPU affinity not available')
 
-
-
-
+    limit: Union[float, int] = float('inf')
+    # Apply all the limits to take the smallest
+    limit = min(limit, total_machine_size)
+    limit = min(limit, cgroup_size)
+    limit = min(limit, affinity_size)
+    if limit < 1 or limit == float('inf'):
+        # Fall back to 1 if we can't get a size
+        limit = 1
+    result = int(limit)
+    logger.debug('cpu_count: %s', result)
     # Make sure to remember it for the next call
     setattr(cpu_count, 'result', result)
     return result
@@ -529,9 +539,14 @@ class LastProcessStandingArena:
             os.mkdir(self.lockfileDir)
         except FileExistsError:
             pass
+        except Exception as e:
+            raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
-        self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        try:
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        except Exception as e:
+            raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e
         # Nobody can see it yet, so lock it right away
         fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX)  # type: ignore
 
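
cpu_count() now computes the limit explicitly as the minimum of the machine size, the cgroup quota, and the CPU affinity mask, falling back to 1 when no source is usable. A standalone restatement of that clamping logic, separate from the cgroup parsing:

import math

def effective_cpu_count(total_machine_size: float, cgroup_size: float, affinity_size: float) -> int:
    # Take the smallest limit; inf marks a source that could not be read.
    limit = min(float('inf'), total_machine_size, cgroup_size, affinity_size)
    if limit < 1 or limit == float('inf'):
        # Fall back to 1 if we can't get a size.
        limit = 1
    return int(limit)

# A 64-core machine, a cgroup quota of 250ms per 100ms period (ceil -> 3
# cores), and affinity to 8 cores yields 3.
assert effective_cpu_count(64, math.ceil(250 / 100), 8) == 3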
toil/lib/throttle.py
CHANGED
@@ -43,7 +43,7 @@ class LocalThrottle:
         thread as necessary to ensure that no less than the configured minimum interval has
         passed since the last invocation of this method in the current thread returned True.
 
-        If the wait parameter is False, this method
+        If the wait parameter is False, this method immediatley returns True (if at least the
         configured minimum interval has passed since the last time this method returned True in
         the current thread) or False otherwise.
         """
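
The corrected docstring describes the non-blocking mode. A small usage sketch, assuming the constructor takes the minimum interval in seconds as min_interval (not shown in this hunk):

from toil.lib.throttle import LocalThrottle

throttle = LocalThrottle(min_interval=10)  # the interval is an arbitrary example

def maybe_report_progress(done: int, total: int) -> None:
    # With wait=False, throttle() returns True at most once per interval in
    # this thread and False the rest of the time, without blocking.
    if throttle.throttle(wait=False):
        print(f"{done}/{total} jobs complete")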
toil/options/common.py
CHANGED
@@ -2,13 +2,12 @@ import os
 from argparse import ArgumentParser, Action, _AppendAction
 from typing import Any, Optional, Union, Type, Callable, List, Dict, TYPE_CHECKING
 
-from distutils.util import strtobool
 from configargparse import SUPPRESS
 import logging
 
 from ruamel.yaml import YAML
 
-from toil.lib.conversions import bytes2human, human2bytes
+from toil.lib.conversions import bytes2human, human2bytes, strtobool, opt_strtobool
 
 from toil.batchSystems.options import add_all_batchsystem_options
 from toil.provisioners import parse_node_types
@@ -138,8 +137,14 @@ def make_open_interval_action(min: Union[int, float], max: Optional[Union[int, f
             func = fC(min, max)
             try:
                 if not func(values):
-
-
+                    if max is None:
+                        raise parser.error(
+                            f"{option_string} ({values}) must be at least {min}"
+                        )
+                    else:
+                        raise parser.error(
+                            f"{option_string} ({values}) must be at least {min} and strictly less than {max})"
+                        )
             except AssertionError:
                 raise RuntimeError(f"The {option_string} option has an invalid value: {values}")
             setattr(namespace, self.dest, values)
@@ -174,7 +179,7 @@ JOBSTORE_HELP = ("The location of the job store for the workflow. "
                  "store must be accessible by all worker nodes. Depending on the desired "
                  "job store implementation, the location should be formatted according to "
                  "one of the following schemes:\n\n"
-                 "file:<path> where <path> points to a directory on the file
+                 "file:<path> where <path> points to a directory on the file system\n\n"
                  "aws:<region>:<prefix> where <region> is the name of an AWS region like "
                  "us-west-2 and <prefix> will be prepended to the names of any top-level "
                  "AWS resources in use by job store, e.g. S3 buckets.\n\n "
@@ -201,14 +206,6 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     config.add_argument('--config', dest='config', is_config_file_arg=True, default=None, metavar="PATH",
                         help="Get options from a config file.")
 
-    def convert_bool(b: str) -> bool:
-        """Convert a string representation of bool to bool"""
-        return bool(strtobool(b))
-
-    def opt_strtobool(b: Optional[str]) -> Optional[bool]:
-        """Convert an optional string representation of bool to None or bool"""
-        return b if b is None else convert_bool(b)
-
     add_logging_options(parser)
     parser.register("type", "bool", parseBool)  # Custom type for arg=True/False.
 
@@ -354,7 +351,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                           "them from being modified externally. When set to False, as long as caching is enabled, "
                           "Toil will protect the file automatically by changing the permissions to read-only."
                           "default=%(default)s")
-    link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=
+    link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=strtobool, default=True,
                               metavar="BOOL", help=link_imports_help)
     move_exports = file_store_options.add_mutually_exclusive_group()
     move_exports_help = ('When using a filesystem based job store, output files are by default moved to the '
@@ -362,7 +359,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                          'location. Setting this option to True instead copies the files into the output directory. '
                          'Applies to filesystem-based job stores only.'
                          'default=%(default)s')
-    move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=
+    move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=strtobool, default=False, metavar="BOOL",
                               help=move_exports_help)
 
     caching = file_store_options.add_mutually_exclusive_group()
@@ -472,11 +469,11 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                                      "This is useful for heterogeneous jobs where some tasks require much more "
                                      "disk than others.")
 
-    autoscaling_options.add_argument("--metrics", dest="metrics", default=False, type=
+    autoscaling_options.add_argument("--metrics", dest="metrics", default=False, type=strtobool, metavar="BOOL",
                                      help="Enable the prometheus/grafana dashboard for monitoring CPU/RAM usage, "
                                           "queue size, and issued jobs.")
     autoscaling_options.add_argument("--assumeZeroOverhead", dest="assume_zero_overhead", default=False,
-                                     type=
+                                     type=strtobool, metavar="BOOL",
                                      help="Ignore scheduler and OS overhead and assume jobs can use every last byte "
                                           "of memory and disk on a node when autoscaling.")
 
@@ -547,7 +544,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                                   help=resource_help_msg.format('default', 'accelerators', accelerators_note, []))
     resource_options.add_argument('--defaultPreemptible', '--defaultPreemptable', dest='defaultPreemptible',
                                   metavar='BOOL',
-                                  type=
+                                  type=strtobool, nargs='?', const=True, default=False,
                                   help='Make all jobs able to run on preemptible (spot) nodes by default.')
     resource_options.add_argument('--maxCores', dest='maxCores', default=SYS_MAX_SIZE, metavar='INT', type=int,
                                   action=make_open_interval_action(1),
@@ -572,10 +569,10 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                              f"labeling job failed. default={1}")
     job_options.add_argument("--enableUnlimitedPreemptibleRetries", "--enableUnlimitedPreemptableRetries",
                              dest="enableUnlimitedPreemptibleRetries",
-                             type=
+                             type=strtobool, default=False, metavar="BOOL",
                              help="If set, preemptible failures (or any failure due to an instance getting "
                                   "unexpectedly terminated) will not count towards job failures and --retryCount.")
-    job_options.add_argument("--doubleMem", dest="doubleMem", type=
+    job_options.add_argument("--doubleMem", dest="doubleMem", type=strtobool, default=False, metavar="BOOL",
                              help="If set, batch jobs which die to reaching memory limit on batch schedulers "
                                   "will have their memory doubled and they will be retried. The remaining "
                                   "retry count will be reduced by 1. Currently supported by LSF.")
@@ -589,18 +586,23 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                              help=f"Period of time to wait (in seconds) between checking for missing/overlong jobs, "
                                   f"that is jobs which get lost by the batch system. Expert parameter. "
                                   f"default=%(default)s")
+    job_options.add_argument("--jobStoreTimeout", dest="job_store_timeout", default=30, type=float,
+                             action=make_open_interval_action(0), metavar="FLOAT",
+                             help=f"Maximum time (in seconds) to wait for a job's update to the job store "
+                                  f"before declaring it failed. default=%(default)s")
+
 
     # Log management options
     log_options = parser.add_argument_group(
         title="Toil log management options.",
         description="Options for how Toil should manage its logs."
     )
-    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=
+    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=100 * 1024 * 1024, type=h2b,
                              action=make_open_interval_action(1),
                              help=f"The maximum size of a job log file to keep (in bytes), log files larger than "
                                   f"this will be truncated to the last X bytes. Setting this option to zero will "
                                   f"prevent any truncation. Setting this option to a negative value will truncate "
-                                  f"from the beginning. Default={bytes2human(
+                                  f"from the beginning. Default={bytes2human(100 * 1024 * 1024)}")
     log_options.add_argument("--writeLogs", dest="writeLogs", nargs='?', action='store', default=None,
                              const=os.getcwd(), metavar="OPT_PATH",
                              help="Write worker logs received by the leader into their own files at the specified "
@@ -613,7 +615,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     log_options.add_argument("--writeLogsGzip", dest="writeLogsGzip", nargs='?', action='store', default=None,
                              const=os.getcwd(), metavar="OPT_PATH",
                              help="Identical to --writeLogs except the logs files are gzipped on the leader.")
-    log_options.add_argument("--writeLogsFromAllJobs", dest="writeLogsFromAllJobs", type=
+    log_options.add_argument("--writeLogsFromAllJobs", dest="writeLogsFromAllJobs", type=strtobool,
                              default=False, metavar="BOOL",
                              help="Whether to write logs from all jobs (including the successful ones) without "
                                   "necessarily setting the log level to 'debug'. Ensure that either --writeLogs "
@@ -621,7 +623,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     log_options.add_argument("--writeMessages", dest="write_messages", default=None,
                              type=lambda x: None if x is None else os.path.abspath(x), metavar="PATH",
                              help="File to send messages from the leader's message bus to.")
-    log_options.add_argument("--realTimeLogging", dest="realTimeLogging", type=
+    log_options.add_argument("--realTimeLogging", dest="realTimeLogging", type=strtobool, default=False,
                              help="Enable real-time logging from workers to leader")
 
     # Misc options
@@ -629,12 +631,12 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
         title="Toil miscellaneous options.",
         description="Everything else."
     )
-    misc_options.add_argument('--disableChaining', dest='disableChaining', type=
+    misc_options.add_argument('--disableChaining', dest='disableChaining', type=strtobool, default=False,
                               metavar="BOOL",
                               help="Disables chaining of jobs (chaining uses one job's resource allocation "
                                    "for its successor job if possible).")
     misc_options.add_argument("--disableJobStoreChecksumVerification", dest="disableJobStoreChecksumVerification",
-                              default=False, type=
+                              default=False, type=strtobool, metavar="BOOL",
                               help="Disables checksum verification for files transferred to/from the job store. "
                                    "Checksum verification is a safety check to ensure the data is not corrupted "
                                    "during transfer. Currently only supported for non-streaming AWS files.")
@@ -686,14 +688,13 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                              action=make_open_interval_action(0.0), metavar="FLOAT",
                              help=f"Interval of time service jobs wait between polling for the existence of the "
                                   f"keep-alive flag. Default: {60.0}")
-    misc_options.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', type=
+    misc_options.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', type=strtobool, default=False,
                               metavar="BOOL",
                               help='Disables sanity checking the existence of the docker image specified by '
                                    'TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for autoscaling.')
     misc_options.add_argument('--statusWait', dest='statusWait', type=int, default=3600, metavar="INT",
                               help="Seconds to wait between reports of running jobs.")
-    misc_options.add_argument('--disableProgress', dest='disableProgress',
-                              metavar="BOOL",
+    misc_options.add_argument('--disableProgress', dest='disableProgress', action="store_true", default=False,
                               help="Disables the progress bar shown when standard error is a terminal.")
 
     # Debug options
@@ -735,4 +736,4 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
 
     # dest is set to enableCaching to not conflict with the current --caching destination
     caching.add_argument('--disableCaching', dest='enableCaching', action='store_false', help=SUPPRESS)
-    caching.set_defaults(
+    caching.set_defaults(enableCaching=None)
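
All of the BOOL-valued flags above now share the strtobool helper that moved to toil.lib.conversions, replacing both the removed distutils import and the local convert_bool wrapper. A sketch of the resulting behavior, assuming strtobool mirrors the classic distutils semantics (accepting spellings like '1'/'0', 'yes'/'no', 'True'/'False'):

from argparse import ArgumentParser

from toil.lib.conversions import strtobool

parser = ArgumentParser()
parser.add_argument("--realTimeLogging", type=strtobool, default=False, metavar="BOOL")

# The flag takes an explicit truthy or falsy value on the command line.
assert parser.parse_args(["--realTimeLogging", "True"]).realTimeLogging
assert not parser.parse_args(["--realTimeLogging", "no"]).realTimeLogging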
toil/options/wdl.py
CHANGED
@@ -13,6 +13,8 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     suppress_help = SUPPRESS if suppress else None
     # include arg names without a wdl specifier if suppress is False
     # this is to avoid possible duplicate options in custom toil scripts, ex outputFile can be a common argument name
+    # TODO: Why do we even need them at all in other Toil scripts? Do we have to worry about dest= collisions?
+    # TODO: Can the better option name be first?
     output_dialect_arguments = ["--wdlOutputDialect"] + (["--outputDialect"] if not suppress else [])
     parser.add_argument(*output_dialect_arguments, dest="output_dialect", type=str, default='cromwell',
                         choices=['cromwell', 'miniwdl'],
@@ -30,3 +32,6 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     reference_inputs_arguments = ["--wdlReferenceInputs"] + (["--referenceInputs"] if not suppress else [])
     parser.add_argument(*reference_inputs_arguments, dest="reference_inputs", type=bool, default=False,
                         help=suppress_help or "Pass input files by URL")
+    container_arguments = ["--wdlContainer"] + (["--container"] if not suppress else [])
+    parser.add_argument(*container_arguments, dest="container", type=str, choices=["singularity", "docker", "auto"], default="auto",
+                        help=suppress_help or "Container engine to use to run WDL tasks")
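
The new --wdlContainer option (also exposed as --container when suppress is False) selects the engine used to run WDL tasks. For example, using the add_wdl_options signature shown above:

from argparse import ArgumentParser

from toil.options.wdl import add_wdl_options

parser = ArgumentParser()
add_wdl_options(parser, suppress=False)  # registers both option spellings

args = parser.parse_args(["--container", "docker"])
assert args.container == "docker"  # must be one of singularity, docker, or auto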
toil/provisioners/__init__.py
CHANGED
@@ -32,6 +32,7 @@ def cluster_factory(
     nodeStorage: int = 50,
     nodeStorageOverrides: Optional[List[str]] = None,
     sseKey: Optional[str] = None,
+    enable_fuse: bool = False
 ) -> Union["AWSProvisioner", "GCEProvisioner"]:
     """
     Find and instantiate the appropriate provisioner instance to make clusters in the given cloud.
@@ -51,14 +52,14 @@ def cluster_factory(
         except ImportError:
             logger.error('The aws extra must be installed to use this provisioner')
             raise
-        return AWSProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey)
+        return AWSProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse)
     elif provisioner == 'gce':
         try:
             from toil.provisioners.gceProvisioner import GCEProvisioner
         except ImportError:
             logger.error('The google extra must be installed to use this provisioner')
             raise
-        return GCEProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey)
+        return GCEProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse)
     else:
         raise RuntimeError("Invalid provisioner '%s'" % provisioner)
 
@@ -174,9 +175,14 @@ def check_valid_node_types(provisioner, node_types: List[Tuple[Set[str], Optiona
 
 class NoSuchClusterException(Exception):
     """Indicates that the specified cluster does not exist."""
-    def __init__(self, cluster_name):
+    def __init__(self, cluster_name: str) -> None:
         super().__init__(f"The cluster '{cluster_name}' could not be found")
 
+class NoSuchZoneException(Exception):
+    """Indicates that a valid zone could not be found."""
+    def __init__(self) -> None:
+        super().__init__(f"No valid zone could be found!")
+
 
 class ClusterTypeNotSupportedException(Exception):
     """Indicates that a provisioner does not support a given cluster type."""
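
cluster_factory() simply forwards the new enable_fuse flag to whichever provisioner it instantiates. A sketch of a caller; the cluster name and zone are placeholders, and the keyword names are assumed from the signature fragments and calls shown above:

from toil.provisioners import cluster_factory

provisioner = cluster_factory(
    provisioner='aws',
    clusterName='my-cluster',
    zone='us-west-2a',
    enable_fuse=True,  # forwarded to AWSProvisioner/GCEProvisioner
)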
toil/provisioners/abstractProvisioner.py
CHANGED

@@ -137,6 +137,7 @@ class AbstractProvisioner(ABC):
         zone: Optional[str] = None,
         nodeStorage: int = 50,
         nodeStorageOverrides: Optional[List[str]] = None,
+        enable_fuse: bool = False
     ) -> None:
         """
         Initialize provisioner.
@@ -162,11 +163,14 @@ class AbstractProvisioner(ABC):
         for override in nodeStorageOverrides or []:
             nodeShape, storageOverride = override.split(':')
             self._nodeStorageOverrides[nodeShape] = int(storageOverride)
-        self._leaderPrivateIP = None
+        self._leaderPrivateIP: Optional[str] = None
         # This will hold an SSH public key for Mesos clusters, or the
         # Kubernetes joining information as a dict for Kubernetes clusters.
         self._leaderWorkerAuthentication = None
 
+        # Whether or not to use FUSE on the cluster. If true, the cluster's Toil containers will be launched in privileged mode
+        self.enable_fuse = enable_fuse
+
         if clusterName:
             # Making a new cluster
             self.createClusterSettings()
@@ -812,6 +816,12 @@ class AbstractProvisioner(ABC):
             -v /opt:/opt \\
             -v /etc/kubernetes:/etc/kubernetes \\
             -v /etc/kubernetes/admin.conf:/root/.kube/config \\
+            {"-e TOIL_KUBERNETES_PRIVILEGED=True --privileged" if self.enable_fuse else
+             "--security-opt seccomp=unconfined --security-opt systempaths=unconfined"} \\
+            -e TOIL_KUBERNETES_HOST_PATH=/var/lib/toil \\
+            # Pass in a path to use for singularity image caching into the container
+            -e SINGULARITY_CACHEDIR=/var/lib/toil/singularity \\
+            -e MINIWDL__SINGULARITY__IMAGE_CACHE=/var/lib/toil/miniwdl \\
             --name=toil_{role} \\
             {applianceSelf()} \\
             {entryPointArgs}
@@ -1228,7 +1238,7 @@ class AbstractProvisioner(ABC):
         WantedBy=multi-user.target
         ''').format(**values))
 
-    def _getIgnitionUserData(self, role, keyPath=None, preemptible=False, architecture='amd64'):
+    def _getIgnitionUserData(self, role: str, keyPath: Optional[str] = None, preemptible: bool = False, architecture: str = 'amd64') -> str:
         """
         Return the text (not bytes) user data to pass to a provisioned node.
 
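
The docker invocation template now branches on self.enable_fuse: FUSE requires a privileged appliance container (and sets TOIL_KUBERNETES_PRIVILEGED so Toil requests privileged Kubernetes pods), while the default run only relaxes seccomp and system paths. Restated as a standalone helper for clarity; this function is not part of toil, just a rephrasing of the branch embedded in the template above:

def appliance_security_flags(enable_fuse: bool) -> str:
    # Mirrors the conditional inside the docker run template.
    if enable_fuse:
        return "-e TOIL_KUBERNETES_PRIVILEGED=True --privileged"
    return "--security-opt seccomp=unconfined --security-opt systempaths=unconfined"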