toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/lib/io.py CHANGED
@@ -16,7 +16,7 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt
 
     The permissions on the directory will be 711 instead of 700, allowing the
     group and all other users to traverse the directory. This is necessary if
-    the direcotry is on NFS and the Docker daemon would like to mount it or a
+    the directory is on NFS and the Docker daemon would like to mount it or a
     file inside it into a container, because on NFS even the Docker daemon
     appears bound by the file permissions.
 
@@ -159,14 +159,26 @@ def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executa
         os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)
 
 
-def make_public_dir(in_directory: Optional[str] = None) -> str:
+def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
     """
+    Make a publicly-accessible directory in the given directory.
+
+    :param suggested_name: Use this directory name first if possible.
+
     Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
     Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
     This function's purpose is mostly to avoid having long file names when generating directories.
     If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
     our old default.
     """
+    if suggested_name is not None:
+        generated_dir_path: str = os.path.join(in_directory, suggested_name)
+        try:
+            os.mkdir(generated_dir_path)
+            os.chmod(generated_dir_path, 0o777)
+            return generated_dir_path
+        except FileExistsError:
+            pass
     for i in range(4, 32 + 1):  # make random uuids and truncate to lengths starting at 4 and working up to max 32
         for _ in range(10):  # make 10 attempts for each length
             truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i]
@@ -182,17 +194,43 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
     os.chmod(this_should_never_happen, 0o777)
     return this_should_never_happen
 
-def try_path(path: str) -> Optional[str]:
+def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
     """
     Try to use the given path. Return it if it exists or can be made,
     and we can make things within it, or None otherwise.
+
+    :param min_size: Reject paths on filesystems smaller than this many bytes.
     """
+
     try:
         os.makedirs(path, exist_ok=True)
     except OSError:
         # Maybe we lack permissions
         return None
-    return path if os.path.exists(path) and os.access(path, os.W_OK) else None
+
+    if not os.path.exists(path):
+        # We didn't manage to make it
+        return None
+
+    if not os.access(path, os.W_OK):
+        # It doesn't look writable
+        return None
+
+    try:
+        stats = os.statvfs(path)
+    except OSError:
+        # Maybe we lack permissions
+        return None
+
+    # Is the filesystem big enough?
+    # We need to look at the FS size and not the free space so we don't change
+    # over to a different filesystem when this one fills up.
+    fs_size = stats.f_frsize * stats.f_blocks
+    if fs_size < min_size:
+        # Too small
+        return None
+
+    return path
 
 
 class WriteWatchingStream:
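
Example (illustrative, not part of the diff): a minimal sketch of how the updated helpers might be called after this change. The scratch path and directory name below are made up; min_size is in bytes per the new signature.

    from toil.lib.io import make_public_dir, try_path

    # Reject scratch locations on filesystems smaller than 1 GiB.
    scratch = try_path("/var/lib/toil", min_size=1024 ** 3)

    if scratch is not None:
        # Prefer a stable, human-readable name; make_public_dir falls back to
        # its short random names only if the suggested one already exists.
        job_dir = make_public_dir(scratch, suggested_name="job-0001")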
toil/lib/misc.py CHANGED
@@ -11,8 +11,6 @@ import typing
 from contextlib import closing
 from typing import Iterator, List, Optional
 
-import pytz
-
 logger = logging.getLogger(__name__)
 
 
@@ -56,7 +54,7 @@ def get_user_name() -> str:
 
 def utc_now() -> datetime.datetime:
     """Return a datetime in the UTC timezone corresponding to right now."""
-    return datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
+    return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
 
 def unix_now_ms() -> float:
     """Return the current time in milliseconds since the Unix epoch."""
toil/lib/resources.py CHANGED
@@ -13,27 +13,68 @@
 # limitations under the License.
 import fnmatch
 import os
+import math
+import sys
 import resource
 from typing import List, Tuple
 
-
-def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
+class ResourceMonitor:
     """
-    Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
-    itself and its single largest child.
+    Global resource monitoring widget.
+
+    Presents class methods to get the resource usage of this process and child
+    processes, and other class methods to adjust the statistics so they can
+    account for e.g. resources used inside containers, or other resource usage
+    that *should* be billable to the current process.
     """
-    me = resource.getrusage(resource.RUSAGE_SELF)
-    children = resource.getrusage(resource.RUSAGE_CHILDREN)
-    total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime
-    total_memory_usage = me.ru_maxrss + children.ru_maxrss
-    return total_cpu_time, total_memory_usage
-
-
-def get_total_cpu_time() -> float:
-    """Gives the total cpu time, including the children."""
-    me = resource.getrusage(resource.RUSAGE_SELF)
-    childs = resource.getrusage(resource.RUSAGE_CHILDREN)
-    return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime
+
+    # Store some extra usage to tack onto the stats as module-level globals
+    _extra_cpu_seconds: float = 0
+    _extra_memory_ki: int = 0
+
+    @classmethod
+    def record_extra_memory(cls, peak_ki: int) -> None:
+        """
+        Become responsible for the given peak memory usage, in kibibytes.
+
+        The memory will be treated as if it was used by a child process at the time
+        our real child processes were also using their peak memory.
+        """
+        cls._extra_memory_ki = max(cls._extra_memory_ki, peak_ki)
+
+    @classmethod
+    def record_extra_cpu(cls, seconds: float) -> None:
+        """
+        Become responsible for the given CPU time.
+
+        The CPU time will be treated as if it had been used by a child process.
+        """
+        cls._extra_cpu_seconds += seconds
+
+    @classmethod
+    def get_total_cpu_time_and_memory_usage(cls) -> Tuple[float, int]:
+        """
+        Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
+        itself and its single largest child (in kibibytes).
+        """
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        children = resource.getrusage(resource.RUSAGE_CHILDREN)
+        total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime + cls._extra_cpu_seconds
+        total_memory_usage = me.ru_maxrss + children.ru_maxrss
+        if sys.platform == "darwin":
+            # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
+            # Mac it works in bytes. See
+            # <https://github.com/python/cpython/issues/74698>
+            total_memory_usage = int(math.ceil(total_memory_usage / 1024))
+        total_memory_usage += cls._extra_memory_ki
+        return total_cpu_time, total_memory_usage
+
+    @classmethod
+    def get_total_cpu_time(cls) -> float:
+        """Gives the total cpu time, including the children."""
+        me = resource.getrusage(resource.RUSAGE_SELF)
+        childs = resource.getrusage(resource.RUSAGE_CHILDREN)
+        return me.ru_utime + me.ru_stime + childs.ru_utime + childs.ru_stime + cls._extra_cpu_seconds
 
 
 def glob(glob_pattern: str, directoryname: str) -> List[str]:
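
A minimal usage sketch of the new ResourceMonitor class (the numbers are illustrative): usage measured outside the process tree, for example inside a container, can be folded into the totals.

    from toil.lib.resources import ResourceMonitor

    # Bill 12.5 CPU-seconds and a 2 GiB peak (given in KiB) from a container
    # to the current process.
    ResourceMonitor.record_extra_cpu(12.5)
    ResourceMonitor.record_extra_memory(2 * 1024 * 1024)

    # Totals now include the recorded extras; memory is reported in kibibytes
    # on both Linux and Mac thanks to the new darwin unit fix-up.
    cpu_seconds, peak_kib = ResourceMonitor.get_total_cpu_time_and_memory_usage()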
toil/lib/retry.py CHANGED
@@ -142,7 +142,7 @@ from typing import (Any,
                     Sequence,
                     Tuple,
                     Type,
-                    Union)
+                    Union, TypeVar)
 
 import requests.exceptions
 import urllib3.exceptions
@@ -224,13 +224,16 @@ class ErrorCondition:
         )
 
 
+# There is a better way to type hint this with python 3.10
+# https://stackoverflow.com/a/68290080
+RT = TypeVar("RT")
 def retry(
     intervals: Optional[List] = None,
     infinite_retries: bool = False,
     errors: Optional[Sequence[Union[ErrorCondition, Type[Exception]]]] = None,
     log_message: Optional[Tuple[Callable, str]] = None,
     prepare: Optional[List[Callable]] = None,
-) -> Callable[[Any], Any]:
+) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
     """
     Retry a function if it fails with any Exception defined in "errors".
 
@@ -281,9 +284,9 @@ def retry(
             if error_condition.retry_on_this_condition:
                 retriable_errors.add(error_condition.error)
 
-    def decorate(func):
+    def decorate(func: Callable[..., RT]) -> Callable[..., RT]:
         @functools.wraps(func)
-        def call(*args, **kwargs):
+        def call(*args, **kwargs) -> RT:
             intervals_remaining = copy.deepcopy(intervals)
             while True:
                 try:
@@ -488,13 +491,15 @@ def error_meets_conditions(e, error_conditions):
 DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64)
 DEFAULT_TIMEOUT = 300
 
+E = TypeVar("E", bound=Exception)  # so mypy understands passed through types
+
 # TODO: Replace the use of this with retry()
 # The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is
 # still used there to avoid the duplication of future work
 def old_retry(
     delays: Iterable[float] = DEFAULT_DELAYS,
     timeout: float = DEFAULT_TIMEOUT,
-    predicate: Callable[[Exception], bool] = lambda e: False,
+    predicate: Callable[[E], bool] = lambda e: False,
 ) -> Generator[ContextManager, None, None]:
     """
     Deprecated.
@@ -567,6 +572,8 @@ def old_retry(
     >>> i
     1
     """
+    if timeout is None:
+        timeout = DEFAULT_TIMEOUT
     if timeout > 0:
         go = [ None ]
 
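
With the TypeVar-based return annotation, a type checker can now see through the decorator. A small sketch (the decorated function is hypothetical):

    from toil.lib.retry import retry

    # Hypothetical flaky call, retried on ConnectionError with 1s, 2s, 4s waits.
    @retry(intervals=[1, 2, 4], errors=[ConnectionError])
    def fetch_status() -> int:
        return 200

    # The old Callable[[Any], Any] annotation erased the wrapped function's
    # type; with Callable[..., RT], mypy knows this still returns int.
    code: int = fetch_status()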
toil/lib/threading.py CHANGED
@@ -28,7 +28,7 @@ import traceback
 from contextlib import contextmanager
 from typing import Dict, Iterator, Optional, Union, cast
 
-import psutil  # type: ignore
+import psutil
 
 from toil.lib.exceptions import raise_
 from toil.lib.io import robust_rmtree
@@ -41,7 +41,7 @@ class ExceptionalThread(threading.Thread):
     A thread whose join() method re-raises exceptions raised during run(). While join() is
     idempotent, the exception is only during the first invocation of join() that successfully
     joined the thread. If join() times out, no exception will be re reraised even though an
-    exception might already have occured in run().
+    exception might already have occurred in run().
 
     When subclassing this thread, override tryRun() instead of run().
 
@@ -109,9 +109,12 @@ def cpu_count() -> int:
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-    total_machine_size = cast(int, psutil.cpu_count(logical=True))
+    psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True))
+    if psutil_cpu_count is None:
+        logger.debug('Could not retrieve the logical CPU count.')
 
-    logger.debug('Total machine size: %d cores', total_machine_size)
+    total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf')
+    logger.debug('Total machine size: %s core(s)', total_machine_size)
 
     # cgroups may limit the size
     cgroup_size: Union[float, int] = float('inf')
@@ -151,13 +154,13 @@ def cpu_count() -> int:
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-                return total_machine_size
-
-            # The thread count is how many multiples of a wall clock period we
-            # can burn in that period.
-            cgroup_size = int(math.ceil(float(quota)/float(period)))
+                cgroup_size = float('inf')
+            else:
+                # The thread count is how many multiples of a wall clock period we
+                # can burn in that period.
+                cgroup_size = int(math.ceil(float(quota)/float(period)))
 
-        logger.debug('Control group size in cores: %d', cgroup_size)
+        logger.debug('Control group size in cores: %s', cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
         logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
@@ -175,9 +178,16 @@ def cpu_count() -> int:
     else:
         logger.debug('CPU affinity not available')
 
-    # Return the smaller of the actual thread count and the cgroup's limit, minimum 1.
-    result = cast(int, max(1, min(min(affinity_size, cgroup_size), total_machine_size)))
-    logger.debug('cpu_count: %s', str(result))
+    limit: Union[float, int] = float('inf')
+    # Apply all the limits to take the smallest
+    limit = min(limit, total_machine_size)
+    limit = min(limit, cgroup_size)
+    limit = min(limit, affinity_size)
+    if limit < 1 or limit == float('inf'):
+        # Fall back to 1 if we can't get a size
+        limit = 1
+    result = int(limit)
+    logger.debug('cpu_count: %s', result)
     # Make sure to remember it for the next call
     setattr(cpu_count, 'result', result)
     return result
@@ -529,9 +539,14 @@ class LastProcessStandingArena:
             os.mkdir(self.lockfileDir)
         except FileExistsError:
             pass
+        except Exception as e:
+            raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
-        self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        try:
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        except Exception as e:
+            raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e
         # Nobody can see it yet, so lock it right away
         fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX)  # type: ignore
 
552
 
toil/lib/throttle.py CHANGED
@@ -43,7 +43,7 @@ class LocalThrottle:
        thread as necessary to ensure that no less than the configured minimum interval has
        passed since the last invocation of this method in the current thread returned True.
 
-       If the wait parameter is False, this method immediatly returns True (if at least the
+       If the wait parameter is False, this method immediately returns True (if at least the
        configured minimum interval has passed since the last time this method returned True in
        the current thread) or False otherwise.
        """
toil/options/common.py CHANGED
@@ -2,13 +2,12 @@ import os
 from argparse import ArgumentParser, Action, _AppendAction
 from typing import Any, Optional, Union, Type, Callable, List, Dict, TYPE_CHECKING
 
-from distutils.util import strtobool
 from configargparse import SUPPRESS
 import logging
 
 from ruamel.yaml import YAML
 
-from toil.lib.conversions import bytes2human, human2bytes
+from toil.lib.conversions import bytes2human, human2bytes, strtobool, opt_strtobool
 
 from toil.batchSystems.options import add_all_batchsystem_options
 from toil.provisioners import parse_node_types
@@ -138,8 +137,14 @@ def make_open_interval_action(min: Union[int, float], max: Optional[Union[int, f
             func = fC(min, max)
             try:
                 if not func(values):
-                    raise parser.error(
-                        f"{option_string} ({values}) must be within the range: [{min}, {'infinity' if max is None else max})")
+                    if max is None:
+                        raise parser.error(
+                            f"{option_string} ({values}) must be at least {min}"
+                        )
+                    else:
+                        raise parser.error(
+                            f"{option_string} ({values}) must be at least {min} and strictly less than {max})"
+                        )
             except AssertionError:
                 raise RuntimeError(f"The {option_string} option has an invalid value: {values}")
             setattr(namespace, self.dest, values)
@@ -174,7 +179,7 @@ JOBSTORE_HELP = ("The location of the job store for the workflow. "
                  "store must be accessible by all worker nodes. Depending on the desired "
                  "job store implementation, the location should be formatted according to "
                  "one of the following schemes:\n\n"
-                 "file:<path> where <path> points to a directory on the file systen\n\n"
+                 "file:<path> where <path> points to a directory on the file system\n\n"
                  "aws:<region>:<prefix> where <region> is the name of an AWS region like "
                  "us-west-2 and <prefix> will be prepended to the names of any top-level "
                  "AWS resources in use by job store, e.g. S3 buckets.\n\n "
@@ -201,14 +206,6 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     config.add_argument('--config', dest='config', is_config_file_arg=True, default=None, metavar="PATH",
                         help="Get options from a config file.")
 
-    def convert_bool(b: str) -> bool:
-        """Convert a string representation of bool to bool"""
-        return bool(strtobool(b))
-
-    def opt_strtobool(b: Optional[str]) -> Optional[bool]:
-        """Convert an optional string representation of bool to None or bool"""
-        return b if b is None else convert_bool(b)
-
     add_logging_options(parser)
     parser.register("type", "bool", parseBool)  # Custom type for arg=True/False.
 
@@ -354,7 +351,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                          "them from being modified externally. When set to False, as long as caching is enabled, "
                          "Toil will protect the file automatically by changing the permissions to read-only."
                          "default=%(default)s")
-    link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=convert_bool, default=True,
+    link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=strtobool, default=True,
                               metavar="BOOL", help=link_imports_help)
     move_exports = file_store_options.add_mutually_exclusive_group()
     move_exports_help = ('When using a filesystem based job store, output files are by default moved to the '
@@ -362,7 +359,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                          'location. Setting this option to True instead copies the files into the output directory. '
                          'Applies to filesystem-based job stores only.'
                          'default=%(default)s')
-    move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=convert_bool, default=False, metavar="BOOL",
+    move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=strtobool, default=False, metavar="BOOL",
                               help=move_exports_help)
 
     caching = file_store_options.add_mutually_exclusive_group()
@@ -472,11 +469,11 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                                      "This is useful for heterogeneous jobs where some tasks require much more "
                                      "disk than others.")
 
-    autoscaling_options.add_argument("--metrics", dest="metrics", default=False, type=convert_bool, metavar="BOOL",
+    autoscaling_options.add_argument("--metrics", dest="metrics", default=False, type=strtobool, metavar="BOOL",
                                      help="Enable the prometheus/grafana dashboard for monitoring CPU/RAM usage, "
                                           "queue size, and issued jobs.")
     autoscaling_options.add_argument("--assumeZeroOverhead", dest="assume_zero_overhead", default=False,
-                                     type=convert_bool, metavar="BOOL",
+                                     type=strtobool, metavar="BOOL",
                                      help="Ignore scheduler and OS overhead and assume jobs can use every last byte "
                                           "of memory and disk on a node when autoscaling.")
 
@@ -547,7 +544,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                                   help=resource_help_msg.format('default', 'accelerators', accelerators_note, []))
     resource_options.add_argument('--defaultPreemptible', '--defaultPreemptable', dest='defaultPreemptible',
                                   metavar='BOOL',
-                                  type=convert_bool, nargs='?', const=True, default=False,
+                                  type=strtobool, nargs='?', const=True, default=False,
                                   help='Make all jobs able to run on preemptible (spot) nodes by default.')
     resource_options.add_argument('--maxCores', dest='maxCores', default=SYS_MAX_SIZE, metavar='INT', type=int,
                                   action=make_open_interval_action(1),
@@ -572,10 +569,10 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                                   f"labeling job failed. default={1}")
     job_options.add_argument("--enableUnlimitedPreemptibleRetries", "--enableUnlimitedPreemptableRetries",
                              dest="enableUnlimitedPreemptibleRetries",
-                             type=convert_bool, default=False, metavar="BOOL",
+                             type=strtobool, default=False, metavar="BOOL",
                              help="If set, preemptible failures (or any failure due to an instance getting "
                                   "unexpectedly terminated) will not count towards job failures and --retryCount.")
-    job_options.add_argument("--doubleMem", dest="doubleMem", type=convert_bool, default=False, metavar="BOOL",
+    job_options.add_argument("--doubleMem", dest="doubleMem", type=strtobool, default=False, metavar="BOOL",
                              help="If set, batch jobs which die to reaching memory limit on batch schedulers "
                                   "will have their memory doubled and they will be retried. The remaining "
                                   "retry count will be reduced by 1. Currently supported by LSF.")
@@ -589,18 +586,23 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                             help=f"Period of time to wait (in seconds) between checking for missing/overlong jobs, "
                                  f"that is jobs which get lost by the batch system. Expert parameter. "
                                  f"default=%(default)s")
+    job_options.add_argument("--jobStoreTimeout", dest="job_store_timeout", default=30, type=float,
+                             action=make_open_interval_action(0), metavar="FLOAT",
+                             help=f"Maximum time (in seconds) to wait for a job's update to the job store "
+                                  f"before declaring it failed. default=%(default)s")
+
 
     # Log management options
     log_options = parser.add_argument_group(
         title="Toil log management options.",
         description="Options for how Toil should manage its logs."
     )
-    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=64000, type=h2b,
+    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=100 * 1024 * 1024, type=h2b,
                              action=make_open_interval_action(1),
                              help=f"The maximum size of a job log file to keep (in bytes), log files larger than "
                                   f"this will be truncated to the last X bytes. Setting this option to zero will "
                                   f"prevent any truncation. Setting this option to a negative value will truncate "
-                                  f"from the beginning. Default={bytes2human(64000)}")
+                                  f"from the beginning. Default={bytes2human(100 * 1024 * 1024)}")
     log_options.add_argument("--writeLogs", dest="writeLogs", nargs='?', action='store', default=None,
                              const=os.getcwd(), metavar="OPT_PATH",
                              help="Write worker logs received by the leader into their own files at the specified "
@@ -613,7 +615,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     log_options.add_argument("--writeLogsGzip", dest="writeLogsGzip", nargs='?', action='store', default=None,
                              const=os.getcwd(), metavar="OPT_PATH",
                              help="Identical to --writeLogs except the logs files are gzipped on the leader.")
-    log_options.add_argument("--writeLogsFromAllJobs", dest="writeLogsFromAllJobs", type=convert_bool,
+    log_options.add_argument("--writeLogsFromAllJobs", dest="writeLogsFromAllJobs", type=strtobool,
                              default=False, metavar="BOOL",
                              help="Whether to write logs from all jobs (including the successful ones) without "
                                   "necessarily setting the log level to 'debug'. Ensure that either --writeLogs "
@@ -621,7 +623,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
     log_options.add_argument("--writeMessages", dest="write_messages", default=None,
                              type=lambda x: None if x is None else os.path.abspath(x), metavar="PATH",
                              help="File to send messages from the leader's message bus to.")
-    log_options.add_argument("--realTimeLogging", dest="realTimeLogging", type=convert_bool, default=False,
+    log_options.add_argument("--realTimeLogging", dest="realTimeLogging", type=strtobool, default=False,
                              help="Enable real-time logging from workers to leader")
 
     # Misc options
@@ -629,12 +631,12 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
         title="Toil miscellaneous options.",
         description="Everything else."
     )
-    misc_options.add_argument('--disableChaining', dest='disableChaining', type=convert_bool, default=False,
+    misc_options.add_argument('--disableChaining', dest='disableChaining', type=strtobool, default=False,
                               metavar="BOOL",
                               help="Disables chaining of jobs (chaining uses one job's resource allocation "
                                    "for its successor job if possible).")
     misc_options.add_argument("--disableJobStoreChecksumVerification", dest="disableJobStoreChecksumVerification",
-                              default=False, type=convert_bool, metavar="BOOL",
+                              default=False, type=strtobool, metavar="BOOL",
                               help="Disables checksum verification for files transferred to/from the job store. "
                                    "Checksum verification is a safety check to ensure the data is not corrupted "
                                    "during transfer. Currently only supported for non-streaming AWS files.")
@@ -686,14 +688,13 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                              action=make_open_interval_action(0.0), metavar="FLOAT",
                              help=f"Interval of time service jobs wait between polling for the existence of the "
                                   f"keep-alive flag. Default: {60.0}")
-    misc_options.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', type=convert_bool, default=False,
+    misc_options.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', type=strtobool, default=False,
                               metavar="BOOL",
                               help='Disables sanity checking the existence of the docker image specified by '
                                    'TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for autoscaling.')
     misc_options.add_argument('--statusWait', dest='statusWait', type=int, default=3600, metavar="INT",
                               help="Seconds to wait between reports of running jobs.")
-    misc_options.add_argument('--disableProgress', dest='disableProgress', type=convert_bool, default=False,
-                              metavar="BOOL",
+    misc_options.add_argument('--disableProgress', dest='disableProgress', action="store_true", default=False,
                               help="Disables the progress bar shown when standard error is a terminal.")
 
     # Debug options
@@ -735,4 +736,4 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
 
     # dest is set to enableCaching to not conflict with the current --caching destination
     caching.add_argument('--disableCaching', dest='enableCaching', action='store_false', help=SUPPRESS)
-    caching.set_defaults(disableCaching=None)
+    caching.set_defaults(enableCaching=None)
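
The boolean helpers now live in toil.lib.conversions. A sketch of the expected semantics, assuming the move kept the behavior of the removed local convert_bool/opt_strtobool:

    from toil.lib.conversions import strtobool, opt_strtobool

    # strtobool accepts the usual distutils spellings but returns a real bool.
    assert strtobool("yes") is True
    assert strtobool("0") is False

    # opt_strtobool passes None through unchanged.
    assert opt_strtobool(None) is None
    assert opt_strtobool("true") is True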
toil/options/wdl.py CHANGED
@@ -13,6 +13,8 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     suppress_help = SUPPRESS if suppress else None
     # include arg names without a wdl specifier if suppress is False
     # this is to avoid possible duplicate options in custom toil scripts, ex outputFile can be a common argument name
+    # TODO: Why do we even need them at all in other Toil scripts? Do we have to worry about dest= collisions?
+    # TODO: Can the better option name be first?
     output_dialect_arguments = ["--wdlOutputDialect"] + (["--outputDialect"] if not suppress else [])
     parser.add_argument(*output_dialect_arguments, dest="output_dialect", type=str, default='cromwell',
                         choices=['cromwell', 'miniwdl'],
@@ -30,3 +32,6 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     reference_inputs_arguments = ["--wdlReferenceInputs"] + (["--referenceInputs"] if not suppress else [])
     parser.add_argument(*reference_inputs_arguments, dest="reference_inputs", type=bool, default=False,
                         help=suppress_help or "Pass input files by URL")
+    container_arguments = ["--wdlContainer"] + (["--container"] if not suppress else [])
+    parser.add_argument(*container_arguments, dest="container", type=str, choices=["singularity", "docker", "auto"], default="auto",
+                        help=suppress_help or "Container engine to use to run WDL tasks")
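
A sketch of the new container option as wired above; with suppress=False the short --container alias is registered alongside --wdlContainer:

    from argparse import ArgumentParser
    from toil.options.wdl import add_wdl_options

    parser = ArgumentParser()
    add_wdl_options(parser, suppress=False)

    # Choices are singularity, docker, or auto (the default).
    opts = parser.parse_args(["--container", "singularity"])
    print(opts.container)  # singularity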
toil/provisioners/__init__.py CHANGED
@@ -32,6 +32,7 @@ def cluster_factory(
     nodeStorage: int = 50,
     nodeStorageOverrides: Optional[List[str]] = None,
     sseKey: Optional[str] = None,
+    enable_fuse: bool = False
 ) -> Union["AWSProvisioner", "GCEProvisioner"]:
     """
     Find and instantiate the appropriate provisioner instance to make clusters in the given cloud.
@@ -51,14 +52,14 @@ def cluster_factory(
         except ImportError:
             logger.error('The aws extra must be installed to use this provisioner')
             raise
-        return AWSProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey)
+        return AWSProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse)
     elif provisioner == 'gce':
         try:
             from toil.provisioners.gceProvisioner import GCEProvisioner
         except ImportError:
             logger.error('The google extra must be installed to use this provisioner')
             raise
-        return GCEProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey)
+        return GCEProvisioner(clusterName, clusterType, zone, nodeStorage, nodeStorageOverrides, sseKey, enable_fuse)
     else:
         raise RuntimeError("Invalid provisioner '%s'" % provisioner)
 
@@ -174,9 +175,14 @@ def check_valid_node_types(provisioner, node_types: List[Tuple[Set[str], Optiona
 
 class NoSuchClusterException(Exception):
     """Indicates that the specified cluster does not exist."""
-    def __init__(self, cluster_name):
+    def __init__(self, cluster_name: str) -> None:
         super().__init__(f"The cluster '{cluster_name}' could not be found")
 
+class NoSuchZoneException(Exception):
+    """Indicates that a valid zone could not be found."""
+    def __init__(self) -> None:
+        super().__init__(f"No valid zone could be found!")
+
 
 class ClusterTypeNotSupportedException(Exception):
     """Indicates that a provisioner does not support a given cluster type."""
toil/provisioners/abstractProvisioner.py CHANGED
@@ -137,6 +137,7 @@ class AbstractProvisioner(ABC):
         zone: Optional[str] = None,
         nodeStorage: int = 50,
         nodeStorageOverrides: Optional[List[str]] = None,
+        enable_fuse: bool = False
     ) -> None:
         """
         Initialize provisioner.
@@ -162,11 +163,14 @@ class AbstractProvisioner(ABC):
         for override in nodeStorageOverrides or []:
             nodeShape, storageOverride = override.split(':')
             self._nodeStorageOverrides[nodeShape] = int(storageOverride)
-        self._leaderPrivateIP = None
+        self._leaderPrivateIP: Optional[str] = None
         # This will hold an SSH public key for Mesos clusters, or the
         # Kubernetes joining information as a dict for Kubernetes clusters.
         self._leaderWorkerAuthentication = None
 
+        # Whether or not to use FUSE on the cluster. If true, the cluster's Toil containers will be launched in privileged mode
+        self.enable_fuse = enable_fuse
+
         if clusterName:
             # Making a new cluster
             self.createClusterSettings()
@@ -812,6 +816,12 @@ class AbstractProvisioner(ABC):
             -v /opt:/opt \\
             -v /etc/kubernetes:/etc/kubernetes \\
             -v /etc/kubernetes/admin.conf:/root/.kube/config \\
+            {"-e TOIL_KUBERNETES_PRIVILEGED=True --privileged" if self.enable_fuse else
+             "--security-opt seccomp=unconfined --security-opt systempaths=unconfined"} \\
+            -e TOIL_KUBERNETES_HOST_PATH=/var/lib/toil \\
+            # Pass in a path to use for singularity image caching into the container
+            -e SINGULARITY_CACHEDIR=/var/lib/toil/singularity \\
+            -e MINIWDL__SINGULARITY__IMAGE_CACHE=/var/lib/toil/miniwdl \\
             --name=toil_{role} \\
             {applianceSelf()} \\
             {entryPointArgs}
@@ -1228,7 +1238,7 @@ class AbstractProvisioner(ABC):
             WantedBy=multi-user.target
             ''').format(**values))
 
-    def _getIgnitionUserData(self, role, keyPath=None, preemptible=False, architecture='amd64'):
+    def _getIgnitionUserData(self, role: str, keyPath: Optional[str] = None, preemptible: bool = False, architecture: str = 'amd64') -> str:
         """
         Return the text (not bytes) user data to pass to a provisioned node.