toil 6.0.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +19 -4
- toil/batchSystems/abstractGridEngineBatchSystem.py +22 -22
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/slurm.py +85 -14
- toil/bus.py +38 -0
- toil/common.py +20 -18
- toil/cwl/cwltoil.py +81 -63
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +53 -4
- toil/fileStores/cachingFileStore.py +4 -20
- toil/fileStores/nonCachingFileStore.py +5 -14
- toil/job.py +46 -30
- toil/jobStores/abstractJobStore.py +21 -23
- toil/jobStores/aws/utils.py +5 -4
- toil/jobStores/fileJobStore.py +1 -1
- toil/leader.py +17 -14
- toil/lib/conversions.py +19 -0
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +28 -2
- toil/lib/resources.py +8 -1
- toil/lib/threading.py +27 -12
- toil/options/common.py +5 -7
- toil/options/wdl.py +5 -0
- toil/provisioners/abstractProvisioner.py +8 -0
- toil/statsAndLogging.py +36 -8
- toil/test/batchSystems/test_slurm.py +21 -6
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +243 -151
- toil/test/docs/scriptsTest.py +2 -2
- toil/test/jobStores/jobStoreTest.py +7 -5
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/clusterTest.py +9 -8
- toil/test/utils/toilDebugTest.py +1 -1
- toil/test/utils/utilsTest.py +3 -3
- toil/test/wdl/wdltoil_test.py +91 -16
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilStats.py +309 -266
- toil/utils/toilStatus.py +1 -1
- toil/version.py +9 -9
- toil/wdl/wdltoil.py +341 -189
- toil/worker.py +31 -16
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/METADATA +6 -7
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/RECORD +51 -47
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/WHEEL +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -0
- {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/lib/io.py
CHANGED
@@ -182,17 +182,43 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
     os.chmod(this_should_never_happen, 0o777)
     return this_should_never_happen
 
-def try_path(path: str) -> Optional[str]:
+def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
     """
     Try to use the given path. Return it if it exists or can be made,
     and we can make things within it, or None otherwise.
+
+    :param min_size: Reject paths on filesystems smaller than this many bytes.
     """
+
     try:
         os.makedirs(path, exist_ok=True)
     except OSError:
         # Maybe we lack permissions
         return None
-
+
+    if not os.path.exists(path):
+        # We didn't manage to make it
+        return None
+
+    if not os.access(path, os.W_OK):
+        # It doesn't look writable
+        return None
+
+    try:
+        stats = os.statvfs(path)
+    except OSError:
+        # Maybe we lack permissions
+        return None
+
+    # Is the filesystem big enough?
+    # We need to look at the FS size and not the free space so we don't change
+    # over to a different filesystem when this one fills up.
+    fs_size = stats.f_frsize * stats.f_blocks
+    if fs_size < min_size:
+        # Too small
+        return None
+
+    return path
 
 
 class WriteWatchingStream:
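The expanded `try_path` rejects candidate directories on filesystems smaller than `min_size` (100 MiB by default), judging by total capacity rather than free space so a path is not abandoned just because the filesystem is currently full. A minimal sketch of the same `statvfs` arithmetic (the helper name `fs_big_enough` is hypothetical, not Toil's API):

```python
import os

def fs_big_enough(path: str, min_size: int = 100 * 1024 * 1024) -> bool:
    # statvfs reports capacity as f_blocks fragments of f_frsize bytes each,
    # so the product is the total size of the filesystem in bytes.
    stats = os.statvfs(path)
    return stats.f_frsize * stats.f_blocks >= min_size

# A tiny tmpfs mount (e.g. a 1 MiB /run/lock) would be rejected here,
# while a normal /tmp passes.
print(fs_big_enough("/tmp"))
```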
toil/lib/resources.py
CHANGED
@@ -13,6 +13,8 @@
 # limitations under the License.
 import fnmatch
 import os
+import math
+import sys
 import resource
 from typing import List, Tuple
 
@@ -20,12 +22,17 @@ from typing import List, Tuple
 def get_total_cpu_time_and_memory_usage() -> Tuple[float, int]:
     """
     Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
-    itself and its single largest child.
+    itself and its single largest child (in kibibytes).
     """
     me = resource.getrusage(resource.RUSAGE_SELF)
     children = resource.getrusage(resource.RUSAGE_CHILDREN)
     total_cpu_time = me.ru_utime + me.ru_stime + children.ru_utime + children.ru_stime
     total_memory_usage = me.ru_maxrss + children.ru_maxrss
+    if sys.platform == "darwin":
+        # On Linux, getrusage works in "kilobytes" (really kibibytes), but on
+        # Mac it works in bytes. See
+        # <https://github.com/python/cpython/issues/74698>
+        total_memory_usage = int(math.ceil(total_memory_usage / 1024))
     return total_cpu_time, total_memory_usage
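The docstring fix and the new `darwin` branch address a real portability trap: `getrusage(...).ru_maxrss` is in kibibytes on Linux but in bytes on macOS (see the linked CPython issue), so summing raw values across platforms mixes units. A self-contained sketch of the same normalization (the helper name `max_rss_kib` is hypothetical):

```python
import math
import resource
import sys

def max_rss_kib() -> int:
    # ru_maxrss is kibibytes on Linux but bytes on macOS, so divide by
    # 1024 on Darwin to report a single consistent unit.
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == "darwin":
        rss = int(math.ceil(rss / 1024))
    return rss

print(f"peak RSS: {max_rss_kib()} KiB")
```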
toil/lib/threading.py
CHANGED
@@ -109,9 +109,12 @@ def cpu_count() -> int:
         return cast(int, cached)
 
     # Get the fallback answer of all the CPUs on the machine
-
+    psutil_cpu_count = cast(Optional[int], psutil.cpu_count(logical=True))
+    if psutil_cpu_count is None:
+        logger.debug('Could not retrieve the logical CPU count.')
 
-
+    total_machine_size: Union[float, int] = psutil_cpu_count if psutil_cpu_count is not None else float('inf')
+    logger.debug('Total machine size: %s core(s)', total_machine_size)
 
     # cgroups may limit the size
     cgroup_size: Union[float, int] = float('inf')
@@ -151,13 +154,13 @@ def cpu_count() -> int:
             if quota == -1:
                 # But the quota can be -1 for unset.
                 # Assume we can use the whole machine.
-
-
-
-
-
+                cgroup_size = float('inf')
+            else:
+                # The thread count is how many multiples of a wall clock period we
+                # can burn in that period.
+                cgroup_size = int(math.ceil(float(quota)/float(period)))
 
-        logger.debug('Control group size in cores: %
+        logger.debug('Control group size in cores: %s', cgroup_size)
     except:
         # We can't actually read these cgroup fields. Maybe we are a mac or something.
         logger.debug('Could not inspect cgroup: %s', traceback.format_exc())
@@ -175,9 +178,16 @@ def cpu_count() -> int:
     else:
         logger.debug('CPU affinity not available')
 
-
-
-
+    limit: Union[float, int] = float('inf')
+    # Apply all the limits to take the smallest
+    limit = min(limit, total_machine_size)
+    limit = min(limit, cgroup_size)
+    limit = min(limit, affinity_size)
+    if limit < 1 or limit == float('inf'):
+        # Fall back to 1 if we can't get a size
+        limit = 1
+    result = int(limit)
+    logger.debug('cpu_count: %s', result)
     # Make sure to remember it for the next call
     setattr(cpu_count, 'result', result)
     return result
@@ -529,9 +539,14 @@ class LastProcessStandingArena:
             os.mkdir(self.lockfileDir)
         except FileExistsError:
             pass
+        except Exception as e:
+            raise RuntimeError("Could not make lock file directory " + self.lockfileDir) from e
 
         # Make ourselves a file in it and lock it to prove we are alive.
-
+        try:
+            self.lockfileFD, self.lockfileName = tempfile.mkstemp(dir=self.lockfileDir)  # type: ignore
+        except Exception as e:
+            raise RuntimeError("Could not make lock file in " + self.lockfileDir) from e
         # Nobody can see it yet, so lock it right away
         fcntl.lockf(self.lockfileFD, fcntl.LOCK_EX)  # type: ignore
 
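The reworked `cpu_count` treats each source of information as an independent ceiling — the machine's logical cores, the cgroup CPU quota, and the process's CPU affinity mask — takes the minimum, and falls back to 1 when every source is unavailable. A condensed sketch of that selection logic under assumed inputs (the function and its parameters are illustrative, not Toil's API):

```python
import math

def effective_cpu_count(machine_cores=None, quota=-1, period=100000, affinity=None):
    # Each unknown limit defaults to "no limit" (infinity).
    machine_size = machine_cores if machine_cores is not None else float('inf')
    # A cgroup quota of -1 means unset; otherwise the usable core count is
    # how many scheduler periods' worth of CPU we may burn per period.
    cgroup_size = float('inf') if quota == -1 else int(math.ceil(quota / period))
    affinity_size = len(affinity) if affinity else float('inf')

    limit = min(machine_size, cgroup_size, affinity_size)
    if limit < 1 or limit == float('inf'):
        # Fall back to 1 if we can't get a usable size from anywhere.
        return 1
    return int(limit)

print(effective_cpu_count(machine_cores=16, quota=250000))  # ceil(2.5) -> 3
```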
toil/options/common.py
CHANGED
@@ -2,13 +2,12 @@ import os
 from argparse import ArgumentParser, Action, _AppendAction
 from typing import Any, Optional, Union, Type, Callable, List, Dict, TYPE_CHECKING
 
-from distutils.util import strtobool
 from configargparse import SUPPRESS
 import logging
 
 from ruamel.yaml import YAML
 
-from toil.lib.conversions import bytes2human, human2bytes
+from toil.lib.conversions import bytes2human, human2bytes, strtobool
 
 from toil.batchSystems.options import add_all_batchsystem_options
 from toil.provisioners import parse_node_types
@@ -595,12 +594,12 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
         title="Toil log management options.",
         description="Options for how Toil should manage its logs."
     )
-    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=
+    log_options.add_argument("--maxLogFileSize", dest="maxLogFileSize", default=100 * 1024 * 1024, type=h2b,
                              action=make_open_interval_action(1),
                              help=f"The maximum size of a job log file to keep (in bytes), log files larger than "
                                   f"this will be truncated to the last X bytes. Setting this option to zero will "
                                   f"prevent any truncation. Setting this option to a negative value will truncate "
-                                  f"from the beginning. Default={bytes2human(
+                                  f"from the beginning. Default={bytes2human(100 * 1024 * 1024)}")
     log_options.add_argument("--writeLogs", dest="writeLogs", nargs='?', action='store', default=None,
                              const=os.getcwd(), metavar="OPT_PATH",
                              help="Write worker logs received by the leader into their own files at the specified "
@@ -692,8 +691,7 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
                               'TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for autoscaling.')
     misc_options.add_argument('--statusWait', dest='statusWait', type=int, default=3600, metavar="INT",
                               help="Seconds to wait between reports of running jobs.")
-    misc_options.add_argument('--disableProgress', dest='disableProgress',
-                              metavar="BOOL",
+    misc_options.add_argument('--disableProgress', dest='disableProgress', action="store_true", default=False,
                               help="Disables the progress bar shown when standard error is a terminal.")
 
     # Debug options
@@ -735,4 +733,4 @@ def add_base_toil_options(parser: ArgumentParser, jobstore_as_flag: bool = False
 
     # dest is set to enableCaching to not conflict with the current --caching destination
     caching.add_argument('--disableCaching', dest='enableCaching', action='store_false', help=SUPPRESS)
-    caching.set_defaults(
+    caching.set_defaults(enableCaching=None)
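`distutils` was deprecated by PEP 632 and removed in Python 3.12, so `strtobool` now comes from `toil.lib.conversions` (consistent with the +19-line change to that file in the list above). A plausible drop-in with the same truth vocabulary as the retired `distutils.util.strtobool` — Toil's actual implementation may differ in detail:

```python
def strtobool(val: str) -> bool:
    # Accept the same spellings as distutils.util.strtobool did,
    # but return a real bool instead of 0/1.
    if val.lower() in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if val.lower() in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError(f"invalid truth value {val!r}")
```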
toil/options/wdl.py
CHANGED
@@ -13,6 +13,8 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     suppress_help = SUPPRESS if suppress else None
     # include arg names without a wdl specifier if suppress is False
     # this is to avoid possible duplicate options in custom toil scripts, ex outputFile can be a common argument name
+    # TODO: Why do we even need them at all in other Toil scripts? Do we have to worry about dest= collisions?
+    # TODO: Can the better option name be first?
     output_dialect_arguments = ["--wdlOutputDialect"] + (["--outputDialect"] if not suppress else [])
     parser.add_argument(*output_dialect_arguments, dest="output_dialect", type=str, default='cromwell',
                         choices=['cromwell', 'miniwdl'],
@@ -30,3 +32,6 @@ def add_wdl_options(parser: ArgumentParser, suppress: bool = True) -> None:
     reference_inputs_arguments = ["--wdlReferenceInputs"] + (["--referenceInputs"] if not suppress else [])
     parser.add_argument(*reference_inputs_arguments, dest="reference_inputs", type=bool, default=False,
                         help=suppress_help or "Pass input files by URL")
+    container_arguments = ["--wdlContainer"] + (["--container"] if not suppress else [])
+    parser.add_argument(*container_arguments, dest="container", type=str, choices=["singularity", "docker", "auto"], default="auto",
+                        help=suppress_help or "Container engine to use to run WDL tasks")
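The new `--wdlContainer` option (aliased to `--container` when help is not suppressed) selects the engine used to run WDL task containers, with `auto` deferring the choice to the runtime. A minimal argparse sketch of the same flag, outside of Toil's parser plumbing:

```python
from argparse import ArgumentParser

parser = ArgumentParser()
# Canonical name plus alias, constrained to the three supported engines.
parser.add_argument("--wdlContainer", "--container", dest="container",
                    type=str, choices=["singularity", "docker", "auto"],
                    default="auto",
                    help="Container engine to use to run WDL tasks")

args = parser.parse_args(["--container", "docker"])
print(args.container)  # -> docker
```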
toil/provisioners/abstractProvisioner.py
CHANGED
@@ -812,6 +812,14 @@
     -v /opt:/opt \\
     -v /etc/kubernetes:/etc/kubernetes \\
     -v /etc/kubernetes/admin.conf:/root/.kube/config \\
+    # Pass in a path to use for singularity image caching into the container
+    -e TOIL_KUBERNETES_HOST_PATH=/var/lib/toil \\
+    -e SINGULARITY_CACHEDIR=/var/lib/toil/singularity \\
+    -e MINIWDL__SINGULARITY__IMAGE_CACHE=/var/lib/toil/miniwdl \\
+    # These rules are necessary in order to get user namespaces working
+    # https://github.com/apptainer/singularity/issues/5806
+    --security-opt seccomp=unconfined \\
+    --security-opt systempaths=unconfined \\
     --name=toil_{role} \\
     {applianceSelf()} \\
     {entryPointArgs}
toil/statsAndLogging.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gzip
+import io
 import json
 import logging
 import os
@@ -49,7 +50,7 @@ class StatsAndLogging:
         self._worker.start()
 
     @classmethod
-    def formatLogStream(cls, stream: Union[IO[str], IO[bytes]],
+    def formatLogStream(cls, stream: Union[IO[str], IO[bytes]], stream_name: str) -> str:
         """
         Given a stream of text or bytes, and the job name, job itself, or some
         other optional stringifyable identity info for the job, return a big
@@ -62,7 +63,7 @@ class StatsAndLogging:
 
         :param stream: The stream of text or bytes to print for the user.
         """
-        lines = [f'
+        lines = [f'{stream_name} follows:', '=========>']
 
         for line in stream:
             if isinstance(line, bytes):
@@ -75,13 +76,13 @@ class StatsAndLogging:
 
 
     @classmethod
-    def logWithFormatting(cls,
+    def logWithFormatting(cls, stream_name: str, jobLogs: Union[IO[str], IO[bytes]], method: Callable[[str], None] = logger.debug,
                           message: Optional[str] = None) -> None:
         if message is not None:
             method(message)
 
-        # Format and log the logs, identifying the
-        method(cls.formatLogStream(jobLogs,
+        # Format and log the logs, identifying the stream with the given name.
+        method(cls.formatLogStream(jobLogs, stream_name))
 
     @classmethod
     def writeLogFiles(cls, jobNames: List[str], jobLogList: List[str], config: 'Config', failed: bool = False) -> None:
@@ -95,7 +96,7 @@ class StatsAndLogging:
             logName = ('failed_' if failed else '') + logName
             counter = 0
             while True:
-                suffix = str(counter).zfill(3) + logExtension
+                suffix = '_' + str(counter).zfill(3) + logExtension
                 fullName = os.path.join(logPath, logName + suffix)
                 # The maximum file name size in the default HFS+ file system is 255 UTF-16 encoding units, so basically 255 characters
                 if len(fullName) >= 255:
@@ -118,6 +119,9 @@ class StatsAndLogging:
             # we don't have anywhere to write the logs, return now
             return
 
+        # Make sure the destination exists
+        os.makedirs(path, exist_ok=True)
+
         fullName = createName(path, mainFileName, extension, failed)
         with writeFn(fullName, 'wb') as f:
             for l in jobLogList:
@@ -150,8 +154,10 @@ class StatsAndLogging:
         stats = json.loads(statsStr, object_hook=Expando)
         if not stats:
             return
+
         try:
-
+            # Handle all the log_to_leader messages
+            logs = stats.workers.logs_to_leader
         except AttributeError:
             # To be expected if there were no calls to log_to_leader()
             pass
@@ -160,6 +166,28 @@ class StatsAndLogging:
                 logger.log(int(message.level),
                            'Got message from job at time %s: %s',
                            time.strftime('%m-%d-%Y %H:%M:%S'), message.text)
+
+        try:
+            # Handle all the user-level text streams reported back (command output, etc.)
+            user_logs = stats.workers.logging_user_streams
+        except AttributeError:
+            # To be expected if there were no calls to log_user_stream()
+            pass
+        else:
+            for stream_entry in user_logs:
+                try:
+                    # Unpack the stream name and text.
+                    name, text = stream_entry.name, stream_entry.text
+                except AttributeError:
+                    # Doesn't have a user-provided stream name and stream
+                    # text, so skip it.
+                    continue
+                # Since this is sent as inline text we need to pretend to stream it.
+                # TODO: Save these as individual files if they start to get too big?
+                cls.logWithFormatting(name, io.StringIO(text), logger.info)
+                # Save it as a log file, as if it were a Toil-level job.
+                cls.writeLogFiles([name], [text], config=config)
+
         try:
             logs = stats.logs
         except AttributeError:
@@ -168,7 +196,7 @@ class StatsAndLogging:
         # we may have multiple jobs per worker
         jobNames = logs.names
         messages = logs.messages
-        cls.logWithFormatting(jobNames[0], messages,
+        cls.logWithFormatting(f'Log from job "{jobNames[0]}"', messages,
                               message='Received Toil worker log. Disable debug level logging to hide this output')
         cls.writeLogFiles(jobNames, messages, config=config)
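`formatLogStream` now takes an explicit `stream_name` and opens the dump with a `<name> follows:` banner, and the new `logging_user_streams` loop replays user streams through the same path, wrapping the inline text in `io.StringIO` so it can be treated as a stream. A rough illustration of the banner logic (the per-line indentation and closing delimiter are not shown in the hunk, so they are assumptions here):

```python
import io

def format_log_stream(stream, stream_name):
    # Banner text as in the diff; body-line handling is an assumption.
    lines = [f'{stream_name} follows:', '=========>']
    for line in stream:
        if isinstance(line, bytes):
            line = line.decode('utf-8', errors='replace')
        lines.append('\t' + line.rstrip('\n'))
    return '\n'.join(lines)

print(format_log_stream(io.StringIO("step 1 done\nstep 2 done\n"),
                        'Log from job "hello"'))
```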
toil/test/batchSystems/test_slurm.py
CHANGED
@@ -4,6 +4,7 @@ from queue import Queue
 import pytest
 
 import toil.batchSystems.slurm
+from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
 from toil.common import Config
 from toil.lib.misc import CalledProcessErrorStderr
 from toil.test import ToilTest
@@ -284,7 +285,7 @@ class SlurmTest(ToilTest):
     def test_getJobExitCode_job_exists(self):
         self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct)
         job_id = '785023'  # FAILED
-        expected_result = 127
+        expected_result = (127, BatchJobExitReason.FAILED)
         result = self.worker.getJobExitCode(job_id)
         assert result == expected_result, f"{result} != {expected_result}"
 
@@ -303,7 +304,7 @@ class SlurmTest(ToilTest):
         self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises)
         self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol)
         job_id = '787204'  # COMPLETED
-        expected_result = 0
+        expected_result = (0, BatchJobExitReason.FINISHED)
         result = self.worker.getJobExitCode(job_id)
         assert result == expected_result, f"{result} != {expected_result}"
 
@@ -329,7 +330,7 @@ class SlurmTest(ToilTest):
     def test_coalesce_job_exit_codes_one_exists(self):
         self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_sacct)
         job_ids = ['785023']  # FAILED
-        expected_result = [127]
+        expected_result = [(127, BatchJobExitReason.FAILED)]
         result = self.worker.coalesce_job_exit_codes(job_ids)
         assert result == expected_result, f"{result} != {expected_result}"
 
@@ -347,7 +348,14 @@ class SlurmTest(ToilTest):
                    '789724',  # RUNNING,
                    '789868',  # PENDING,
                    '789869']  # COMPLETED
-
+        # RUNNING and PENDING jobs should return None
+        expected_result = [
+            (EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.KILLED),
+            (1, BatchJobExitReason.FAILED),
+            None,
+            None,
+            (0, BatchJobExitReason.FINISHED)
+        ]
         result = self.worker.coalesce_job_exit_codes(job_ids)
         assert result == expected_result, f"{result} != {expected_result}"
 
@@ -358,7 +366,14 @@ class SlurmTest(ToilTest):
                    '789724',  # RUNNING,
                    '999999',  # Non-existent,
                    '789869']  # COMPLETED
-
+        # RUNNING job should return None
+        expected_result = [
+            (130, BatchJobExitReason.FAILED),
+            (2, BatchJobExitReason.FAILED),
+            None,
+            None,
+            (0, BatchJobExitReason.FINISHED)
+        ]
         result = self.worker.coalesce_job_exit_codes(job_ids)
         assert result == expected_result, f"{result} != {expected_result}"
 
@@ -370,7 +385,7 @@ class SlurmTest(ToilTest):
         self.monkeypatch.setattr(self.worker, "_getJobDetailsFromSacct", call_sacct_raises)
         self.monkeypatch.setattr(toil.batchSystems.slurm, "call_command", call_scontrol)
         job_ids = ['787204']  # COMPLETED
-        expected_result = [0]
+        expected_result = [(0, BatchJobExitReason.FINISHED)]
         result = self.worker.coalesce_job_exit_codes(job_ids)
         assert result == expected_result, f"{result} != {expected_result}"
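These updated expectations track the behavior change in `toil/batchSystems/slurm.py` (the +85/-14 entry above): `getJobExitCode` and `coalesce_job_exit_codes` now report `(exit_code, BatchJobExitReason)` tuples, or `None` while a job is still PENDING/RUNNING, instead of bare integers. A hedged sketch of how a caller might consume the new shape (`describe_result` is hypothetical):

```python
from typing import Optional, Tuple
from toil.batchSystems.abstractBatchSystem import BatchJobExitReason

def describe_result(result: Optional[Tuple[int, BatchJobExitReason]]) -> str:
    # None means the batch system has no verdict yet.
    if result is None:
        return "still pending or running"
    exit_code, reason = result
    if reason == BatchJobExitReason.FINISHED and exit_code == 0:
        return "succeeded"
    return f"ended with exit code {exit_code} ({reason})"

print(describe_result((127, BatchJobExitReason.FAILED)))
```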
toil/test/cactus/__init__.py
File without changes
toil/test/cactus/test_cactus_integration.py
ADDED
@@ -0,0 +1,58 @@
+import os
+import uuid
+
+from toil.provisioners import cluster_factory
+from toil.test.provisioners.clusterTest import AbstractClusterTest
+
+
+class CactusIntegrationTest(AbstractClusterTest):
+    """
+    Run the Cactus Integration test on a Kubernetes AWS cluster
+    """
+
+    def __init__(self, methodName):
+        super().__init__(methodName=methodName)
+        self.clusterName = "cactus-test-" + str(uuid.uuid4())
+        self.leaderNodeType = "t2.medium"
+        self.clusterType = "kubernetes"
+
+    def setUp(self):
+        super().setUp()
+        self.jobStore = f"aws:{self.awsRegion()}:cluster-{uuid.uuid4()}"
+
+    def test_cactus_integration(self):
+        # Make a cluster with worker nodes
+        self.createClusterUtil(args=["--nodeTypes=t2.xlarge", "-w=1-3"])
+        # get the leader so we know the IP address - we don't need to wait since create cluster
+        # already ensures the leader is running
+        self.cluster = cluster_factory(
+            provisioner="aws", zone=self.zone, clusterName=self.clusterName
+        )
+        self.leader = self.cluster.getLeader()
+
+        CACTUS_COMMIT_SHA = os.environ["CACTUS_COMMIT_SHA"] or "f5adf4013326322ae58ef1eccb8409b71d761583"  # default cactus commit
+
+        # command to install and run cactus on the cluster
+        cactus_command = ("python -m virtualenv --system-site-packages venv && "
+                          ". venv/bin/activate && "
+                          "git clone https://github.com/ComparativeGenomicsToolkit/cactus.git --recursive && "
+                          "cd cactus && "
+                          "git fetch origin && "
+                          f"git checkout {CACTUS_COMMIT_SHA} && "
+                          "git submodule update --init --recursive && "
+                          "pip install --upgrade 'setuptools<66' pip && "
+                          "pip install --upgrade . && "
+                          "pip install --upgrade numpy psutil && "
+                          "time cactus --batchSystem kubernetes --retryCount=3 "
+                          f"--consCores 2 --binariesMode singularity --clean always {self.jobStore} "
+                          "examples/evolverMammals.txt examples/evolverMammals.hal --root mr --defaultDisk 8G --logDebug")
+
+        # run cactus
+        self.sshUtil(
+            [
+                "bash",
+                "-c",
+                cactus_command
+            ]
+        )
+