toil 8.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +4 -4
- toil/batchSystems/options.py +1 -0
- toil/batchSystems/slurm.py +227 -83
- toil/common.py +161 -45
- toil/cwl/cwltoil.py +31 -10
- toil/job.py +47 -38
- toil/jobStores/aws/jobStore.py +46 -10
- toil/lib/aws/session.py +14 -3
- toil/lib/aws/utils.py +92 -35
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2nodes.py +3 -2
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/io.py +22 -1
- toil/lib/misc.py +18 -0
- toil/lib/retry.py +10 -10
- toil/lib/{integration.py → trs.py} +95 -46
- toil/lib/web.py +38 -0
- toil/options/common.py +17 -2
- toil/options/cwl.py +10 -0
- toil/provisioners/gceProvisioner.py +4 -4
- toil/server/cli/wes_cwl_runner.py +3 -3
- toil/server/utils.py +2 -3
- toil/statsAndLogging.py +35 -1
- toil/test/batchSystems/test_slurm.py +172 -2
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +105 -2
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_trs.py +161 -0
- toil/test/wdl/wdltoil_test.py +1 -1
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +23 -9
- toil/worker.py +113 -33
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/METADATA +9 -4
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/RECORD +40 -34
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil/test/lib/test_integration.py +0 -104
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/__init__.py
CHANGED
@@ -19,7 +19,7 @@ import sys
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional
 
-import
+from toil.lib.web import web_session
 
 from docker.errors import ImageNotFound
 from toil.lib.memoize import memoize
@@ -425,7 +425,7 @@ def requestCheckRegularDocker(
     ioURL = "https://{webhost}/v2/{pathName}/manifests/{tag}" "".format(
         webhost=registryName, pathName=imageName, tag=tag
    )
-    response =
+    response = web_session.head(ioURL)
     if not response.ok:
         raise ApplianceImageNotFound(origAppliance, ioURL, response.status_code)
     else:
@@ -459,10 +459,10 @@ def requestCheckDockerIo(origAppliance: str, imageName: str, tag: str) -> bool:
     )
     requests_url = f"https://registry-1.docker.io/v2/{imageName}/manifests/{tag}"
 
-    token =
+    token = web_session.get(token_url)
     jsonToken = token.json()
     bearer = jsonToken["token"]
-    response =
+    response = web_session.head(
         requests_url, headers={"Authorization": f"Bearer {bearer}"}
     )
     if not response.ok:
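The shared `web_session` imported above comes from the new `toil/lib/web.py` module (+38 lines, listed but not shown in this diff). A minimal sketch of what a module-level shared session could look like, assuming a plain `requests.Session` with a retry-mounting adapter; the retry policy and the `_make_session` helper name are illustrative, not Toil's actual code:

```python
# Hypothetical sketch of a module like toil/lib/web.py: one shared
# requests.Session so callers reuse connections and a common retry policy.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def _make_session() -> requests.Session:
    session = requests.Session()
    # Retry transient failures (illustrative policy, not Toil's settings).
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# Callers do `from toil.lib.web import web_session` and then use
# web_session.get(...) / web_session.head(...) as in the hunks above.
web_session = _make_session()
```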
toil/batchSystems/options.py
CHANGED
@@ -185,6 +185,7 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         "systems such as gridengine, htcondor, torque, slurm, and lsf.",
     )
 
+    # TODO: Move this to Slurm specifically.
     parser.add_argument(
         "--memoryIsProduct",
         dest="memory_is_product",
toil/batchSystems/slurm.py
CHANGED
@@ -13,13 +13,14 @@
 # limitations under the License.
 from __future__ import annotations
 
+import errno
 import logging
 import math
 import os
 import sys
 from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
-
-from typing import NamedTuple, TypeVar
+import shlex
+from typing import Callable, NamedTuple, TypeVar
 
 from toil.batchSystems.abstractBatchSystem import (
     EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -185,6 +186,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     def get_partition(self, time_limit: float | None) -> str | None:
         """
         Get the partition name to use for a job with the given time limit.
+
+        :param time_limit: Time limit in seconds.
         """
 
         if time_limit is None:
@@ -193,17 +196,36 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
         winning_partition = None
         for partition in self.all_partitions:
-            if partition.time_limit
-
-
-
-            #
+            if partition.time_limit < time_limit:
+                # Can't use this
+                continue
+            if winning_partition is None:
+                # Anything beats None
                 winning_partition = partition
+                continue
+            if partition.gres and not winning_partition.gres:
+                # Never use a partition with GRES if you can avoid it
+                continue
+            elif not partition.gres and winning_partition.gres:
+                # Never keep a partition with GRES if we find one without
+                winning_partition = partition
+                continue
+            if partition.priority > winning_partition.priority:
+                # After that, don't raise priority
+                continue
+            elif partition.priority < winning_partition.priority:
+                # And always lower it
+                winning_partition = partition
+                continue
+            if partition.time_limit < winning_partition.time_limit:
+                # Finally, lower time limit
+                winning_partition = partition
+
         # TODO: Store partitions in a better indexed way
         if winning_partition is None and len(self.all_partitions) > 0:
             # We have partitions and none of them can fit this
             raise RuntimeError(
-                "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                f"Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
             )
 
         if winning_partition is None:
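To restate the selection order the loop above implements: partitions that cannot fit the job's time limit are skipped outright, then a partition without GRES always beats one with GRES, then lower Slurm priority wins, then a shorter time limit wins. A small standalone sketch (a hypothetical `FakePartition` type, not Toil's partition records) that reproduces the same tie-breaking:

```python
# Standalone illustration of the partition-selection order above.
from typing import NamedTuple, Optional


class FakePartition(NamedTuple):
    partition_name: str
    time_limit: float  # longest job, in seconds, the partition allows
    gres: bool         # whether the partition has generic resources (e.g. GPUs)
    priority: int


def pick(partitions: list[FakePartition], time_limit: float) -> Optional[FakePartition]:
    best = None
    for p in partitions:
        if p.time_limit < time_limit:
            continue  # cannot fit the job at all
        if best is None or (p.gres, p.priority, p.time_limit) < (best.gres, best.priority, best.time_limit):
            # False (no GRES) sorts before True, then lower priority,
            # then shorter time limit -- the same ordering as the loop above.
            best = p
    return best


partitions = [
    FakePartition("gpu-long", 86400, True, 10),
    FakePartition("short", 3600, False, 10),
    FakePartition("long", 86400, False, 20),
]
# "short" cannot fit a 2-hour job; "long" beats "gpu-long" because it has no GRES.
print(pick(partitions, 7200).partition_name)  # -> long
```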
@@ -344,7 +366,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         """
         try:
             status_dict = self._getJobDetailsFromSacct(job_id_list)
-        except CalledProcessErrorStderr:
+        except (CalledProcessErrorStderr, OSError) as e:
+            if isinstance(e, OSError):
+                logger.warning("Could not run sacct: %s", e)
             status_dict = self._getJobDetailsFromScontrol(job_id_list)
         return status_dict
 
@@ -437,11 +461,25 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             "-S",
             "1970-01-01",
         ]  # override start time limit
-        stdout = call_command(args, quiet=True)
 
         # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
         # job state and exit status. Initialize dict before processing output of `sacct`.
         job_statuses: dict[int, tuple[str | None, int | None]] = {}
+
+        try:
+            stdout = call_command(args, quiet=True)
+        except OSError as e:
+            if e.errno == errno.E2BIG:
+                # Argument list is too big, recurse on half the argument list
+                if len(job_id_list) == 1:
+                    # 1 is too big, we can't recurse further, bail out
+                    raise
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
+                return job_statuses
+            else:
+                raise
+
         for job_id in job_id_list:
             job_statuses[job_id] = (None, None)
 
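The `E2BIG` branch above is a generic "command line too long, so split and retry" pattern: `sacct` is re-run on each half of the job ID list and the results are merged. A self-contained sketch of the same idea, using `subprocess` directly rather than Toil's `call_command` helper (the flag choice here is illustrative):

```python
# Generic sketch of the halving retry used above: if the OS rejects the
# command line as too long (E2BIG), query each half separately and merge.
import errno
import subprocess


def sacct_lines(job_ids: list[int]) -> list[str]:
    """Return sacct output lines for job_ids, splitting the list on E2BIG."""
    try:
        result = subprocess.run(
            ["sacct", "-j", ",".join(str(j) for j in job_ids)],
            capture_output=True, text=True, check=True,
        )
        return result.stdout.splitlines()
    except OSError as e:
        if e.errno != errno.E2BIG or len(job_ids) == 1:
            raise  # not an argument-length problem, or cannot split further
        mid = len(job_ids) // 2
        return sacct_lines(job_ids[:mid]) + sacct_lines(job_ids[mid:])
```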
@@ -609,104 +647,194 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
         nativeConfig: str = self.boss.config.slurm_args  # type: ignore[attr-defined]
 
+        # For parsing user-provided option overrides (or self-generated
+        # options) we need a way to recognize long, long-with-equals, and
+        # short forms.
+        def option_detector(long: str, short: str | None = None) -> Callable[[str], bool]:
+            """
+            Get a function that returns true if it sees the long or short
+            option.
+            """
+            def is_match(option: str) -> bool:
+                return option == f"--{long}" or option.startswith(f"--{long}=") or (short is not None and option == f"-{short}")
+            return is_match
+
+        def any_option_detector(options: list[str | tuple[str, str]]) -> Callable[[str], bool]:
+            """
+            Get a function that returns true if it sees any of the long
+            options or long or short option pairs.
+            """
+            detectors = [option_detector(o) if isinstance(o, str) else option_detector(*o) for o in options]
+            def is_match(option: str) -> bool:
+                for detector in detectors:
+                    if detector(option):
+                        return True
+                return False
+            return is_match
+
+        is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
+        is_any_cpus_option = any_option_detector([("cpus-per-task", "c"), "cpus-per-gpu"])
+        is_export_option = option_detector("export")
+        is_export_file_option = option_detector("export-file")
+        is_time_option = option_detector("time", "t")
+        is_partition_option = option_detector("partition", "p")
+
+        # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or
+        # with our own determinations if they aren't there.
+
         # --export=[ALL,]<environment_toil_variables>
-
+        export_all = True
+        export_list = []  # Some items here may be multiple comma-separated values
+        time_limit: int | None = self.boss.config.slurm_time  # type: ignore[attr-defined]
+        partition: str | None = None
 
         if nativeConfig is not None:
             logger.debug(
                 "Native SLURM options appended to sbatch: %s", nativeConfig
             )
 
-
-
+            # Do a mini argument parse to pull out export and parse time if
+            # needed
+            args = shlex.split(nativeConfig)
+            i = 0
+            while i < len(args):
+                arg = args[i]
+                if is_any_mem_option(arg) or is_any_cpus_option(arg):
+                    # Prohibit arguments that set CPUs or memory
                     raise ValueError(
-                        f"
+                        f"Cannot use Slurm argument {arg} which conflicts "
+                        f"with Toil's own arguments to Slurm"
                     )
-
-
-
-
+                elif is_export_option(arg):
+                    # Capture the export argument value so we can modify it
+                    export_all = False
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        export_list.append(args[i])
+                    else:
+                        export_list.append(arg.split("=", 1)[1])
+                elif is_export_file_option(arg):
+                    # Keep --export-file but turn off --export=ALL in that
+                    # case.
+                    export_all = False
+                    sbatch_line.append(arg)
+                elif is_time_option(arg):
+                    # Capture the time limit in seconds so we can use it for picking a partition
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        time_string = args[i]
+                    else:
+                        time_string = arg.split("=", 1)[1]
+                    time_limit = parse_slurm_time(time_string)
+                elif is_partition_option(arg):
+                    # Capture the partition so we can run checks on it and know not to assign one
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        partition = args[i]
+                    else:
+                        partition = arg.split("=", 1)[1]
+                else:
+                    # Other arguments pass through.
+                    sbatch_line.append(arg)
+                i += 1
+
+        if export_all:
+            # We don't have any export overrides so we need to start with
+            # an ALL
+            export_list.append("ALL")
 
         if environment:
             argList = []
 
             for k, v in environment.items():
-
+                # TODO: The sbatch man page doesn't say we can quote these;
+                # if we need to send characters like , itself we need to
+                # use --export-file and clean it up when the command has
+                # been issued.
+                quoted_value = shlex.quote(os.environ[k] if v is None else v)
                 argList.append(f"{k}={quoted_value}")
 
-
-
-            #
-
-
-
-
-
-
+            export_list.extend(argList)
+
+        # If partition isn't set and we have a GPU partition override
+        # that applies, apply it
+        gpu_partition_override: str | None = self.boss.config.slurm_gpu_partition  # type: ignore[attr-defined]
+        if partition is None and gpus and gpu_partition_override:
+            partition = gpu_partition_override
+
+        # If partition isn't set and we have a parallel partition override
+        # that applies, apply it
+        parallel_env: str | None = self.boss.config.slurm_pe  # type: ignore[attr-defined]
+        if partition is None and cpu and cpu > 1 and parallel_env:
+            partition = parallel_env
+
+        # If partition isn't set and we have a general partition override
+        # that applies, apply it
+        partition_override: str | None = self.boss.config.slurm_partition  # type: ignore[attr-defined]
+        if partition is None and partition_override:
+            partition = partition_override
+
+        if partition is None and gpus:
+            # Send to a GPU partition
+            gpu_partition = self.boss.partitions.default_gpu_partition
+            if gpu_partition is None:
+                # no gpu partitions are available, raise an error
+                raise RuntimeError(
+                    f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                )
+            if (
+                time_limit is not None
+                and gpu_partition.time_limit < time_limit
+            ):
+                # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                logger.warning(
+                    "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                    time_limit,
+                    gpu_partition.partition_name,
+                    gpu_partition.time_limit,
+                )
+            partition = gpu_partition.partition_name
+
+        if partition is None:
+            # Pick a partition based on time limit
+            partition = self.boss.partitions.get_partition(time_limit)
+
+        # Now generate all the arguments
+        if len(export_list) > 0:
+            # add --export to the sbatch
+            sbatch_line.append("--export=" + ",".join(export_list))
+        if partition is not None:
+            sbatch_line.append(f"--partition={partition}")
+        if gpus:
+            # Generate GPU assignment argument
+            sbatch_line.append(f"--gres=gpu:{gpus}")
+            if partition is not None and partition not in self.boss.partitions.gpu_partitions:
+                # the specified partition is not compatible, so warn the user that the job may not work
+                logger.warning(
+                    f"Job {jobName} needs GPUs, but specified partition {partition} does not have them. This job may not work."
+                    f"Try specifying one of these partitions instead: {', '.join(self.boss.partitions.gpu_partitions)}."
+                )
         if mem is not None and self.boss.config.slurm_allocate_mem:  # type: ignore[attr-defined]
             # memory passed in is in bytes, but slurm expects megabytes
             sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
         if cpu is not None:
             sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
-
-        time_limit: int = self.boss.config.slurm_time  # type: ignore[attr-defined]
         if time_limit is not None:
             # Put all the seconds in the seconds slot
             sbatch_line.append(f"--time=0:{time_limit}")
 
-        if gpus:
-            # This block will add a gpu supported partition only if no partition is supplied by the user
-            sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
-            if not any(option.startswith("--partition") for option in sbatch_line):
-                # no partition specified, so specify one
-                # try to get the name of the lowest priority gpu supported partition
-                lowest_gpu_partition = self.boss.partitions.default_gpu_partition
-                if lowest_gpu_partition is None:
-                    # no gpu partitions are available, raise an error
-                    raise RuntimeError(
-                        f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
-                    )
-                if (
-                    time_limit is not None
-                    and lowest_gpu_partition.time_limit < time_limit
-                ):
-                    # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
-                    logger.warning(
-                        "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
-                        time_limit,
-                        lowest_gpu_partition.partition_name,
-                        lowest_gpu_partition.time_limit,
-                    )
-                sbatch_line.append(
-                    f"--partition={lowest_gpu_partition.partition_name}"
-                )
-            else:
-                # there is a partition specified already, check if the partition has GPUs
-                for i, option in enumerate(sbatch_line):
-                    if option.startswith("--partition"):
-                        # grab the partition name depending on if it's specified via an "=" or a space
-                        if "=" in option:
-                            partition_name = option[len("--partition=") :]
-                        else:
-                            partition_name = option[i + 1]
-                        available_gpu_partitions = (
-                            self.boss.partitions.gpu_partitions
-                        )
-                        if partition_name not in available_gpu_partitions:
-                            # the specified partition is not compatible, so warn the user that the job may not work
-                            logger.warning(
-                                f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
-                                f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
-                            )
-                        break
-
-        if not any(option.startswith("--partition") for option in sbatch_line):
-            # Pick a partition ourselves
-            chosen_partition = self.boss.partitions.get_partition(time_limit)
-            if chosen_partition is not None:
-                # Route to that partition
-                sbatch_line.append(f"--partition={chosen_partition}")
-
         stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
         stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
         sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
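The `option_detector` helpers introduced above exist because a user can spell the same sbatch option three ways in `TOIL_SLURM_ARGS`: `--time 02:00:00`, `--time=02:00:00`, or `-t 02:00:00`. A quick standalone restatement of the matcher from the hunk, just to show what it does and does not accept:

```python
# Restates the matching logic from the hunk above (standalone, for illustration).
from typing import Callable, Optional


def option_detector(long: str, short: Optional[str] = None) -> Callable[[str], bool]:
    def is_match(option: str) -> bool:
        return (
            option == f"--{long}"                              # --time VALUE
            or option.startswith(f"--{long}=")                 # --time=VALUE
            or (short is not None and option == f"-{short}")   # -t VALUE
        )
    return is_match


is_time_option = option_detector("time", "t")
print(is_time_option("--time"))           # True; value expected in the next token
print(is_time_option("--time=02:00:00"))  # True; inline value
print(is_time_option("-t"))               # True; short form
print(is_time_option("--time-min"))       # False; a different sbatch option
```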
@@ -830,6 +958,20 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             env_var="TOIL_SLURM_TIME",
             help="Slurm job time limit, in [DD-]HH:MM:SS format.",
         )
+        parser.add_argument(
+            "--slurmPartition",
+            dest="slurm_partition",
+            default=None,
+            env_var="TOIL_SLURM_PARTITION",
+            help="Partition to send Slurm jobs to.",
+        )
+        parser.add_argument(
+            "--slurmGPUPartition",
+            dest="slurm_gpu_partition",
+            default=None,
+            env_var="TOIL_SLURM_GPU_PARTITION",
+            help="Partition to send Slurm jobs to if they ask for GPUs.",
+        )
         parser.add_argument(
             "--slurmPE",
             dest="slurm_pe",
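Each new flag is paired with an environment variable via `env_var=`, which is not a stock `argparse` keyword; Toil's option parsing is built on configargparse, where that keyword lets `TOIL_SLURM_PARTITION` / `TOIL_SLURM_GPU_PARTITION` act as defaults. A minimal standalone sketch of that behaviour (illustrative parser setup, not Toil's actual one):

```python
# Illustrative only: configargparse supports env_var=, plain argparse does not.
import configargparse

parser = configargparse.ArgParser()
parser.add_argument(
    "--slurmPartition",
    dest="slurm_partition",
    default=None,
    env_var="TOIL_SLURM_PARTITION",
    help="Partition to send Slurm jobs to.",
)

# With TOIL_SLURM_PARTITION=medium set in the environment and no flag given,
# opts.slurm_partition comes back as "medium"; an explicit --slurmPartition
# on the command line still wins.
opts = parser.parse_args([])
print(opts.slurm_partition)
```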
@@ -852,5 +994,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         setOption("slurm_allocate_mem")
         setOption("slurm_default_all_mem")
         setOption("slurm_time")
+        setOption("slurm_partition")
+        setOption("slurm_gpu_partition")
         setOption("slurm_pe")
         setOption("slurm_args")