toil 8.0.0__py3-none-any.whl → 8.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +4 -39
- toil/batchSystems/abstractBatchSystem.py +1 -1
- toil/batchSystems/abstractGridEngineBatchSystem.py +1 -1
- toil/batchSystems/awsBatch.py +1 -1
- toil/batchSystems/cleanup_support.py +1 -1
- toil/batchSystems/kubernetes.py +53 -7
- toil/batchSystems/local_support.py +1 -1
- toil/batchSystems/mesos/batchSystem.py +13 -8
- toil/batchSystems/mesos/test/__init__.py +3 -2
- toil/batchSystems/options.py +1 -0
- toil/batchSystems/singleMachine.py +1 -1
- toil/batchSystems/slurm.py +229 -84
- toil/bus.py +5 -3
- toil/common.py +198 -54
- toil/cwl/cwltoil.py +32 -11
- toil/job.py +110 -86
- toil/jobStores/abstractJobStore.py +24 -3
- toil/jobStores/aws/jobStore.py +46 -10
- toil/jobStores/fileJobStore.py +25 -1
- toil/jobStores/googleJobStore.py +104 -30
- toil/leader.py +9 -0
- toil/lib/accelerators.py +3 -1
- toil/lib/aws/session.py +14 -3
- toil/lib/aws/utils.py +92 -35
- toil/lib/aws/utils.py.orig +504 -0
- toil/lib/bioio.py +1 -1
- toil/lib/docker.py +252 -91
- toil/lib/dockstore.py +387 -0
- toil/lib/ec2nodes.py +3 -2
- toil/lib/exceptions.py +5 -3
- toil/lib/history.py +1345 -0
- toil/lib/history_submission.py +695 -0
- toil/lib/io.py +56 -23
- toil/lib/misc.py +25 -1
- toil/lib/resources.py +2 -1
- toil/lib/retry.py +10 -10
- toil/lib/threading.py +11 -10
- toil/lib/{integration.py → trs.py} +95 -46
- toil/lib/web.py +38 -0
- toil/options/common.py +25 -2
- toil/options/cwl.py +10 -0
- toil/options/wdl.py +11 -0
- toil/provisioners/gceProvisioner.py +4 -4
- toil/server/api_spec/LICENSE +201 -0
- toil/server/api_spec/README.rst +5 -0
- toil/server/cli/wes_cwl_runner.py +5 -4
- toil/server/utils.py +2 -3
- toil/statsAndLogging.py +35 -1
- toil/test/__init__.py +275 -115
- toil/test/batchSystems/batchSystemTest.py +227 -205
- toil/test/batchSystems/test_slurm.py +199 -2
- toil/test/cactus/pestis.tar.gz +0 -0
- toil/test/conftest.py +7 -0
- toil/test/cwl/2.fasta +11 -0
- toil/test/cwl/2.fastq +12 -0
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +1015 -780
- toil/test/cwl/directory/directory/file.txt +15 -0
- toil/test/cwl/download_directory_file.json +4 -0
- toil/test/cwl/download_directory_s3.json +4 -0
- toil/test/cwl/download_file.json +6 -0
- toil/test/cwl/download_http.json +6 -0
- toil/test/cwl/download_https.json +6 -0
- toil/test/cwl/download_s3.json +6 -0
- toil/test/cwl/download_subdirectory_file.json +5 -0
- toil/test/cwl/download_subdirectory_s3.json +5 -0
- toil/test/cwl/empty.json +1 -0
- toil/test/cwl/mock_mpi/fake_mpi.yml +8 -0
- toil/test/cwl/mock_mpi/fake_mpi_run.py +42 -0
- toil/test/cwl/optional-file-exists.json +6 -0
- toil/test/cwl/optional-file-missing.json +6 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/preemptible_expression.json +1 -0
- toil/test/cwl/revsort-job-missing.json +6 -0
- toil/test/cwl/revsort-job.json +6 -0
- toil/test/cwl/s3_secondary_file.json +16 -0
- toil/test/cwl/seqtk_seq_job.json +6 -0
- toil/test/cwl/stream.json +6 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.dat +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f0 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f1 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f1i +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f2 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f2_TSM0 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f3 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f3_TSM0 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f4 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f4_TSM0 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.f5 +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.info +0 -0
- toil/test/cwl/test_filename_conflict_resolution.ms/table.lock +0 -0
- toil/test/cwl/whale.txt +16 -0
- toil/test/docs/scripts/example_alwaysfail.py +38 -0
- toil/test/docs/scripts/example_alwaysfail_with_files.wdl +33 -0
- toil/test/docs/scripts/example_cachingbenchmark.py +117 -0
- toil/test/docs/scripts/stagingExampleFiles/in.txt +1 -0
- toil/test/docs/scripts/stagingExampleFiles/out.txt +2 -0
- toil/test/docs/scripts/tutorial_arguments.py +23 -0
- toil/test/docs/scripts/tutorial_debugging.patch +12 -0
- toil/test/docs/scripts/tutorial_debugging_hangs.wdl +126 -0
- toil/test/docs/scripts/tutorial_debugging_works.wdl +129 -0
- toil/test/docs/scripts/tutorial_docker.py +20 -0
- toil/test/docs/scripts/tutorial_dynamic.py +24 -0
- toil/test/docs/scripts/tutorial_encapsulation.py +28 -0
- toil/test/docs/scripts/tutorial_encapsulation2.py +29 -0
- toil/test/docs/scripts/tutorial_helloworld.py +15 -0
- toil/test/docs/scripts/tutorial_invokeworkflow.py +27 -0
- toil/test/docs/scripts/tutorial_invokeworkflow2.py +30 -0
- toil/test/docs/scripts/tutorial_jobfunctions.py +22 -0
- toil/test/docs/scripts/tutorial_managing.py +29 -0
- toil/test/docs/scripts/tutorial_managing2.py +56 -0
- toil/test/docs/scripts/tutorial_multiplejobs.py +25 -0
- toil/test/docs/scripts/tutorial_multiplejobs2.py +21 -0
- toil/test/docs/scripts/tutorial_multiplejobs3.py +22 -0
- toil/test/docs/scripts/tutorial_promises.py +25 -0
- toil/test/docs/scripts/tutorial_promises2.py +30 -0
- toil/test/docs/scripts/tutorial_quickstart.py +22 -0
- toil/test/docs/scripts/tutorial_requirements.py +44 -0
- toil/test/docs/scripts/tutorial_services.py +45 -0
- toil/test/docs/scripts/tutorial_staging.py +45 -0
- toil/test/docs/scripts/tutorial_stats.py +64 -0
- toil/test/lib/aws/test_iam.py +3 -1
- toil/test/lib/dockerTest.py +205 -122
- toil/test/lib/test_history.py +236 -0
- toil/test/lib/test_trs.py +161 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +12 -9
- toil/test/provisioners/clusterTest.py +4 -4
- toil/test/provisioners/gceProvisionerTest.py +16 -14
- toil/test/sort/sort.py +4 -1
- toil/test/src/busTest.py +17 -17
- toil/test/src/deferredFunctionTest.py +145 -132
- toil/test/src/importExportFileTest.py +71 -63
- toil/test/src/jobEncapsulationTest.py +27 -28
- toil/test/src/jobServiceTest.py +149 -133
- toil/test/src/jobTest.py +219 -211
- toil/test/src/miscTests.py +66 -60
- toil/test/src/promisedRequirementTest.py +163 -169
- toil/test/src/regularLogTest.py +24 -24
- toil/test/src/resourceTest.py +82 -76
- toil/test/src/restartDAGTest.py +51 -47
- toil/test/src/resumabilityTest.py +24 -19
- toil/test/src/retainTempDirTest.py +60 -57
- toil/test/src/systemTest.py +17 -13
- toil/test/src/threadingTest.py +29 -32
- toil/test/utils/ABCWorkflowDebug/B_file.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +204 -0
- toil/test/utils/ABCWorkflowDebug/mkFile.py +16 -0
- toil/test/utils/ABCWorkflowDebug/sleep.cwl +12 -0
- toil/test/utils/ABCWorkflowDebug/sleep.yaml +1 -0
- toil/test/utils/toilDebugTest.py +117 -102
- toil/test/utils/toilKillTest.py +54 -53
- toil/test/utils/utilsTest.py +303 -229
- toil/test/wdl/lint_error.wdl +9 -0
- toil/test/wdl/md5sum/empty_file.json +1 -0
- toil/test/wdl/md5sum/md5sum-gs.json +1 -0
- toil/test/wdl/md5sum/md5sum.1.0.wdl +32 -0
- toil/test/wdl/md5sum/md5sum.input +1 -0
- toil/test/wdl/md5sum/md5sum.json +1 -0
- toil/test/wdl/md5sum/md5sum.wdl +25 -0
- toil/test/wdl/miniwdl_self_test/inputs-namespaced.json +1 -0
- toil/test/wdl/miniwdl_self_test/inputs.json +1 -0
- toil/test/wdl/miniwdl_self_test/self_test.wdl +40 -0
- toil/test/wdl/standard_library/as_map.json +16 -0
- toil/test/wdl/standard_library/as_map_as_input.wdl +23 -0
- toil/test/wdl/standard_library/as_pairs.json +7 -0
- toil/test/wdl/standard_library/as_pairs_as_input.wdl +23 -0
- toil/test/wdl/standard_library/ceil.json +3 -0
- toil/test/wdl/standard_library/ceil_as_command.wdl +16 -0
- toil/test/wdl/standard_library/ceil_as_input.wdl +16 -0
- toil/test/wdl/standard_library/collect_by_key.json +1 -0
- toil/test/wdl/standard_library/collect_by_key_as_input.wdl +23 -0
- toil/test/wdl/standard_library/cross.json +11 -0
- toil/test/wdl/standard_library/cross_as_input.wdl +19 -0
- toil/test/wdl/standard_library/flatten.json +7 -0
- toil/test/wdl/standard_library/flatten_as_input.wdl +18 -0
- toil/test/wdl/standard_library/floor.json +3 -0
- toil/test/wdl/standard_library/floor_as_command.wdl +16 -0
- toil/test/wdl/standard_library/floor_as_input.wdl +16 -0
- toil/test/wdl/standard_library/keys.json +8 -0
- toil/test/wdl/standard_library/keys_as_input.wdl +24 -0
- toil/test/wdl/standard_library/length.json +7 -0
- toil/test/wdl/standard_library/length_as_input.wdl +16 -0
- toil/test/wdl/standard_library/length_as_input_with_map.json +7 -0
- toil/test/wdl/standard_library/length_as_input_with_map.wdl +17 -0
- toil/test/wdl/standard_library/length_invalid.json +3 -0
- toil/test/wdl/standard_library/range.json +3 -0
- toil/test/wdl/standard_library/range_0.json +3 -0
- toil/test/wdl/standard_library/range_as_input.wdl +17 -0
- toil/test/wdl/standard_library/range_invalid.json +3 -0
- toil/test/wdl/standard_library/read_boolean.json +3 -0
- toil/test/wdl/standard_library/read_boolean_as_command.wdl +17 -0
- toil/test/wdl/standard_library/read_float.json +3 -0
- toil/test/wdl/standard_library/read_float_as_command.wdl +17 -0
- toil/test/wdl/standard_library/read_int.json +3 -0
- toil/test/wdl/standard_library/read_int_as_command.wdl +17 -0
- toil/test/wdl/standard_library/read_json.json +3 -0
- toil/test/wdl/standard_library/read_json_as_output.wdl +31 -0
- toil/test/wdl/standard_library/read_lines.json +3 -0
- toil/test/wdl/standard_library/read_lines_as_output.wdl +31 -0
- toil/test/wdl/standard_library/read_map.json +3 -0
- toil/test/wdl/standard_library/read_map_as_output.wdl +31 -0
- toil/test/wdl/standard_library/read_string.json +3 -0
- toil/test/wdl/standard_library/read_string_as_command.wdl +17 -0
- toil/test/wdl/standard_library/read_tsv.json +3 -0
- toil/test/wdl/standard_library/read_tsv_as_output.wdl +31 -0
- toil/test/wdl/standard_library/round.json +3 -0
- toil/test/wdl/standard_library/round_as_command.wdl +16 -0
- toil/test/wdl/standard_library/round_as_input.wdl +16 -0
- toil/test/wdl/standard_library/size.json +3 -0
- toil/test/wdl/standard_library/size_as_command.wdl +17 -0
- toil/test/wdl/standard_library/size_as_output.wdl +36 -0
- toil/test/wdl/standard_library/stderr.json +3 -0
- toil/test/wdl/standard_library/stderr_as_output.wdl +30 -0
- toil/test/wdl/standard_library/stdout.json +3 -0
- toil/test/wdl/standard_library/stdout_as_output.wdl +30 -0
- toil/test/wdl/standard_library/sub.json +3 -0
- toil/test/wdl/standard_library/sub_as_input.wdl +17 -0
- toil/test/wdl/standard_library/sub_as_input_with_file.wdl +17 -0
- toil/test/wdl/standard_library/transpose.json +6 -0
- toil/test/wdl/standard_library/transpose_as_input.wdl +18 -0
- toil/test/wdl/standard_library/write_json.json +6 -0
- toil/test/wdl/standard_library/write_json_as_command.wdl +17 -0
- toil/test/wdl/standard_library/write_lines.json +7 -0
- toil/test/wdl/standard_library/write_lines_as_command.wdl +17 -0
- toil/test/wdl/standard_library/write_map.json +6 -0
- toil/test/wdl/standard_library/write_map_as_command.wdl +17 -0
- toil/test/wdl/standard_library/write_tsv.json +6 -0
- toil/test/wdl/standard_library/write_tsv_as_command.wdl +17 -0
- toil/test/wdl/standard_library/zip.json +12 -0
- toil/test/wdl/standard_library/zip_as_input.wdl +19 -0
- toil/test/wdl/test.csv +3 -0
- toil/test/wdl/test.tsv +3 -0
- toil/test/wdl/testfiles/croo.wdl +38 -0
- toil/test/wdl/testfiles/drop_files.wdl +62 -0
- toil/test/wdl/testfiles/drop_files_subworkflow.wdl +13 -0
- toil/test/wdl/testfiles/empty.txt +0 -0
- toil/test/wdl/testfiles/not_enough_outputs.wdl +33 -0
- toil/test/wdl/testfiles/random.wdl +66 -0
- toil/test/wdl/testfiles/string_file_coercion.json +1 -0
- toil/test/wdl/testfiles/string_file_coercion.wdl +35 -0
- toil/test/wdl/testfiles/test.json +4 -0
- toil/test/wdl/testfiles/test_boolean.txt +1 -0
- toil/test/wdl/testfiles/test_float.txt +1 -0
- toil/test/wdl/testfiles/test_int.txt +1 -0
- toil/test/wdl/testfiles/test_lines.txt +5 -0
- toil/test/wdl/testfiles/test_map.txt +2 -0
- toil/test/wdl/testfiles/test_string.txt +1 -0
- toil/test/wdl/testfiles/url_to_file.wdl +13 -0
- toil/test/wdl/testfiles/url_to_optional_file.wdl +13 -0
- toil/test/wdl/testfiles/vocab.json +1 -0
- toil/test/wdl/testfiles/vocab.wdl +66 -0
- toil/test/wdl/testfiles/wait.wdl +34 -0
- toil/test/wdl/wdl_specification/type_pair.json +23 -0
- toil/test/wdl/wdl_specification/type_pair_basic.wdl +36 -0
- toil/test/wdl/wdl_specification/type_pair_with_files.wdl +36 -0
- toil/test/wdl/wdl_specification/v1_spec.json +1 -0
- toil/test/wdl/wdl_specification/v1_spec_declaration.wdl +39 -0
- toil/test/wdl/wdltoil_test.py +681 -408
- toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +350 -123
- toil/worker.py +113 -33
- {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/METADATA +13 -7
- toil-8.2.0.dist-info/RECORD +439 -0
- {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/WHEEL +1 -1
- toil/test/lib/test_integration.py +0 -104
- toil-8.0.0.dist-info/RECORD +0 -253
- {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/entry_points.txt +0 -0
- {toil-8.0.0.dist-info → toil-8.2.0.dist-info/licenses}/LICENSE +0 -0
- {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/slurm.py
CHANGED
@@ -13,13 +13,14 @@
 # limitations under the License.
 from __future__ import annotations
 
+import errno
 import logging
 import math
 import os
 import sys
 from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
-
-from typing import NamedTuple, TypeVar
+import shlex
+from typing import Callable, NamedTuple, TypeVar
 
 from toil.batchSystems.abstractBatchSystem import (
     EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -100,6 +101,32 @@ def parse_slurm_time(slurm_time: str) -> int:
         total_seconds += multiplier * int(elapsed_split[index])
     return total_seconds
 
+# For parsing user-provided option overrides (or self-generated
+# options) for sbatch, we need a way to recognize long, long-with-equals, and
+# short forms.
+def option_detector(long: str, short: str | None = None) -> Callable[[str], bool]:
+    """
+    Get a function that returns true if it sees the long or short
+    option.
+    """
+    def is_match(option: str) -> bool:
+        return option == f"--{long}" or option.startswith(f"--{long}=") or (short is not None and option == f"-{short}")
+    return is_match
+
+def any_option_detector(options: list[str | tuple[str, str]]) -> Callable[[str], bool]:
+    """
+    Get a function that returns true if it sees any of the long
+    options or long or short option pairs.
+    """
+    detectors = [option_detector(o) if isinstance(o, str) else option_detector(*o) for o in options]
+    def is_match(option: str) -> bool:
+        for detector in detectors:
+            if detector(option):
+                return True
+        return False
+    return is_match
+
+
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     class PartitionInfo(NamedTuple):
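The two helpers added in this hunk are plain closures, so their behavior can be checked in isolation. A minimal usage sketch, given the definitions above (the option strings are illustrative, not taken from the package):

    is_time = option_detector("time", "t")
    is_mem_like = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
    assert is_time("--time") and is_time("--time=01:00:00") and is_time("-t")
    assert is_mem_like("--mem=4G") and not is_mem_like("--partition=gpu")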
@@ -185,6 +212,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         def get_partition(self, time_limit: float | None) -> str | None:
             """
             Get the partition name to use for a job with the given time limit.
+
+            :param time_limit: Time limit in seconds.
             """
 
             if time_limit is None:
@@ -193,17 +222,36 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
             winning_partition = None
             for partition in self.all_partitions:
-                if partition.time_limit
-
-
-
-                #
+                if partition.time_limit < time_limit:
+                    # Can't use this
+                    continue
+                if winning_partition is None:
+                    # Anything beats None
+                    winning_partition = partition
+                    continue
+                if partition.gres and not winning_partition.gres:
+                    # Never use a partition witn GRES if you can avoid it
+                    continue
+                elif not partition.gres and winning_partition.gres:
+                    # Never keep a partition with GRES if we find one without
+                    winning_partition = partition
+                    continue
+                if partition.priority > winning_partition.priority:
+                    # After that, don't raise priority
+                    continue
+                elif partition.priority < winning_partition.priority:
+                    # And always lower it
+                    winning_partition = partition
+                    continue
+                if partition.time_limit < winning_partition.time_limit:
+                    # Finally, lower time limit
                     winning_partition = partition
+
             # TODO: Store partitions in a better indexed way
             if winning_partition is None and len(self.all_partitions) > 0:
                 # We have partitions and none of them can fit this
                 raise RuntimeError(
-                    "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                    f"Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
                 )
 
             if winning_partition is None:
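The selection loop in this hunk expresses a lexicographic preference: among partitions whose time limit can hold the job, prefer partitions without GRES, then lower Slurm priority, then the shortest time limit. A standalone sketch of the same ordering, assuming only the PartitionInfo fields referenced above (`partitions` and `needed_seconds` are hypothetical names, not code from the package):

    candidates = [p for p in partitions if p.time_limit >= needed_seconds]
    # min() returns the first of equally-ranked candidates, matching the loop's tie-breaking
    best = min(candidates, key=lambda p: (bool(p.gres), p.priority, p.time_limit), default=None)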
@@ -344,7 +392,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             """
             try:
                 status_dict = self._getJobDetailsFromSacct(job_id_list)
-            except CalledProcessErrorStderr:
+            except (CalledProcessErrorStderr, OSError) as e:
+                if isinstance(e, OSError):
+                    logger.warning("Could not run sacct: %s", e)
                 status_dict = self._getJobDetailsFromScontrol(job_id_list)
             return status_dict
 
@@ -437,11 +487,25 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
                 "-S",
                 "1970-01-01",
             ] # override start time limit
-            stdout = call_command(args, quiet=True)
 
             # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
             # job state and exit status. Initialize dict before processing output of `sacct`.
             job_statuses: dict[int, tuple[str | None, int | None]] = {}
+
+            try:
+                stdout = call_command(args, quiet=True)
+            except OSError as e:
+                if e.errno == errno.E2BIG:
+                    # Argument list is too big, recurse on half the argument list
+                    if len(job_id_list) == 1:
+                        # 1 is too big, we can't recurse further, bail out
+                        raise
+                    job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
+                    job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
+                    return job_statuses
+                else:
+                    raise
+
             for job_id in job_id_list:
                 job_statuses[job_id] = (None, None)
 
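The errno.E2BIG handling in this hunk is a bisecting retry: when the assembled sacct command line exceeds the kernel's argument-size limit, the job-id list is split in half and each half is queried separately. A self-contained sketch of the same pattern (`run_for` and `items` are hypothetical, not part of Toil):

    import errno

    def query_bisecting(items: list[str]) -> dict[str, str]:
        # Hypothetical: run_for() runs one command over all items and raises
        # OSError with errno.E2BIG when the argument list is too long.
        try:
            return run_for(items)
        except OSError as e:
            if e.errno != errno.E2BIG or len(items) == 1:
                raise
            mid = len(items) // 2
            results = query_bisecting(items[:mid])
            results.update(query_bisecting(items[mid:]))
            return results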
@@ -609,104 +673,169 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
             nativeConfig: str = self.boss.config.slurm_args # type: ignore[attr-defined]
 
+            is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
+            is_any_cpus_option = any_option_detector([("cpus-per-task", "c"), "cpus-per-gpu"])
+            is_export_option = option_detector("export")
+            is_export_file_option = option_detector("export-file")
+            is_time_option = option_detector("time", "t")
+            is_partition_option = option_detector("partition", "p")
+
+            # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or
+            # with our own determinations if they aren't there.
+
             # --export=[ALL,]<environment_toil_variables>
-
+            export_all = True
+            export_list = [] # Some items here may be multiple comma-separated values
+            time_limit: int | None = self.boss.config.slurm_time # type: ignore[attr-defined]
+            partition: str | None = None
 
             if nativeConfig is not None:
                 logger.debug(
                     "Native SLURM options appended to sbatch: %s", nativeConfig
                 )
 
-
-
+                # Do a mini argument parse to pull out export and parse time if
+                # needed
+                args = shlex.split(nativeConfig)
+                i = 0
+                while i < len(args):
+                    arg = args[i]
+                    if is_any_mem_option(arg) or is_any_cpus_option(arg):
+                        # Prohibit arguments that set CPUs or memory
                         raise ValueError(
-                            f"
+                            f"Cannot use Slurm argument {arg} which conflicts "
+                            f"with Toil's own arguments to Slurm"
                         )
-
-
-
-
+                    elif is_export_option(arg):
+                        # Capture the export argument value so we can modify it
+                        export_all = False
+                        if "=" not in arg:
+                            if i + 1 >= len(args):
+                                raise ValueError(
+                                    f"No value supplied for Slurm {arg} argument"
+                                )
+                            i += 1
+                            export_list.append(args[i])
+                        else:
+                            export_list.append(arg.split("=", 1)[1])
+                    elif is_export_file_option(arg):
+                        # Keep --export-file but turn off --export=ALL in that
+                        # case.
+                        export_all = False
+                        sbatch_line.append(arg)
+                    elif is_time_option(arg):
+                        # Capture the time limit in seconds so we can use it for picking a partition
+                        if "=" not in arg:
+                            if i + 1 >= len(args):
+                                raise ValueError(
+                                    f"No value supplied for Slurm {arg} argument"
+                                )
+                            i += 1
+                            time_string = args[i]
+                        else:
+                            time_string = arg.split("=", 1)[1]
+                        time_limit = parse_slurm_time(time_string)
+                    elif is_partition_option(arg):
+                        # Capture the partition so we can run checks on it and know not to assign one
+                        if "=" not in arg:
+                            if i + 1 >= len(args):
+                                raise ValueError(
+                                    f"No value supplied for Slurm {arg} argument"
+                                )
+                            i += 1
+                            partition = args[i]
+                        else:
+                            partition = arg.split("=", 1)[1]
+                    else:
+                        # Other arguments pass through.
+                        sbatch_line.append(arg)
+                    i += 1
+
+            if export_all:
+                # We don't have any export overrides so we ened to start with
+                # an ALL
+                export_list.append("ALL")
 
             if environment:
                 argList = []
 
                 for k, v in environment.items():
-
+                    # TODO: The sbatch man page doesn't say we can quote these;
+                    # if we need to send characters like , itself we need to
+                    # use --export-file and clean it up when the command has
+                    # been issued.
+                    quoted_value = shlex.quote(os.environ[k] if v is None else v)
                     argList.append(f"{k}={quoted_value}")
 
-
-
-                #
-
-
-
-
-
-
+                export_list.extend(argList)
+
+            # If partition isn't set and we have a GPU partition override
+            # that applies, apply it
+            gpu_partition_override: str | None = self.boss.config.slurm_gpu_partition # type: ignore[attr-defined]
+            if partition is None and gpus and gpu_partition_override:
+                partition = gpu_partition_override
+
+            # If partition isn't set and we have a parallel partition override
+            # that applies, apply it
+            parallel_env: str | None = self.boss.config.slurm_pe # type: ignore[attr-defined]
+            if partition is None and cpu and cpu > 1 and parallel_env:
+                partition = parallel_env
+
+            # If partition isn't set and we have a general partition override
+            # that applies, apply it
+            partition_override: str | None = self.boss.config.slurm_partition # type: ignore[attr-defined]
+            if partition is None and partition_override:
+                partition = partition_override
+
+            if partition is None and gpus:
+                # Send to a GPU partition
+                gpu_partition = self.boss.partitions.default_gpu_partition
+                if gpu_partition is None:
+                    # no gpu partitions are available, raise an error
+                    raise RuntimeError(
+                        f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                    )
+                if (
+                    time_limit is not None
+                    and gpu_partition.time_limit < time_limit
+                ):
+                    # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                    logger.warning(
+                        "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                        time_limit,
+                        gpu_partition.partition_name,
+                        gpu_partition.time_limit,
+                    )
+                partition = gpu_partition.partition_name
+
+            if partition is None:
+                # Pick a partition based on time limit
+                partition = self.boss.partitions.get_partition(time_limit)
+
+            # Now generate all the arguments
+            if len(export_list) > 0:
+                # add --export to the sbatch
+                sbatch_line.append("--export=" + ",".join(export_list))
+            if partition is not None:
+                sbatch_line.append(f"--partition={partition}")
+            if gpus:
+                # Generate GPU assignment argument
+                sbatch_line.append(f"--gres=gpu:{gpus}")
+                if partition is not None and partition not in self.boss.partitions.gpu_partitions:
+                    # the specified partition is not compatible, so warn the user that the job may not work
+                    logger.warning(
+                        f"Job {jobName} needs GPUs, but specified partition {partition} does not have them. This job may not work."
+                        f"Try specifying one of these partitions instead: {', '.join(self.boss.partitions.gpu_partitions)}."
+                    )
             if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined]
                 # memory passed in is in bytes, but slurm expects megabytes
                 sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
             if cpu is not None:
                 sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
-
-            time_limit: int = self.boss.config.slurm_time # type: ignore[attr-defined]
             if time_limit is not None:
                 # Put all the seconds in the seconds slot
                 sbatch_line.append(f"--time=0:{time_limit}")
 
-            if gpus:
-                # This block will add a gpu supported partition only if no partition is supplied by the user
-                sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
-                if not any(option.startswith("--partition") for option in sbatch_line):
-                    # no partition specified, so specify one
-                    # try to get the name of the lowest priority gpu supported partition
-                    lowest_gpu_partition = self.boss.partitions.default_gpu_partition
-                    if lowest_gpu_partition is None:
-                        # no gpu partitions are available, raise an error
-                        raise RuntimeError(
-                            f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
-                        )
-                    if (
-                        time_limit is not None
-                        and lowest_gpu_partition.time_limit < time_limit
-                    ):
-                        # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
-                        logger.warning(
-                            "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
-                            time_limit,
-                            lowest_gpu_partition.partition_name,
-                            lowest_gpu_partition.time_limit,
-                        )
-                    sbatch_line.append(
-                        f"--partition={lowest_gpu_partition.partition_name}"
-                    )
-                else:
-                    # there is a partition specified already, check if the partition has GPUs
-                    for i, option in enumerate(sbatch_line):
-                        if option.startswith("--partition"):
-                            # grab the partition name depending on if it's specified via an "=" or a space
-                            if "=" in option:
-                                partition_name = option[len("--partition=") :]
-                            else:
-                                partition_name = option[i + 1]
-                            available_gpu_partitions = (
-                                self.boss.partitions.gpu_partitions
-                            )
-                            if partition_name not in available_gpu_partitions:
-                                # the specified partition is not compatible, so warn the user that the job may not work
-                                logger.warning(
-                                    f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
-                                    f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
-                                )
-                            break
-
-            if not any(option.startswith("--partition") for option in sbatch_line):
-                # Pick a partition ourselves
-                chosen_partition = self.boss.partitions.get_partition(time_limit)
-                if chosen_partition is not None:
-                    # Route to that partition
-                    sbatch_line.append(f"--partition={chosen_partition}")
-
             stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
             stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
             sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
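To make the TOIL_SLURM_ARGS mini-parse in this hunk concrete, here is an illustration of the tokenization it starts from (the argument string is invented for the example):

    import shlex
    tokens = shlex.split("--export=FOO,BAR -t 2:00:00 --qos=long")
    # tokens == ['--export=FOO,BAR', '-t', '2:00:00', '--qos=long']
    # With the loop above, "FOO,BAR" would land in export_list, "2:00:00" would be
    # handed to parse_slurm_time() to set the time limit, and "--qos=long" would
    # pass through to sbatch_line unchanged.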
@@ -714,7 +843,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             return sbatch_line
 
     def __init__(
-        self, config: Config, maxCores: float, maxMemory:
+        self, config: Config, maxCores: float, maxMemory: float, maxDisk: float
     ) -> None:
         super().__init__(config, maxCores, maxMemory, maxDisk)
         self.partitions = SlurmBatchSystem.PartitionSet()
@@ -830,6 +959,20 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             env_var="TOIL_SLURM_TIME",
             help="Slurm job time limit, in [DD-]HH:MM:SS format.",
         )
+        parser.add_argument(
+            "--slurmPartition",
+            dest="slurm_partition",
+            default=None,
+            env_var="TOIL_SLURM_PARTITION",
+            help="Partition to send Slurm jobs to.",
+        )
+        parser.add_argument(
+            "--slurmGPUPartition",
+            dest="slurm_gpu_partition",
+            default=None,
+            env_var="TOIL_SLURM_GPU_PARTITION",
+            help="Partition to send Slurm jobs to if they ask for GPUs.",
+        )
         parser.add_argument(
             "--slurmPE",
             dest="slurm_pe",
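Both new options declare env_var fallbacks, so they can be preset outside the command line; an illustrative snippet for doing that from Python before Toil parses its options (the partition names are placeholders, not Toil defaults):

    import os
    os.environ["TOIL_SLURM_PARTITION"] = "medium"    # default partition for Slurm jobs
    os.environ["TOIL_SLURM_GPU_PARTITION"] = "gpu"   # partition for jobs that request GPUs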
@@ -852,5 +995,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         setOption("slurm_allocate_mem")
         setOption("slurm_default_all_mem")
         setOption("slurm_time")
+        setOption("slurm_partition")
+        setOption("slurm_gpu_partition")
         setOption("slurm_pe")
         setOption("slurm_args")
toil/bus.py
CHANGED
@@ -69,13 +69,15 @@ import tempfile
 import threading
 from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import IO, Any, Callable, NamedTuple, Optional, TypeVar, cast
+from typing import IO, Any, Callable, NamedTuple, Optional, TypeVar, TYPE_CHECKING, cast
 
 from pubsub.core import Publisher
 from pubsub.core.listener import Listener
 from pubsub.core.topicobj import Topic
 from pubsub.core.topicutils import ALL_TOPICS
 
+from toil.lib.misc import FileDescriptorOrPath
+
 logger = logging.getLogger(__name__)
 
 # We define some ways to talk about jobs.
@@ -434,7 +436,7 @@ class MessageBus:
         connection._set_bus(self)
         return connection
 
-    def connect_output_file(self, file_path:
+    def connect_output_file(self, file_path: FileDescriptorOrPath) -> Any:
         """
         Send copies of all messages to the given output file.
 
@@ -736,7 +738,7 @@ class JobStatus:
|
|
|
736
738
|
) # if the exit code is -1 and the job id is specified, we assume the job is running
|
|
737
739
|
|
|
738
740
|
|
|
739
|
-
def replay_message_bus(path:
|
|
741
|
+
def replay_message_bus(path: FileDescriptorOrPath) -> dict[str, JobStatus]:
|
|
740
742
|
"""
|
|
741
743
|
Replay all the messages and work out what they mean for jobs.
|
|
742
744
|
|