toil 8.0.0__py3-none-any.whl → 8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. toil/__init__.py +4 -39
  2. toil/batchSystems/abstractBatchSystem.py +1 -1
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +1 -1
  4. toil/batchSystems/awsBatch.py +1 -1
  5. toil/batchSystems/cleanup_support.py +1 -1
  6. toil/batchSystems/kubernetes.py +53 -7
  7. toil/batchSystems/local_support.py +1 -1
  8. toil/batchSystems/mesos/batchSystem.py +13 -8
  9. toil/batchSystems/mesos/test/__init__.py +3 -2
  10. toil/batchSystems/options.py +1 -0
  11. toil/batchSystems/singleMachine.py +1 -1
  12. toil/batchSystems/slurm.py +229 -84
  13. toil/bus.py +5 -3
  14. toil/common.py +198 -54
  15. toil/cwl/cwltoil.py +32 -11
  16. toil/job.py +110 -86
  17. toil/jobStores/abstractJobStore.py +24 -3
  18. toil/jobStores/aws/jobStore.py +46 -10
  19. toil/jobStores/fileJobStore.py +25 -1
  20. toil/jobStores/googleJobStore.py +104 -30
  21. toil/leader.py +9 -0
  22. toil/lib/accelerators.py +3 -1
  23. toil/lib/aws/session.py +14 -3
  24. toil/lib/aws/utils.py +92 -35
  25. toil/lib/aws/utils.py.orig +504 -0
  26. toil/lib/bioio.py +1 -1
  27. toil/lib/docker.py +252 -91
  28. toil/lib/dockstore.py +387 -0
  29. toil/lib/ec2nodes.py +3 -2
  30. toil/lib/exceptions.py +5 -3
  31. toil/lib/history.py +1345 -0
  32. toil/lib/history_submission.py +695 -0
  33. toil/lib/io.py +56 -23
  34. toil/lib/misc.py +25 -1
  35. toil/lib/resources.py +2 -1
  36. toil/lib/retry.py +10 -10
  37. toil/lib/threading.py +11 -10
  38. toil/lib/{integration.py → trs.py} +95 -46
  39. toil/lib/web.py +38 -0
  40. toil/options/common.py +25 -2
  41. toil/options/cwl.py +10 -0
  42. toil/options/wdl.py +11 -0
  43. toil/provisioners/gceProvisioner.py +4 -4
  44. toil/server/api_spec/LICENSE +201 -0
  45. toil/server/api_spec/README.rst +5 -0
  46. toil/server/cli/wes_cwl_runner.py +5 -4
  47. toil/server/utils.py +2 -3
  48. toil/statsAndLogging.py +35 -1
  49. toil/test/__init__.py +275 -115
  50. toil/test/batchSystems/batchSystemTest.py +227 -205
  51. toil/test/batchSystems/test_slurm.py +199 -2
  52. toil/test/cactus/pestis.tar.gz +0 -0
  53. toil/test/conftest.py +7 -0
  54. toil/test/cwl/2.fasta +11 -0
  55. toil/test/cwl/2.fastq +12 -0
  56. toil/test/cwl/conftest.py +39 -0
  57. toil/test/cwl/cwlTest.py +1015 -780
  58. toil/test/cwl/directory/directory/file.txt +15 -0
  59. toil/test/cwl/download_directory_file.json +4 -0
  60. toil/test/cwl/download_directory_s3.json +4 -0
  61. toil/test/cwl/download_file.json +6 -0
  62. toil/test/cwl/download_http.json +6 -0
  63. toil/test/cwl/download_https.json +6 -0
  64. toil/test/cwl/download_s3.json +6 -0
  65. toil/test/cwl/download_subdirectory_file.json +5 -0
  66. toil/test/cwl/download_subdirectory_s3.json +5 -0
  67. toil/test/cwl/empty.json +1 -0
  68. toil/test/cwl/mock_mpi/fake_mpi.yml +8 -0
  69. toil/test/cwl/mock_mpi/fake_mpi_run.py +42 -0
  70. toil/test/cwl/optional-file-exists.json +6 -0
  71. toil/test/cwl/optional-file-missing.json +6 -0
  72. toil/test/cwl/optional-file.cwl +18 -0
  73. toil/test/cwl/preemptible_expression.json +1 -0
  74. toil/test/cwl/revsort-job-missing.json +6 -0
  75. toil/test/cwl/revsort-job.json +6 -0
  76. toil/test/cwl/s3_secondary_file.json +16 -0
  77. toil/test/cwl/seqtk_seq_job.json +6 -0
  78. toil/test/cwl/stream.json +6 -0
  79. toil/test/cwl/test_filename_conflict_resolution.ms/table.dat +0 -0
  80. toil/test/cwl/test_filename_conflict_resolution.ms/table.f0 +0 -0
  81. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1 +0 -0
  82. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1i +0 -0
  83. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2 +0 -0
  84. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2_TSM0 +0 -0
  85. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3 +0 -0
  86. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3_TSM0 +0 -0
  87. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4 +0 -0
  88. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4_TSM0 +0 -0
  89. toil/test/cwl/test_filename_conflict_resolution.ms/table.f5 +0 -0
  90. toil/test/cwl/test_filename_conflict_resolution.ms/table.info +0 -0
  91. toil/test/cwl/test_filename_conflict_resolution.ms/table.lock +0 -0
  92. toil/test/cwl/whale.txt +16 -0
  93. toil/test/docs/scripts/example_alwaysfail.py +38 -0
  94. toil/test/docs/scripts/example_alwaysfail_with_files.wdl +33 -0
  95. toil/test/docs/scripts/example_cachingbenchmark.py +117 -0
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +1 -0
  97. toil/test/docs/scripts/stagingExampleFiles/out.txt +2 -0
  98. toil/test/docs/scripts/tutorial_arguments.py +23 -0
  99. toil/test/docs/scripts/tutorial_debugging.patch +12 -0
  100. toil/test/docs/scripts/tutorial_debugging_hangs.wdl +126 -0
  101. toil/test/docs/scripts/tutorial_debugging_works.wdl +129 -0
  102. toil/test/docs/scripts/tutorial_docker.py +20 -0
  103. toil/test/docs/scripts/tutorial_dynamic.py +24 -0
  104. toil/test/docs/scripts/tutorial_encapsulation.py +28 -0
  105. toil/test/docs/scripts/tutorial_encapsulation2.py +29 -0
  106. toil/test/docs/scripts/tutorial_helloworld.py +15 -0
  107. toil/test/docs/scripts/tutorial_invokeworkflow.py +27 -0
  108. toil/test/docs/scripts/tutorial_invokeworkflow2.py +30 -0
  109. toil/test/docs/scripts/tutorial_jobfunctions.py +22 -0
  110. toil/test/docs/scripts/tutorial_managing.py +29 -0
  111. toil/test/docs/scripts/tutorial_managing2.py +56 -0
  112. toil/test/docs/scripts/tutorial_multiplejobs.py +25 -0
  113. toil/test/docs/scripts/tutorial_multiplejobs2.py +21 -0
  114. toil/test/docs/scripts/tutorial_multiplejobs3.py +22 -0
  115. toil/test/docs/scripts/tutorial_promises.py +25 -0
  116. toil/test/docs/scripts/tutorial_promises2.py +30 -0
  117. toil/test/docs/scripts/tutorial_quickstart.py +22 -0
  118. toil/test/docs/scripts/tutorial_requirements.py +44 -0
  119. toil/test/docs/scripts/tutorial_services.py +45 -0
  120. toil/test/docs/scripts/tutorial_staging.py +45 -0
  121. toil/test/docs/scripts/tutorial_stats.py +64 -0
  122. toil/test/lib/aws/test_iam.py +3 -1
  123. toil/test/lib/dockerTest.py +205 -122
  124. toil/test/lib/test_history.py +236 -0
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +12 -9
  127. toil/test/provisioners/clusterTest.py +4 -4
  128. toil/test/provisioners/gceProvisionerTest.py +16 -14
  129. toil/test/sort/sort.py +4 -1
  130. toil/test/src/busTest.py +17 -17
  131. toil/test/src/deferredFunctionTest.py +145 -132
  132. toil/test/src/importExportFileTest.py +71 -63
  133. toil/test/src/jobEncapsulationTest.py +27 -28
  134. toil/test/src/jobServiceTest.py +149 -133
  135. toil/test/src/jobTest.py +219 -211
  136. toil/test/src/miscTests.py +66 -60
  137. toil/test/src/promisedRequirementTest.py +163 -169
  138. toil/test/src/regularLogTest.py +24 -24
  139. toil/test/src/resourceTest.py +82 -76
  140. toil/test/src/restartDAGTest.py +51 -47
  141. toil/test/src/resumabilityTest.py +24 -19
  142. toil/test/src/retainTempDirTest.py +60 -57
  143. toil/test/src/systemTest.py +17 -13
  144. toil/test/src/threadingTest.py +29 -32
  145. toil/test/utils/ABCWorkflowDebug/B_file.txt +1 -0
  146. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +204 -0
  147. toil/test/utils/ABCWorkflowDebug/mkFile.py +16 -0
  148. toil/test/utils/ABCWorkflowDebug/sleep.cwl +12 -0
  149. toil/test/utils/ABCWorkflowDebug/sleep.yaml +1 -0
  150. toil/test/utils/toilDebugTest.py +117 -102
  151. toil/test/utils/toilKillTest.py +54 -53
  152. toil/test/utils/utilsTest.py +303 -229
  153. toil/test/wdl/lint_error.wdl +9 -0
  154. toil/test/wdl/md5sum/empty_file.json +1 -0
  155. toil/test/wdl/md5sum/md5sum-gs.json +1 -0
  156. toil/test/wdl/md5sum/md5sum.1.0.wdl +32 -0
  157. toil/test/wdl/md5sum/md5sum.input +1 -0
  158. toil/test/wdl/md5sum/md5sum.json +1 -0
  159. toil/test/wdl/md5sum/md5sum.wdl +25 -0
  160. toil/test/wdl/miniwdl_self_test/inputs-namespaced.json +1 -0
  161. toil/test/wdl/miniwdl_self_test/inputs.json +1 -0
  162. toil/test/wdl/miniwdl_self_test/self_test.wdl +40 -0
  163. toil/test/wdl/standard_library/as_map.json +16 -0
  164. toil/test/wdl/standard_library/as_map_as_input.wdl +23 -0
  165. toil/test/wdl/standard_library/as_pairs.json +7 -0
  166. toil/test/wdl/standard_library/as_pairs_as_input.wdl +23 -0
  167. toil/test/wdl/standard_library/ceil.json +3 -0
  168. toil/test/wdl/standard_library/ceil_as_command.wdl +16 -0
  169. toil/test/wdl/standard_library/ceil_as_input.wdl +16 -0
  170. toil/test/wdl/standard_library/collect_by_key.json +1 -0
  171. toil/test/wdl/standard_library/collect_by_key_as_input.wdl +23 -0
  172. toil/test/wdl/standard_library/cross.json +11 -0
  173. toil/test/wdl/standard_library/cross_as_input.wdl +19 -0
  174. toil/test/wdl/standard_library/flatten.json +7 -0
  175. toil/test/wdl/standard_library/flatten_as_input.wdl +18 -0
  176. toil/test/wdl/standard_library/floor.json +3 -0
  177. toil/test/wdl/standard_library/floor_as_command.wdl +16 -0
  178. toil/test/wdl/standard_library/floor_as_input.wdl +16 -0
  179. toil/test/wdl/standard_library/keys.json +8 -0
  180. toil/test/wdl/standard_library/keys_as_input.wdl +24 -0
  181. toil/test/wdl/standard_library/length.json +7 -0
  182. toil/test/wdl/standard_library/length_as_input.wdl +16 -0
  183. toil/test/wdl/standard_library/length_as_input_with_map.json +7 -0
  184. toil/test/wdl/standard_library/length_as_input_with_map.wdl +17 -0
  185. toil/test/wdl/standard_library/length_invalid.json +3 -0
  186. toil/test/wdl/standard_library/range.json +3 -0
  187. toil/test/wdl/standard_library/range_0.json +3 -0
  188. toil/test/wdl/standard_library/range_as_input.wdl +17 -0
  189. toil/test/wdl/standard_library/range_invalid.json +3 -0
  190. toil/test/wdl/standard_library/read_boolean.json +3 -0
  191. toil/test/wdl/standard_library/read_boolean_as_command.wdl +17 -0
  192. toil/test/wdl/standard_library/read_float.json +3 -0
  193. toil/test/wdl/standard_library/read_float_as_command.wdl +17 -0
  194. toil/test/wdl/standard_library/read_int.json +3 -0
  195. toil/test/wdl/standard_library/read_int_as_command.wdl +17 -0
  196. toil/test/wdl/standard_library/read_json.json +3 -0
  197. toil/test/wdl/standard_library/read_json_as_output.wdl +31 -0
  198. toil/test/wdl/standard_library/read_lines.json +3 -0
  199. toil/test/wdl/standard_library/read_lines_as_output.wdl +31 -0
  200. toil/test/wdl/standard_library/read_map.json +3 -0
  201. toil/test/wdl/standard_library/read_map_as_output.wdl +31 -0
  202. toil/test/wdl/standard_library/read_string.json +3 -0
  203. toil/test/wdl/standard_library/read_string_as_command.wdl +17 -0
  204. toil/test/wdl/standard_library/read_tsv.json +3 -0
  205. toil/test/wdl/standard_library/read_tsv_as_output.wdl +31 -0
  206. toil/test/wdl/standard_library/round.json +3 -0
  207. toil/test/wdl/standard_library/round_as_command.wdl +16 -0
  208. toil/test/wdl/standard_library/round_as_input.wdl +16 -0
  209. toil/test/wdl/standard_library/size.json +3 -0
  210. toil/test/wdl/standard_library/size_as_command.wdl +17 -0
  211. toil/test/wdl/standard_library/size_as_output.wdl +36 -0
  212. toil/test/wdl/standard_library/stderr.json +3 -0
  213. toil/test/wdl/standard_library/stderr_as_output.wdl +30 -0
  214. toil/test/wdl/standard_library/stdout.json +3 -0
  215. toil/test/wdl/standard_library/stdout_as_output.wdl +30 -0
  216. toil/test/wdl/standard_library/sub.json +3 -0
  217. toil/test/wdl/standard_library/sub_as_input.wdl +17 -0
  218. toil/test/wdl/standard_library/sub_as_input_with_file.wdl +17 -0
  219. toil/test/wdl/standard_library/transpose.json +6 -0
  220. toil/test/wdl/standard_library/transpose_as_input.wdl +18 -0
  221. toil/test/wdl/standard_library/write_json.json +6 -0
  222. toil/test/wdl/standard_library/write_json_as_command.wdl +17 -0
  223. toil/test/wdl/standard_library/write_lines.json +7 -0
  224. toil/test/wdl/standard_library/write_lines_as_command.wdl +17 -0
  225. toil/test/wdl/standard_library/write_map.json +6 -0
  226. toil/test/wdl/standard_library/write_map_as_command.wdl +17 -0
  227. toil/test/wdl/standard_library/write_tsv.json +6 -0
  228. toil/test/wdl/standard_library/write_tsv_as_command.wdl +17 -0
  229. toil/test/wdl/standard_library/zip.json +12 -0
  230. toil/test/wdl/standard_library/zip_as_input.wdl +19 -0
  231. toil/test/wdl/test.csv +3 -0
  232. toil/test/wdl/test.tsv +3 -0
  233. toil/test/wdl/testfiles/croo.wdl +38 -0
  234. toil/test/wdl/testfiles/drop_files.wdl +62 -0
  235. toil/test/wdl/testfiles/drop_files_subworkflow.wdl +13 -0
  236. toil/test/wdl/testfiles/empty.txt +0 -0
  237. toil/test/wdl/testfiles/not_enough_outputs.wdl +33 -0
  238. toil/test/wdl/testfiles/random.wdl +66 -0
  239. toil/test/wdl/testfiles/string_file_coercion.json +1 -0
  240. toil/test/wdl/testfiles/string_file_coercion.wdl +35 -0
  241. toil/test/wdl/testfiles/test.json +4 -0
  242. toil/test/wdl/testfiles/test_boolean.txt +1 -0
  243. toil/test/wdl/testfiles/test_float.txt +1 -0
  244. toil/test/wdl/testfiles/test_int.txt +1 -0
  245. toil/test/wdl/testfiles/test_lines.txt +5 -0
  246. toil/test/wdl/testfiles/test_map.txt +2 -0
  247. toil/test/wdl/testfiles/test_string.txt +1 -0
  248. toil/test/wdl/testfiles/url_to_file.wdl +13 -0
  249. toil/test/wdl/testfiles/url_to_optional_file.wdl +13 -0
  250. toil/test/wdl/testfiles/vocab.json +1 -0
  251. toil/test/wdl/testfiles/vocab.wdl +66 -0
  252. toil/test/wdl/testfiles/wait.wdl +34 -0
  253. toil/test/wdl/wdl_specification/type_pair.json +23 -0
  254. toil/test/wdl/wdl_specification/type_pair_basic.wdl +36 -0
  255. toil/test/wdl/wdl_specification/type_pair_with_files.wdl +36 -0
  256. toil/test/wdl/wdl_specification/v1_spec.json +1 -0
  257. toil/test/wdl/wdl_specification/v1_spec_declaration.wdl +39 -0
  258. toil/test/wdl/wdltoil_test.py +681 -408
  259. toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
  260. toil/version.py +10 -10
  261. toil/wdl/wdltoil.py +350 -123
  262. toil/worker.py +113 -33
  263. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/METADATA +13 -7
  264. toil-8.2.0.dist-info/RECORD +439 -0
  265. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/WHEEL +1 -1
  266. toil/test/lib/test_integration.py +0 -104
  267. toil-8.0.0.dist-info/RECORD +0 -253
  268. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/entry_points.txt +0 -0
  269. {toil-8.0.0.dist-info → toil-8.2.0.dist-info/licenses}/LICENSE +0 -0
  270. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/slurm.py CHANGED
@@ -13,13 +13,14 @@
 # limitations under the License.
 from __future__ import annotations
 
+import errno
 import logging
 import math
 import os
 import sys
 from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
-from shlex import quote
-from typing import NamedTuple, TypeVar
+import shlex
+from typing import Callable, NamedTuple, TypeVar
 
 from toil.batchSystems.abstractBatchSystem import (
     EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -100,6 +101,32 @@ def parse_slurm_time(slurm_time: str) -> int:
         total_seconds += multiplier * int(elapsed_split[index])
     return total_seconds
 
+# For parsing user-provided option overrides (or self-generated
+# options) for sbatch, we need a way to recognize long, long-with-equals, and
+# short forms.
+def option_detector(long: str, short: str | None = None) -> Callable[[str], bool]:
+    """
+    Get a function that returns true if it sees the long or short
+    option.
+    """
+    def is_match(option: str) -> bool:
+        return option == f"--{long}" or option.startswith(f"--{long}=") or (short is not None and option == f"-{short}")
+    return is_match
+
+def any_option_detector(options: list[str | tuple[str, str]]) -> Callable[[str], bool]:
+    """
+    Get a function that returns true if it sees any of the long
+    options or long or short option pairs.
+    """
+    detectors = [option_detector(o) if isinstance(o, str) else option_detector(*o) for o in options]
+    def is_match(option: str) -> bool:
+        for detector in detectors:
+            if detector(option):
+                return True
+        return False
+    return is_match
+
+
 
 class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     class PartitionInfo(NamedTuple):
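As a quick illustration (not part of the diff), here is how the new detectors behave; the import assumes toil 8.2.0 is installed, and the expected values follow from the matching rule shown above:

    # Illustrative sketch only; the helpers are module-level in toil/batchSystems/slurm.py.
    from toil.batchSystems.slurm import any_option_detector, option_detector

    is_time_option = option_detector("time", "t")
    print(is_time_option("--time"))           # True: long form
    print(is_time_option("--time=01:00:00"))  # True: long form with '='
    print(is_time_option("-t"))               # True: short form
    print(is_time_option("--timeout"))        # False: a different flag, no '=' after the long name

    is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
    print(is_any_mem_option("--mem-per-cpu=2G"))  # True: matches one of the long forms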
@@ -185,6 +212,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     def get_partition(self, time_limit: float | None) -> str | None:
         """
         Get the partition name to use for a job with the given time limit.
+
+        :param time_limit: Time limit in seconds.
         """
 
         if time_limit is None:
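For context on the seconds in that docstring: the partially shown parse_slurm_time() above converts Slurm [DD-]HH:MM:SS strings into the seconds that get_partition() receives. A hedged sketch (the function body is not fully visible in this diff, so treat the expected values as a sanity check, not a specification):

    from toil.batchSystems.slurm import parse_slurm_time

    print(parse_slurm_time("02:03:04"))    # 2*3600 + 3*60 + 4 = 7384 seconds
    print(parse_slurm_time("1-00:00:00"))  # one day = 86400 seconds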
@@ -193,17 +222,36 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
         winning_partition = None
         for partition in self.all_partitions:
-            if partition.time_limit >= time_limit and (
-                winning_partition is None
-                or partition.time_limit < winning_partition.time_limit
-            ):
-                # If this partition can fit the job and is faster than the current winner, take it
+            if partition.time_limit < time_limit:
+                # Can't use this
+                continue
+            if winning_partition is None:
+                # Anything beats None
+                winning_partition = partition
+                continue
+            if partition.gres and not winning_partition.gres:
+                # Never use a partition witn GRES if you can avoid it
+                continue
+            elif not partition.gres and winning_partition.gres:
+                # Never keep a partition with GRES if we find one without
+                winning_partition = partition
+                continue
+            if partition.priority > winning_partition.priority:
+                # After that, don't raise priority
+                continue
+            elif partition.priority < winning_partition.priority:
+                # And always lower it
+                winning_partition = partition
+                continue
+            if partition.time_limit < winning_partition.time_limit:
+                # Finally, lower time limit
                 winning_partition = partition
+
         # TODO: Store partitions in a better indexed way
         if winning_partition is None and len(self.all_partitions) > 0:
             # We have partitions and none of them can fit this
             raise RuntimeError(
-                "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                f"Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
             )
 
         if winning_partition is None:
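The new selection order reads most easily with concrete data. Below is a self-contained sketch (made-up partition names and numbers, and a stand-in tuple rather than the real PartitionInfo) that applies the same preferences: fit the time limit, avoid GRES, prefer lower priority, then the smallest adequate time limit:

    from typing import NamedTuple

    class FakePartition(NamedTuple):  # stand-in for SlurmBatchSystem.PartitionInfo
        partition_name: str
        gres: bool
        priority: int
        time_limit: int  # seconds

    partitions = [
        FakePartition("gpu", gres=True, priority=10, time_limit=864000),
        FakePartition("long", gres=False, priority=10, time_limit=864000),
        FakePartition("short", gres=False, priority=10, time_limit=3600),
    ]

    def pick(time_limit: int) -> str:
        winner = None
        for p in partitions:
            if p.time_limit < time_limit:
                continue  # cannot fit the job at all
            if winner is None:
                winner = p
                continue
            if p.gres and not winner.gres:
                continue  # keep the GRES-free winner
            if not p.gres and winner.gres:
                winner = p
                continue
            if p.priority > winner.priority:
                continue  # never raise priority
            if p.priority < winner.priority:
                winner = p
                continue
            if p.time_limit < winner.time_limit:
                winner = p  # same priority: take the tighter time limit
        assert winner is not None
        return winner.partition_name

    print(pick(1800))  # "short": fits, no GRES, smallest adequate time limit
    print(pick(7200))  # "long": "short" is too small and "gpu" would need GRES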
@@ -344,7 +392,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         """
         try:
             status_dict = self._getJobDetailsFromSacct(job_id_list)
-        except CalledProcessErrorStderr:
+        except (CalledProcessErrorStderr, OSError) as e:
+            if isinstance(e, OSError):
+                logger.warning("Could not run sacct: %s", e)
             status_dict = self._getJobDetailsFromScontrol(job_id_list)
         return status_dict
 
@@ -437,11 +487,25 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             "-S",
             "1970-01-01",
         ] # override start time limit
-        stdout = call_command(args, quiet=True)
 
         # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
         # job state and exit status. Initialize dict before processing output of `sacct`.
         job_statuses: dict[int, tuple[str | None, int | None]] = {}
+
+        try:
+            stdout = call_command(args, quiet=True)
+        except OSError as e:
+            if e.errno == errno.E2BIG:
+                # Argument list is too big, recurse on half the argument list
+                if len(job_id_list) == 1:
+                    # 1 is too big, we can't recurse further, bail out
+                    raise
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
+                return job_statuses
+            else:
+                raise
+
         for job_id in job_id_list:
             job_statuses[job_id] = (None, None)
 
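The halving strategy can be exercised in isolation. In this sketch, run_sacct and MAX_IDS_PER_CALL are made-up stand-ins for call_command and the real kernel argument-size limit:

    import errno

    MAX_IDS_PER_CALL = 4  # pretend the OS only tolerates four job ids per call

    def run_sacct(job_ids: list[int]) -> dict[int, str]:
        """Stand-in for call_command(['sacct', '-j', ...]); fails the way the OS would."""
        if len(job_ids) > MAX_IDS_PER_CALL:
            raise OSError(errno.E2BIG, "Argument list too long")
        return {job_id: "COMPLETED" for job_id in job_ids}

    def fetch_statuses(job_ids: list[int]) -> dict[int, str]:
        try:
            return run_sacct(job_ids)
        except OSError as e:
            if e.errno != errno.E2BIG or len(job_ids) == 1:
                raise  # not the argument-length problem, or nothing left to split
            mid = len(job_ids) // 2
            statuses = fetch_statuses(job_ids[:mid])
            statuses.update(fetch_statuses(job_ids[mid:]))
            return statuses

    print(fetch_statuses(list(range(10))))  # ten ids resolved via recursive halving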
@@ -609,104 +673,169 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
         nativeConfig: str = self.boss.config.slurm_args # type: ignore[attr-defined]
 
+        is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
+        is_any_cpus_option = any_option_detector([("cpus-per-task", "c"), "cpus-per-gpu"])
+        is_export_option = option_detector("export")
+        is_export_file_option = option_detector("export-file")
+        is_time_option = option_detector("time", "t")
+        is_partition_option = option_detector("partition", "p")
+
+        # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or
+        # with our own determinations if they aren't there.
+
         # --export=[ALL,]<environment_toil_variables>
-        set_exports = "--export=ALL"
+        export_all = True
+        export_list = [] # Some items here may be multiple comma-separated values
+        time_limit: int | None = self.boss.config.slurm_time # type: ignore[attr-defined]
+        partition: str | None = None
 
         if nativeConfig is not None:
             logger.debug(
                 "Native SLURM options appended to sbatch: %s", nativeConfig
             )
 
-            for arg in nativeConfig.split():
-                if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
+            # Do a mini argument parse to pull out export and parse time if
+            # needed
+            args = shlex.split(nativeConfig)
+            i = 0
+            while i < len(args):
+                arg = args[i]
+                if is_any_mem_option(arg) or is_any_cpus_option(arg):
+                    # Prohibit arguments that set CPUs or memory
                     raise ValueError(
-                        f"Some resource arguments are incompatible: {nativeConfig}"
+                        f"Cannot use Slurm argument {arg} which conflicts "
+                        f"with Toil's own arguments to Slurm"
                     )
-                # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
-                if arg.startswith("--export"):
-                    set_exports = arg
-            sbatch_line.extend(nativeConfig.split())
+                elif is_export_option(arg):
+                    # Capture the export argument value so we can modify it
+                    export_all = False
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        export_list.append(args[i])
+                    else:
+                        export_list.append(arg.split("=", 1)[1])
+                elif is_export_file_option(arg):
+                    # Keep --export-file but turn off --export=ALL in that
+                    # case.
+                    export_all = False
+                    sbatch_line.append(arg)
+                elif is_time_option(arg):
+                    # Capture the time limit in seconds so we can use it for picking a partition
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        time_string = args[i]
+                    else:
+                        time_string = arg.split("=", 1)[1]
+                    time_limit = parse_slurm_time(time_string)
+                elif is_partition_option(arg):
+                    # Capture the partition so we can run checks on it and know not to assign one
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        partition = args[i]
+                    else:
+                        partition = arg.split("=", 1)[1]
+                else:
+                    # Other arguments pass through.
+                    sbatch_line.append(arg)
+                i += 1
+
+        if export_all:
+            # We don't have any export overrides so we ened to start with
+            # an ALL
+            export_list.append("ALL")
 
         if environment:
             argList = []
 
             for k, v in environment.items():
-                quoted_value = quote(os.environ[k] if v is None else v)
+                # TODO: The sbatch man page doesn't say we can quote these;
+                # if we need to send characters like , itself we need to
+                # use --export-file and clean it up when the command has
+                # been issued.
+                quoted_value = shlex.quote(os.environ[k] if v is None else v)
                 argList.append(f"{k}={quoted_value}")
 
-            set_exports += "," + ",".join(argList)
-
-        # add --export to the sbatch
-        sbatch_line.append(set_exports)
-
-        parallel_env: str = self.boss.config.slurm_pe # type: ignore[attr-defined]
-        if cpu and cpu > 1 and parallel_env:
-            sbatch_line.append(f"--partition={parallel_env}")
-
+            export_list.extend(argList)
+
+        # If partition isn't set and we have a GPU partition override
+        # that applies, apply it
+        gpu_partition_override: str | None = self.boss.config.slurm_gpu_partition # type: ignore[attr-defined]
+        if partition is None and gpus and gpu_partition_override:
+            partition = gpu_partition_override
+
+        # If partition isn't set and we have a parallel partition override
+        # that applies, apply it
+        parallel_env: str | None = self.boss.config.slurm_pe # type: ignore[attr-defined]
+        if partition is None and cpu and cpu > 1 and parallel_env:
+            partition = parallel_env
+
+        # If partition isn't set and we have a general partition override
+        # that applies, apply it
+        partition_override: str | None = self.boss.config.slurm_partition # type: ignore[attr-defined]
+        if partition is None and partition_override:
+            partition = partition_override
+
+        if partition is None and gpus:
+            # Send to a GPU partition
+            gpu_partition = self.boss.partitions.default_gpu_partition
+            if gpu_partition is None:
+                # no gpu partitions are available, raise an error
+                raise RuntimeError(
+                    f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                )
+            if (
+                time_limit is not None
+                and gpu_partition.time_limit < time_limit
+            ):
+                # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                logger.warning(
+                    "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                    time_limit,
+                    gpu_partition.partition_name,
+                    gpu_partition.time_limit,
+                )
+            partition = gpu_partition.partition_name
+
+        if partition is None:
+            # Pick a partition based on time limit
+            partition = self.boss.partitions.get_partition(time_limit)
+
+        # Now generate all the arguments
+        if len(export_list) > 0:
+            # add --export to the sbatch
+            sbatch_line.append("--export=" + ",".join(export_list))
+        if partition is not None:
+            sbatch_line.append(f"--partition={partition}")
+        if gpus:
+            # Generate GPU assignment argument
+            sbatch_line.append(f"--gres=gpu:{gpus}")
+            if partition is not None and partition not in self.boss.partitions.gpu_partitions:
+                # the specified partition is not compatible, so warn the user that the job may not work
+                logger.warning(
+                    f"Job {jobName} needs GPUs, but specified partition {partition} does not have them. This job may not work."
+                    f"Try specifying one of these partitions instead: {', '.join(self.boss.partitions.gpu_partitions)}."
+                )
         if mem is not None and self.boss.config.slurm_allocate_mem: # type: ignore[attr-defined]
             # memory passed in is in bytes, but slurm expects megabytes
             sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
         if cpu is not None:
             sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
-
-        time_limit: int = self.boss.config.slurm_time # type: ignore[attr-defined]
         if time_limit is not None:
             # Put all the seconds in the seconds slot
             sbatch_line.append(f"--time=0:{time_limit}")
 
-        if gpus:
-            # This block will add a gpu supported partition only if no partition is supplied by the user
-            sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
-            if not any(option.startswith("--partition") for option in sbatch_line):
-                # no partition specified, so specify one
-                # try to get the name of the lowest priority gpu supported partition
-                lowest_gpu_partition = self.boss.partitions.default_gpu_partition
-                if lowest_gpu_partition is None:
-                    # no gpu partitions are available, raise an error
-                    raise RuntimeError(
-                        f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
-                    )
-                if (
-                    time_limit is not None
-                    and lowest_gpu_partition.time_limit < time_limit
-                ):
-                    # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
-                    logger.warning(
-                        "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
-                        time_limit,
-                        lowest_gpu_partition.partition_name,
-                        lowest_gpu_partition.time_limit,
-                    )
-                sbatch_line.append(
-                    f"--partition={lowest_gpu_partition.partition_name}"
-                )
-            else:
-                # there is a partition specified already, check if the partition has GPUs
-                for i, option in enumerate(sbatch_line):
-                    if option.startswith("--partition"):
-                        # grab the partition name depending on if it's specified via an "=" or a space
-                        if "=" in option:
-                            partition_name = option[len("--partition=") :]
-                        else:
-                            partition_name = option[i + 1]
-                        available_gpu_partitions = (
-                            self.boss.partitions.gpu_partitions
-                        )
-                        if partition_name not in available_gpu_partitions:
-                            # the specified partition is not compatible, so warn the user that the job may not work
-                            logger.warning(
-                                f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
-                                f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
-                            )
-                        break
-
-        if not any(option.startswith("--partition") for option in sbatch_line):
-            # Pick a partition ourselves
-            chosen_partition = self.boss.partitions.get_partition(time_limit)
-            if chosen_partition is not None:
-                # Route to that partition
-                sbatch_line.append(f"--partition={chosen_partition}")
-
         stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
         stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
         sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
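To make the new TOIL_SLURM_ARGS handling concrete, a small sketch with made-up values: --export and --time are captured and merged with Toil's own settings rather than passed straight through, while unrelated flags such as --qos still land on the sbatch command line unchanged:

    import shlex

    native_config = "--export=PATH,LD_LIBRARY_PATH --time=02:00:00 --qos=normal"
    print(shlex.split(native_config))
    # ['--export=PATH,LD_LIBRARY_PATH', '--time=02:00:00', '--qos=normal']

    # The captured --export value is later recombined with per-job variables:
    export_list = ["PATH,LD_LIBRARY_PATH"]       # taken from TOIL_SLURM_ARGS
    export_list.append("TOIL_WORKDIR=/scratch")  # illustrative job environment entry
    print("--export=" + ",".join(export_list))
    # --export=PATH,LD_LIBRARY_PATH,TOIL_WORKDIR=/scratch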
@@ -714,7 +843,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         return sbatch_line
 
     def __init__(
-        self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+        self, config: Config, maxCores: float, maxMemory: float, maxDisk: float
     ) -> None:
         super().__init__(config, maxCores, maxMemory, maxDisk)
         self.partitions = SlurmBatchSystem.PartitionSet()
@@ -830,6 +959,20 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             env_var="TOIL_SLURM_TIME",
             help="Slurm job time limit, in [DD-]HH:MM:SS format.",
         )
+        parser.add_argument(
+            "--slurmPartition",
+            dest="slurm_partition",
+            default=None,
+            env_var="TOIL_SLURM_PARTITION",
+            help="Partition to send Slurm jobs to.",
+        )
+        parser.add_argument(
+            "--slurmGPUPartition",
+            dest="slurm_gpu_partition",
+            default=None,
+            env_var="TOIL_SLURM_GPU_PARTITION",
+            help="Partition to send Slurm jobs to if they ask for GPUs.",
+        )
         parser.add_argument(
             "--slurmPE",
             dest="slurm_pe",
@@ -852,5 +995,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         setOption("slurm_allocate_mem")
         setOption("slurm_default_all_mem")
         setOption("slurm_time")
+        setOption("slurm_partition")
+        setOption("slurm_gpu_partition")
         setOption("slurm_pe")
         setOption("slurm_args")
toil/bus.py CHANGED
@@ -69,13 +69,15 @@ import tempfile
 import threading
 from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import IO, Any, Callable, NamedTuple, Optional, TypeVar, cast
+from typing import IO, Any, Callable, NamedTuple, Optional, TypeVar, TYPE_CHECKING, cast
 
 from pubsub.core import Publisher
 from pubsub.core.listener import Listener
 from pubsub.core.topicobj import Topic
 from pubsub.core.topicutils import ALL_TOPICS
 
+from toil.lib.misc import FileDescriptorOrPath
+
 logger = logging.getLogger(__name__)
 
 # We define some ways to talk about jobs.
@@ -434,7 +436,7 @@ class MessageBus:
         connection._set_bus(self)
         return connection
 
-    def connect_output_file(self, file_path: str) -> Any:
+    def connect_output_file(self, file_path: FileDescriptorOrPath) -> Any:
         """
         Send copies of all messages to the given output file.
 
@@ -736,7 +738,7 @@ class JobStatus:
         ) # if the exit code is -1 and the job id is specified, we assume the job is running
 
 
-def replay_message_bus(path: str) -> dict[str, JobStatus]:
+def replay_message_bus(path: FileDescriptorOrPath) -> dict[str, JobStatus]:
    """
    Replay all the messages and work out what they mean for jobs.
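A brief usage sketch of the widened signatures: since the path arguments are now typed as toil.lib.misc.FileDescriptorOrPath, a pathlib.Path (or anything os.PathLike) works as well as a plain string. The log path below is made up:

    from pathlib import Path

    from toil.bus import replay_message_bus

    statuses = replay_message_bus(Path("my-run/bus_messages.log"))  # illustrative path
    for job_id, status in statuses.items():
        print(job_id, status)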