toil 8.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. toil/__init__.py +4 -4
  2. toil/batchSystems/options.py +1 -0
  3. toil/batchSystems/slurm.py +227 -83
  4. toil/common.py +161 -45
  5. toil/cwl/cwltoil.py +31 -10
  6. toil/job.py +47 -38
  7. toil/jobStores/aws/jobStore.py +46 -10
  8. toil/lib/aws/session.py +14 -3
  9. toil/lib/aws/utils.py +92 -35
  10. toil/lib/dockstore.py +379 -0
  11. toil/lib/ec2nodes.py +3 -2
  12. toil/lib/history.py +1271 -0
  13. toil/lib/history_submission.py +681 -0
  14. toil/lib/io.py +22 -1
  15. toil/lib/misc.py +18 -0
  16. toil/lib/retry.py +10 -10
  17. toil/lib/{integration.py → trs.py} +95 -46
  18. toil/lib/web.py +38 -0
  19. toil/options/common.py +17 -2
  20. toil/options/cwl.py +10 -0
  21. toil/provisioners/gceProvisioner.py +4 -4
  22. toil/server/cli/wes_cwl_runner.py +3 -3
  23. toil/server/utils.py +2 -3
  24. toil/statsAndLogging.py +35 -1
  25. toil/test/batchSystems/test_slurm.py +172 -2
  26. toil/test/cwl/conftest.py +39 -0
  27. toil/test/cwl/cwlTest.py +105 -2
  28. toil/test/cwl/optional-file.cwl +18 -0
  29. toil/test/lib/test_history.py +212 -0
  30. toil/test/lib/test_trs.py +161 -0
  31. toil/test/wdl/wdltoil_test.py +1 -1
  32. toil/version.py +10 -10
  33. toil/wdl/wdltoil.py +23 -9
  34. toil/worker.py +113 -33
  35. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/METADATA +9 -4
  36. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/RECORD +40 -34
  37. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  38. toil/test/lib/test_integration.py +0 -104
  39. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  40. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  41. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/__init__.py CHANGED
@@ -19,7 +19,7 @@ import sys
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional
 
-import requests
+from toil.lib.web import web_session
 
 from docker.errors import ImageNotFound
 from toil.lib.memoize import memoize
@@ -425,7 +425,7 @@ def requestCheckRegularDocker(
     ioURL = "https://{webhost}/v2/{pathName}/manifests/{tag}" "".format(
         webhost=registryName, pathName=imageName, tag=tag
     )
-    response = requests.head(ioURL)
+    response = web_session.head(ioURL)
     if not response.ok:
         raise ApplianceImageNotFound(origAppliance, ioURL, response.status_code)
     else:
@@ -459,10 +459,10 @@ def requestCheckDockerIo(origAppliance: str, imageName: str, tag: str) -> bool:
     )
    requests_url = f"https://registry-1.docker.io/v2/{imageName}/manifests/{tag}"
 
-    token = requests.get(token_url)
+    token = web_session.get(token_url)
    jsonToken = token.json()
    bearer = jsonToken["token"]
-    response = requests.head(
+    response = web_session.head(
        requests_url, headers={"Authorization": f"Bearer {bearer}"}
    )
    if not response.ok:
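Note: the hunks above replace direct requests calls with a shared web_session from the new toil/lib/web.py module, whose contents are not shown in this diff. As an illustration only of why a shared session helps (connection reuse and a single place to configure retries and headers), such a session could be built roughly as follows; the retry policy and User-Agent below are assumptions, not Toil's actual configuration:

    # Hypothetical sketch of a shared HTTP session; NOT the real toil/lib/web.py.
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    web_session = requests.Session()
    # Retry transient failures a few times with backoff (assumed policy).
    retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
    web_session.mount("https://", HTTPAdapter(max_retries=retries))
    web_session.mount("http://", HTTPAdapter(max_retries=retries))
    web_session.headers.update({"User-Agent": "toil"})  # assumed value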
toil/batchSystems/options.py CHANGED
@@ -185,6 +185,7 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         "systems such as gridengine, htcondor, torque, slurm, and lsf.",
     )
 
+    # TODO: Move this to Slurm specifically.
     parser.add_argument(
         "--memoryIsProduct",
         dest="memory_is_product",
toil/batchSystems/slurm.py CHANGED
@@ -13,13 +13,14 @@
 # limitations under the License.
 from __future__ import annotations
 
+import errno
 import logging
 import math
 import os
 import sys
 from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
-from shlex import quote
-from typing import NamedTuple, TypeVar
+import shlex
+from typing import Callable, NamedTuple, TypeVar
 
 from toil.batchSystems.abstractBatchSystem import (
     EXIT_STATUS_UNAVAILABLE_VALUE,
@@ -185,6 +186,8 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     def get_partition(self, time_limit: float | None) -> str | None:
         """
         Get the partition name to use for a job with the given time limit.
+
+        :param time_limit: Time limit in seconds.
         """
 
         if time_limit is None:
@@ -193,17 +196,36 @@
 
         winning_partition = None
         for partition in self.all_partitions:
-            if partition.time_limit >= time_limit and (
-                winning_partition is None
-                or partition.time_limit < winning_partition.time_limit
-            ):
-                # If this partition can fit the job and is faster than the current winner, take it
+            if partition.time_limit < time_limit:
+                # Can't use this
+                continue
+            if winning_partition is None:
+                # Anything beats None
                 winning_partition = partition
+                continue
+            if partition.gres and not winning_partition.gres:
+                # Never use a partition witn GRES if you can avoid it
+                continue
+            elif not partition.gres and winning_partition.gres:
+                # Never keep a partition with GRES if we find one without
+                winning_partition = partition
+                continue
+            if partition.priority > winning_partition.priority:
+                # After that, don't raise priority
+                continue
+            elif partition.priority < winning_partition.priority:
+                # And always lower it
+                winning_partition = partition
+                continue
+            if partition.time_limit < winning_partition.time_limit:
+                # Finally, lower time limit
+                winning_partition = partition
+
         # TODO: Store partitions in a better indexed way
         if winning_partition is None and len(self.all_partitions) > 0:
             # We have partitions and none of them can fit this
             raise RuntimeError(
-                "Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
+                f"Could not find a Slurm partition that can fit a job that runs for {time_limit} seconds"
             )
 
         if winning_partition is None:
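Note: the replacement loop above is a cascade of tie-breakers: drop partitions whose time limit is too short, then prefer partitions without GRES, then lower scheduler priority, then the tightest remaining time limit. The same ordering can be stated as a sort key; this is only an illustrative restatement (assuming a partition record with gres, priority, and time_limit fields as in the diff), not code from the package:

    # Illustrative restatement of the tie-break order, as a hypothetical helper.
    def pick_partition(partitions, time_limit):
        # Keep only partitions whose limit can fit the job.
        candidates = [p for p in partitions if p.time_limit >= time_limit]
        if not candidates:
            return None
        # Prefer no GRES, then lowest priority, then tightest time limit;
        # ties keep the first partition seen, as in the loop above.
        return min(candidates, key=lambda p: (bool(p.gres), p.priority, p.time_limit))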
@@ -344,7 +366,9 @@
         """
         try:
             status_dict = self._getJobDetailsFromSacct(job_id_list)
-        except CalledProcessErrorStderr:
+        except (CalledProcessErrorStderr, OSError) as e:
+            if isinstance(e, OSError):
+                logger.warning("Could not run sacct: %s", e)
             status_dict = self._getJobDetailsFromScontrol(job_id_list)
         return status_dict
 
@@ -437,11 +461,25 @@
             "-S",
             "1970-01-01",
         ]  # override start time limit
-        stdout = call_command(args, quiet=True)
 
         # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
         # job state and exit status. Initialize dict before processing output of `sacct`.
         job_statuses: dict[int, tuple[str | None, int | None]] = {}
+
+        try:
+            stdout = call_command(args, quiet=True)
+        except OSError as e:
+            if e.errno == errno.E2BIG:
+                # Argument list is too big, recurse on half the argument list
+                if len(job_id_list) == 1:
+                    # 1 is too big, we can't recurse further, bail out
+                    raise
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[:len(job_id_list)//2]))
+                job_statuses.update(self._getJobDetailsFromSacct(job_id_list[len(job_id_list)//2:]))
+                return job_statuses
+            else:
+                raise
+
         for job_id in job_id_list:
             job_statuses[job_id] = (None, None)
 
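Note: the new try/except above handles the case where the sacct argument list (one entry per job ID) exceeds the operating system's argument-size limit, which surfaces as OSError with errno.E2BIG; the job list is then split in half and each half is queried recursively. The pattern generalizes to any command built from a long item list; here is a standalone sketch in which run_batched and its subprocess call are stand-ins, not Toil code:

    # Generic "bisect on E2BIG" sketch; subprocess.run stands in for Toil's call_command.
    import errno
    import subprocess

    def run_batched(base_args: list[str], items: list[str]) -> str:
        try:
            return subprocess.run(
                base_args + items, check=True, capture_output=True, text=True
            ).stdout
        except OSError as e:
            if e.errno != errno.E2BIG or len(items) == 1:
                # Not an argument-length problem, or we cannot split further.
                raise
            half = len(items) // 2
            # Query each half separately and concatenate the output.
            return run_batched(base_args, items[:half]) + run_batched(base_args, items[half:])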
@@ -609,104 +647,194 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         # Also any extra arguments from --slurmArgs or TOIL_SLURM_ARGS
         nativeConfig: str = self.boss.config.slurm_args  # type: ignore[attr-defined]
 
+        # For parsing user-provided option overrides (or self-generated
+        # options) we need a way to recognize long, long-with-equals, and
+        # short forms.
+        def option_detector(long: str, short: str | None = None) -> Callable[[str], bool]:
+            """
+            Get a function that returns true if it sees the long or short
+            option.
+            """
+            def is_match(option: str) -> bool:
+                return option == f"--{long}" or option.startswith(f"--{long}=") or (short is not None and option == f"-{short}")
+            return is_match
+
+        def any_option_detector(options: list[str | tuple[str, str]]) -> Callable[[str], bool]:
+            """
+            Get a function that returns true if it sees any of the long
+            options or long or short option pairs.
+            """
+            detectors = [option_detector(o) if isinstance(o, str) else option_detector(*o) for o in options]
+            def is_match(option: str) -> bool:
+                for detector in detectors:
+                    if detector(option):
+                        return True
+                return False
+            return is_match
+
+        is_any_mem_option = any_option_detector(["mem", "mem-per-cpu", "mem-per-gpu"])
+        is_any_cpus_option = any_option_detector([("cpus-per-task", "c"), "cpus-per-gpu"])
+        is_export_option = option_detector("export")
+        is_export_file_option = option_detector("export-file")
+        is_time_option = option_detector("time", "t")
+        is_partition_option = option_detector("partition", "p")
+
+        # We will fill these in with stuff parsed from TOIL_SLURM_ARGS, or
+        # with our own determinations if they aren't there.
+
         # --export=[ALL,]<environment_toil_variables>
-        set_exports = "--export=ALL"
+        export_all = True
+        export_list = []  # Some items here may be multiple comma-separated values
+        time_limit: int | None = self.boss.config.slurm_time  # type: ignore[attr-defined]
+        partition: str | None = None
 
         if nativeConfig is not None:
             logger.debug(
                 "Native SLURM options appended to sbatch: %s", nativeConfig
             )
 
-            for arg in nativeConfig.split():
-                if arg.startswith("--mem") or arg.startswith("--cpus-per-task"):
+            # Do a mini argument parse to pull out export and parse time if
+            # needed
+            args = shlex.split(nativeConfig)
+            i = 0
+            while i < len(args):
+                arg = args[i]
+                if is_any_mem_option(arg) or is_any_cpus_option(arg):
+                    # Prohibit arguments that set CPUs or memory
                     raise ValueError(
-                        f"Some resource arguments are incompatible: {nativeConfig}"
+                        f"Cannot use Slurm argument {arg} which conflicts "
+                        f"with Toil's own arguments to Slurm"
                     )
-                # repleace default behaviour by the one stated at TOIL_SLURM_ARGS
-                if arg.startswith("--export"):
-                    set_exports = arg
-            sbatch_line.extend(nativeConfig.split())
+                elif is_export_option(arg):
+                    # Capture the export argument value so we can modify it
+                    export_all = False
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        export_list.append(args[i])
+                    else:
+                        export_list.append(arg.split("=", 1)[1])
+                elif is_export_file_option(arg):
+                    # Keep --export-file but turn off --export=ALL in that
+                    # case.
+                    export_all = False
+                    sbatch_line.append(arg)
+                elif is_time_option(arg):
+                    # Capture the time limit in seconds so we can use it for picking a partition
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        time_string = args[i]
+                    else:
+                        time_string = arg.split("=", 1)[1]
+                    time_limit = parse_slurm_time(time_string)
+                elif is_partition_option(arg):
+                    # Capture the partition so we can run checks on it and know not to assign one
+                    if "=" not in arg:
+                        if i + 1 >= len(args):
+                            raise ValueError(
+                                f"No value supplied for Slurm {arg} argument"
+                            )
+                        i += 1
+                        partition = args[i]
+                    else:
+                        partition = arg.split("=", 1)[1]
+                else:
+                    # Other arguments pass through.
+                    sbatch_line.append(arg)
+                i += 1
+
+        if export_all:
+            # We don't have any export overrides so we ened to start with
+            # an ALL
+            export_list.append("ALL")
 
         if environment:
             argList = []
 
             for k, v in environment.items():
-                quoted_value = quote(os.environ[k] if v is None else v)
+                # TODO: The sbatch man page doesn't say we can quote these;
+                # if we need to send characters like , itself we need to
+                # use --export-file and clean it up when the command has
+                # been issued.
+                quoted_value = shlex.quote(os.environ[k] if v is None else v)
                 argList.append(f"{k}={quoted_value}")
 
-            set_exports += "," + ",".join(argList)
-
-        # add --export to the sbatch
-        sbatch_line.append(set_exports)
-
-        parallel_env: str = self.boss.config.slurm_pe  # type: ignore[attr-defined]
-        if cpu and cpu > 1 and parallel_env:
-            sbatch_line.append(f"--partition={parallel_env}")
-
+            export_list.extend(argList)
+
+        # If partition isn't set and we have a GPU partition override
+        # that applies, apply it
+        gpu_partition_override: str | None = self.boss.config.slurm_gpu_partition  # type: ignore[attr-defined]
+        if partition is None and gpus and gpu_partition_override:
+            partition = gpu_partition_override
+
+        # If partition isn't set and we have a parallel partition override
+        # that applies, apply it
+        parallel_env: str | None = self.boss.config.slurm_pe  # type: ignore[attr-defined]
+        if partition is None and cpu and cpu > 1 and parallel_env:
+            partition = parallel_env
+
+        # If partition isn't set and we have a general partition override
+        # that applies, apply it
+        partition_override: str | None = self.boss.config.slurm_partition  # type: ignore[attr-defined]
+        if partition is None and partition_override:
+            partition = partition_override
+
+        if partition is None and gpus:
+            # Send to a GPU partition
+            gpu_partition = self.boss.partitions.default_gpu_partition
+            if gpu_partition is None:
+                # no gpu partitions are available, raise an error
+                raise RuntimeError(
+                    f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
+                )
+            if (
+                time_limit is not None
+                and gpu_partition.time_limit < time_limit
+            ):
+                # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
+                logger.warning(
+                    "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
+                    time_limit,
+                    gpu_partition.partition_name,
+                    gpu_partition.time_limit,
+                )
+            partition = gpu_partition.partition_name
+
+        if partition is None:
+            # Pick a partition based on time limit
+            partition = self.boss.partitions.get_partition(time_limit)
+
+        # Now generate all the arguments
+        if len(export_list) > 0:
+            # add --export to the sbatch
+            sbatch_line.append("--export=" + ",".join(export_list))
+        if partition is not None:
+            sbatch_line.append(f"--partition={partition}")
+        if gpus:
+            # Generate GPU assignment argument
+            sbatch_line.append(f"--gres=gpu:{gpus}")
+            if partition is not None and partition not in self.boss.partitions.gpu_partitions:
+                # the specified partition is not compatible, so warn the user that the job may not work
+                logger.warning(
+                    f"Job {jobName} needs GPUs, but specified partition {partition} does not have them. This job may not work."
+                    f"Try specifying one of these partitions instead: {', '.join(self.boss.partitions.gpu_partitions)}."
+                )
         if mem is not None and self.boss.config.slurm_allocate_mem:  # type: ignore[attr-defined]
             # memory passed in is in bytes, but slurm expects megabytes
             sbatch_line.append(f"--mem={math.ceil(mem / 2 ** 20)}")
         if cpu is not None:
             sbatch_line.append(f"--cpus-per-task={math.ceil(cpu)}")
-
-        time_limit: int = self.boss.config.slurm_time  # type: ignore[attr-defined]
         if time_limit is not None:
             # Put all the seconds in the seconds slot
             sbatch_line.append(f"--time=0:{time_limit}")
 
-        if gpus:
-            # This block will add a gpu supported partition only if no partition is supplied by the user
-            sbatch_line = sbatch_line[:1] + [f"--gres=gpu:{gpus}"] + sbatch_line[1:]
-            if not any(option.startswith("--partition") for option in sbatch_line):
-                # no partition specified, so specify one
-                # try to get the name of the lowest priority gpu supported partition
-                lowest_gpu_partition = self.boss.partitions.default_gpu_partition
-                if lowest_gpu_partition is None:
-                    # no gpu partitions are available, raise an error
-                    raise RuntimeError(
-                        f"The job {jobName} is requesting GPUs, but the Slurm cluster does not appear to have an accessible partition with GPUs"
-                    )
-                if (
-                    time_limit is not None
-                    and lowest_gpu_partition.time_limit < time_limit
-                ):
-                    # TODO: find the lowest-priority GPU partition that has at least each job's time limit!
-                    logger.warning(
-                        "Trying to submit a job that needs %s seconds to partition %s that has a limit of %s seconds",
-                        time_limit,
-                        lowest_gpu_partition.partition_name,
-                        lowest_gpu_partition.time_limit,
-                    )
-                sbatch_line.append(
-                    f"--partition={lowest_gpu_partition.partition_name}"
-                )
-            else:
-                # there is a partition specified already, check if the partition has GPUs
-                for i, option in enumerate(sbatch_line):
-                    if option.startswith("--partition"):
-                        # grab the partition name depending on if it's specified via an "=" or a space
-                        if "=" in option:
-                            partition_name = option[len("--partition=") :]
-                        else:
-                            partition_name = option[i + 1]
-                        available_gpu_partitions = (
-                            self.boss.partitions.gpu_partitions
-                        )
-                        if partition_name not in available_gpu_partitions:
-                            # the specified partition is not compatible, so warn the user that the job may not work
-                            logger.warning(
-                                f"Job {jobName} needs {gpus} GPUs, but specified partition {partition_name} is incompatible. This job may not work."
-                                f"Try specifying one of these partitions instead: {', '.join(available_gpu_partitions)}."
-                            )
-                        break
-
-        if not any(option.startswith("--partition") for option in sbatch_line):
-            # Pick a partition ourselves
-            chosen_partition = self.boss.partitions.get_partition(time_limit)
-            if chosen_partition is not None:
-                # Route to that partition
-                sbatch_line.append(f"--partition={chosen_partition}")
-
         stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%j", "out")
         stderrfile: str = self.boss.format_std_out_err_path(jobID, "%j", "err")
         sbatch_line.extend(["-o", stdoutfile, "-e", stderrfile])
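Note: option_detector above builds a predicate that accepts the three spellings sbatch allows for an option: the long form as a separate token, the long form with an equals sign, and the short form. Purely as an illustration of that behaviour, the snippet below restates the same matching logic outside the class; it is not part of the package:

    # Standalone restatement of option_detector's matching behaviour.
    def option_detector(long, short=None):
        def is_match(option):
            return (
                option == f"--{long}"
                or option.startswith(f"--{long}=")
                or (short is not None and option == f"-{short}")
            )
        return is_match

    is_time_option = option_detector("time", "t")
    assert is_time_option("--time")           # value expected in the next token
    assert is_time_option("--time=02:00:00")  # value attached with "="
    assert is_time_option("-t")               # short form
    assert not is_time_option("--time-min")   # different option, not matched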
@@ -830,6 +958,20 @@
             env_var="TOIL_SLURM_TIME",
             help="Slurm job time limit, in [DD-]HH:MM:SS format.",
         )
+        parser.add_argument(
+            "--slurmPartition",
+            dest="slurm_partition",
+            default=None,
+            env_var="TOIL_SLURM_PARTITION",
+            help="Partition to send Slurm jobs to.",
+        )
+        parser.add_argument(
+            "--slurmGPUPartition",
+            dest="slurm_gpu_partition",
+            default=None,
+            env_var="TOIL_SLURM_GPU_PARTITION",
+            help="Partition to send Slurm jobs to if they ask for GPUs.",
+        )
         parser.add_argument(
             "--slurmPE",
             dest="slurm_pe",
@@ -852,5 +994,7 @@
         setOption("slurm_allocate_mem")
         setOption("slurm_default_all_mem")
         setOption("slurm_time")
+        setOption("slurm_partition")
+        setOption("slurm_gpu_partition")
         setOption("slurm_pe")
         setOption("slurm_args")