toil 6.0.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. toil/batchSystems/abstractBatchSystem.py +19 -4
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +22 -22
  3. toil/batchSystems/cleanup_support.py +7 -3
  4. toil/batchSystems/lsf.py +7 -7
  5. toil/batchSystems/slurm.py +85 -14
  6. toil/bus.py +38 -0
  7. toil/common.py +20 -18
  8. toil/cwl/cwltoil.py +81 -63
  9. toil/exceptions.py +1 -1
  10. toil/fileStores/abstractFileStore.py +53 -4
  11. toil/fileStores/cachingFileStore.py +4 -20
  12. toil/fileStores/nonCachingFileStore.py +5 -14
  13. toil/job.py +46 -30
  14. toil/jobStores/abstractJobStore.py +21 -23
  15. toil/jobStores/aws/utils.py +5 -4
  16. toil/jobStores/fileJobStore.py +1 -1
  17. toil/leader.py +17 -14
  18. toil/lib/conversions.py +19 -0
  19. toil/lib/generatedEC2Lists.py +8 -8
  20. toil/lib/io.py +28 -2
  21. toil/lib/resources.py +8 -1
  22. toil/lib/threading.py +27 -12
  23. toil/options/common.py +5 -7
  24. toil/options/wdl.py +5 -0
  25. toil/provisioners/abstractProvisioner.py +8 -0
  26. toil/statsAndLogging.py +36 -8
  27. toil/test/batchSystems/test_slurm.py +21 -6
  28. toil/test/cactus/__init__.py +0 -0
  29. toil/test/cactus/test_cactus_integration.py +58 -0
  30. toil/test/cwl/cwlTest.py +243 -151
  31. toil/test/docs/scriptsTest.py +2 -2
  32. toil/test/jobStores/jobStoreTest.py +7 -5
  33. toil/test/lib/test_ec2.py +1 -1
  34. toil/test/options/__init__.py +13 -0
  35. toil/test/options/options.py +37 -0
  36. toil/test/provisioners/clusterTest.py +9 -8
  37. toil/test/utils/toilDebugTest.py +1 -1
  38. toil/test/utils/utilsTest.py +3 -3
  39. toil/test/wdl/wdltoil_test.py +91 -16
  40. toil/utils/toilDebugFile.py +1 -1
  41. toil/utils/toilStats.py +309 -266
  42. toil/utils/toilStatus.py +1 -1
  43. toil/version.py +9 -9
  44. toil/wdl/wdltoil.py +341 -189
  45. toil/worker.py +31 -16
  46. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/METADATA +6 -7
  47. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/RECORD +51 -47
  48. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  49. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/WHEEL +0 -0
  50. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -0
  51. {toil-6.0.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/cwl/cwltoil.py CHANGED
@@ -34,27 +34,25 @@ import stat
 import sys
 import textwrap
 import uuid
-from tempfile import NamedTemporaryFile, gettempdir
+from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir
 from threading import Thread
-from typing import (
-    IO,
-    Any,
-    Callable,
-    Dict,
-    Iterator,
-    List,
-    Mapping,
-    MutableMapping,
-    MutableSequence,
-    Optional,
-    TextIO,
-    Tuple,
-    Type,
-    TypeVar,
-    Union,
-    cast,
-    Sequence,
-)
+from typing import (IO,
+                    Any,
+                    Callable,
+                    Dict,
+                    Iterator,
+                    List,
+                    Mapping,
+                    MutableMapping,
+                    MutableSequence,
+                    Optional,
+                    Sequence,
+                    TextIO,
+                    Tuple,
+                    Type,
+                    TypeVar,
+                    Union,
+                    cast)
 from urllib.parse import quote, unquote, urlparse, urlsplit
 
 import cwl_utils.errors
@@ -68,36 +66,30 @@ import cwltool.load_tool
 import cwltool.main
 import cwltool.resolver
 import schema_salad.ref_resolver
-from configargparse import ArgParser, SUPPRESS, Namespace
+from configargparse import SUPPRESS, ArgParser, Namespace
 from cwltool.loghandler import _logger as cwllogger
 from cwltool.loghandler import defaultStreamHandler
 from cwltool.mpi import MpiConfig
 from cwltool.mutation import MutationManager
 from cwltool.pathmapper import MapperEnt, PathMapper
-from cwltool.process import (
-    Process,
-    add_sizes,
-    compute_checksums,
-    fill_in_defaults,
-    shortname,
-)
+from cwltool.process import (Process,
+                             add_sizes,
+                             compute_checksums,
+                             fill_in_defaults,
+                             shortname)
 from cwltool.secrets import SecretStore
-from cwltool.software_requirements import (
-    DependenciesConfiguration,
-    get_container_from_software_requirements,
-)
+from cwltool.software_requirements import (DependenciesConfiguration,
+                                           get_container_from_software_requirements)
 from cwltool.stdfsaccess import StdFsAccess, abspath
-from cwltool.utils import (
-    CWLObjectType,
-    CWLOutputType,
-    DirectoryType,
-    adjustDirObjs,
-    aslist,
-    downloadHttpFile,
-    get_listing,
-    normalizeFilesDirs,
-    visit_class,
-)
+from cwltool.utils import (CWLObjectType,
+                           CWLOutputType,
+                           DirectoryType,
+                           adjustDirObjs,
+                           aslist,
+                           downloadHttpFile,
+                           get_listing,
+                           normalizeFilesDirs,
+                           visit_class)
 from ruamel.yaml.comments import CommentedMap, CommentedSeq
 from schema_salad.avro.schema import Names
 from schema_salad.exceptions import ValidationException
@@ -110,18 +102,17 @@ from toil.common import Toil, addOptions
 from toil.cwl import check_cwltool_version
 
 check_cwltool_version()
-from toil.cwl.utils import (
-    CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
-    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
-    download_structure,
-    get_from_structure,
-    visit_cwl_class_and_reduce,
-)
+from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
+                            CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
+                            download_structure,
+                            get_from_structure,
+                            visit_cwl_class_and_reduce)
 from toil.exceptions import FailedJobsException
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import AcceleratorRequirement, Job, Promise, Promised, unwrap
-from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchFileException
+from toil.jobStores.abstractJobStore import (AbstractJobStore,
+                                             NoSuchFileException)
 from toil.jobStores.fileJobStore import FileJobStore
 from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
 from toil.lib.io import mkdtemp
@@ -1987,7 +1978,7 @@ def upload_file(
 
     Uploads local files to the Toil file store, and sets their location to a
     reference to the toil file store.
-    
+
     Unless skip_remote is set, downloads remote files into the file store and
    sets their locations to references into the file store as well.
     """
@@ -2614,6 +2605,13 @@ class CWLJob(CWLNamedJob):
             streaming_allowed=runtime_context.streaming_allowed,
         )
 
+        # Collect standard output and standard error somewhere if they don't go to files.
+        # We need to keep two FDs to these because cwltool will close what we give it.
+        default_stdout = TemporaryFile()
+        runtime_context.default_stdout = os.fdopen(os.dup(default_stdout.fileno()), 'wb')
+        default_stderr = TemporaryFile()
+        runtime_context.default_stderr = os.fdopen(os.dup(default_stderr.fileno()), 'wb')
+
         process_uuid = uuid.uuid4()  # noqa F841
         started_at = datetime.datetime.now()  # noqa F841
 
@@ -2622,13 +2620,34 @@ class CWLJob(CWLNamedJob):
         logger.debug("Running tool %s with order: %s", self.cwltool, self.cwljob)
 
         runtime_context.name = self.description.unitName
-        output, status = ToilSingleJobExecutor().execute(
-            process=self.cwltool,
-            job_order_object=cwljob,
-            runtime_context=runtime_context,
-            logger=cwllogger,
-        )
-        ended_at = datetime.datetime.now()  # noqa F841
+
+        status = "did_not_run"
+        try:
+            output, status = ToilSingleJobExecutor().execute(
+                process=self.cwltool,
+                job_order_object=cwljob,
+                runtime_context=runtime_context,
+                logger=cwllogger,
+            )
+        finally:
+            ended_at = datetime.datetime.now()  # noqa F841
+
+            # Log any output/error data
+            default_stdout.seek(0, os.SEEK_END)
+            if default_stdout.tell() > 0:
+                default_stdout.seek(0)
+                file_store.log_user_stream(self.description.unitName + '.stdout', default_stdout)
+                if status != "success":
+                    default_stdout.seek(0)
+                    logger.error("Failed command standard output:\n%s", default_stdout.read().decode("utf-8", errors="replace"))
+            default_stderr.seek(0, os.SEEK_END)
+            if default_stderr.tell():
+                default_stderr.seek(0)
+                file_store.log_user_stream(self.description.unitName + '.stderr', default_stderr)
+                if status != "success":
+                    default_stderr.seek(0)
+                    logger.error("Failed command standard error:\n%s", default_stderr.read().decode("utf-8", errors="replace"))
+
         if status != "success":
             raise cwl_utils.errors.WorkflowException(status)
 
@@ -3352,12 +3371,12 @@ def determine_load_listing(
 
     1. no_listing: DIRECTORY_NAME.listing will be undefined.
         e.g.
-        
+
             inputs.DIRECTORY_NAME.listing == unspecified
 
     2. shallow_listing: DIRECTORY_NAME.listing will return a list one level
         deep of DIRECTORY_NAME's contents.
-        e.g. 
+        e.g.
 
             inputs.DIRECTORY_NAME.listing == [items in directory]
             inputs.DIRECTORY_NAME.listing[0].listing == undefined
@@ -3576,7 +3595,6 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int:
     dependencies_configuration = DependenciesConfiguration(options)
     job_script_provider = dependencies_configuration
 
-    options.default_container = None
     runtime_context = cwltool.context.RuntimeContext(vars(options))
     runtime_context.toplevel = True  # enable discovery of secondaryFiles
     runtime_context.find_default_container = functools.partial(
@@ -3789,7 +3807,7 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int:
             Callable[[str], FileID],
             functools.partial(toil.import_file, symlink=True),
         )
-        
+
         # Import all the input files, some of which may be missing optional
         # files.
         logger.info("Importing input files...")
toil/exceptions.py CHANGED
@@ -36,7 +36,7 @@ class FailedJobsException(Exception):
             for job_desc in failed_jobs:
                 if job_desc.logJobStoreFileID:
                     with job_desc.getLogFileHandle(job_store) as f:
-                        self.msg += "\n" + StatsAndLogging.formatLogStream(f, job_desc)
+                        self.msg += "\n" + StatsAndLogging.formatLogStream(f, f'Log from job "{job_desc}"')
         # catch failures to prepare more complex details and only return the basics
         except Exception:
             logger.exception("Exception when compiling information about failed jobs")
toil/fileStores/abstractFileStore.py CHANGED
@@ -37,11 +37,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import Toil, cacheDirName
+from toil.common import Toil, cacheDirName, getDirSizeRecursively
 from toil.fileStores import FileID
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
+from toil.lib.conversions import bytes2human
 from toil.lib.io import WriteWatchingStream, mkdtemp
 
 logger = logging.getLogger(__name__)
@@ -116,7 +117,8 @@ class AbstractFileStore(ABC):
             self.jobDesc.command.split()[1] if self.jobDesc.command else ""
         )
         self.waitForPreviousCommit = waitForPreviousCommit
-        self.loggingMessages: List[Dict[str, Union[int, str]]] = []
+        self.logging_messages: List[Dict[str, Union[int, str]]] = []
+        self.logging_user_streams: List[dict[str, str]] = []
         # Records file IDs of files deleted during the current job. Doesn't get
         # committed back until the job is completely successful, because if the
         # job is re-run it will need to be able to re-delete these files.
@@ -125,6 +127,8 @@ class AbstractFileStore(ABC):
         # Holds records of file ID, or file ID and local path, for reporting
         # the accessed files of failed jobs.
         self._accessLog: List[Tuple[str, ...]] = []
+        # Holds total bytes of observed disk usage for the last job run under open()
+        self._job_disk_used: Optional[int] = None
 
     @staticmethod
     def createFileStore(
@@ -188,6 +192,7 @@ class AbstractFileStore(ABC):
         :param job: The job instance of the toil job to run.
         """
         failed = True
+        job_requested_disk = job.disk
         try:
             yield
             failed = False
@@ -197,6 +202,33 @@ class AbstractFileStore(ABC):
             if failed:
                 self._dumpAccessLogs()
 
+            # See how much disk space is used at the end of the job.
+            # Not a real peak disk usage, but close enough to be useful for warning the user.
+            self._job_disk_used = getDirSizeRecursively(self.localTempDir)
+
+            # Report disk usage
+            percent: float = 0.0
+            if job_requested_disk and job_requested_disk > 0:
+                percent = float(self._job_disk_used) / job_requested_disk * 100
+            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(self._job_disk_used)}B [{self._job_disk_used}B] used, "
+                               f"{bytes2human(job_requested_disk)}B [{job_requested_disk}B] requested).")
+            if self._job_disk_used > job_requested_disk:
+                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
+                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
+                                   level=logging.WARNING)
+            else:
+                self.log_to_leader(disk_usage, level=logging.DEBUG)
+
+    def get_disk_usage(self) -> Optional[int]:
+        """
+        Get the number of bytes of disk used by the last job run under open().
+
+        Disk usage is measured at the end of the job.
+        TODO: Sample periodically and record peak usage.
+        """
+        return self._job_disk_used
+
+
     # Functions related to temp files and directories
     def getLocalTempDir(self) -> str:
         """
@@ -611,13 +643,30 @@ class AbstractFileStore(ABC):
         :param level: The logging level.
         """
         logger.log(level=level, msg=("LOG-TO-MASTER: " + text))
-        self.loggingMessages.append(dict(text=text, level=level))
+        self.logging_messages.append(dict(text=text, level=level))
 
 
     @deprecated(new_function_name='log_to_leader')
     def logToMaster(self, text: str, level: int = logging.INFO) -> None:
         self.log_to_leader(text, level)
-
+
+    def log_user_stream(self, name: str, stream: IO[bytes]) -> None:
+        """
+        Send a stream of UTF-8 text to the leader as a named log stream.
+
+        Useful for things like the error logs of Docker containers. The leader
+        will show it to the user or organize it appropriately for user-level
+        log information.
+
+        :param name: A hierarchical, .-delimited string.
+        :param stream: A stream of encoded text. Encoding errors will be
+                       tolerated.
+        """
+
+        # Read the whole stream into memory
+        stream_data = stream.read().decode('utf-8', errors='replace')
+        # And remember it for the worker to fish out
+        self.logging_user_streams.append(dict(name=name, text=stream_data))
 
     # Functions run after the completion of the job.
     @abstractmethod
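As a usage sketch of the new API, a job can forward any binary stream to the leader under a dotted name. This is hedged on the method landing as shown above, and uses an in-memory stream in place of a real container log:

    import io
    from toil.common import Toil
    from toil.job import Job

    def report(job):
        # Any IO[bytes] works; undecodable bytes are replaced, not fatal.
        captured = io.BytesIO("a container's stderr contents\n".encode("utf-8"))
        job.fileStore.log_user_stream("example.report.stderr", captured)

    if __name__ == "__main__":
        options = Job.Runner.getDefaultOptions("./jobstore")
        with Toil(options) as toil:
            toil.start(Job.wrapJobFn(report))

The leader then receives the stream by name, rather than having the text interleaved into ordinary log messages.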
toil/fileStores/cachingFileStore.py CHANGED
@@ -32,13 +32,12 @@ from typing import (Any,
                     Sequence,
                     Tuple)
 
-from toil.common import cacheDirName, getDirSizeRecursively, getFileSystemSize
+from toil.common import cacheDirName, getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import (atomic_copy,
                          atomic_copyobj,
                          make_public_dir,
@@ -1041,7 +1040,8 @@ class CachingFileStore(AbstractFileStore):
         # Check the status of all jobs on this node. If there are jobs that started and died before
         # cleaning up their presence from the database, clean them up ourselves.
         self._removeDeadJobs(self.coordination_dir, self.con)
-        # Get the requirements for the job.
+        # Get the disk requirement for the job, which we will use to know if we
+        # have filled the cache or not.
         self.jobDiskBytes = job.disk
 
         logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.',
@@ -1055,22 +1055,6 @@ class CachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            # See how much disk space is used at the end of the job.
-            # Not a real peak disk usage, but close enough to be useful for warning the user.
-            # TODO: Push this logic into the abstract file store
-            disk: int = getDirSizeRecursively(self.localTempDir)
-            percent: float = 0.0
-            if self.jobDiskBytes and self.jobDiskBytes > 0:
-                percent = float(disk) / self.jobDiskBytes * 100
-            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                               f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
-            if disk > self.jobDiskBytes:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
-
             # Go back up to the per-worker local temp directory.
             os.chdir(startingDir)
             self.cleanupInProgress = True
@@ -1095,7 +1079,7 @@ class CachingFileStore(AbstractFileStore):
         # Create an empty file to get an ID.
         # Make sure to pass along the file basename.
         # TODO: this empty file could leak if we die now...
-        fileID = self.jobStore.getEmptyFileStoreID(creatorID, cleanup, os.path.basename(localFileName))
+        fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName))
         # Work out who we are
         with self.as_process() as me:
 
toil/fileStores/nonCachingFileStore.py CHANGED
@@ -35,13 +35,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import getDirSizeRecursively, getFileSystemSize
+from toil.common import getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import make_public_dir, robust_rmtree
 from toil.lib.retry import ErrorCondition, retry
 from toil.lib.threading import get_process_name, process_name_exists
@@ -102,7 +101,6 @@ class NonCachingFileStore(AbstractFileStore):
 
     @contextmanager
     def open(self, job: Job) -> Generator[None, None, None]:
-        jobReqs = job.disk
         startingDir = os.getcwd()
         self.localTempDir: str = make_public_dir(in_directory=self.localTempDir)
         self._removeDeadJobs(self.coordination_dir)
@@ -116,16 +114,6 @@ class NonCachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            disk = getDirSizeRecursively(self.localTempDir)
-            percent = float(disk) / jobReqs * 100 if jobReqs > 0 else 0.0
-            disk_usage = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                          f"{bytes2human(jobReqs)}B [{jobReqs}B] requested).")
-            if disk > jobReqs:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
             os.chdir(startingDir)
             # Finally delete the job from the worker
             self.check_for_state_corruption()
@@ -362,7 +350,10 @@ class NonCachingFileStore(AbstractFileStore):
         jobState = {'jobProcessName': get_process_name(self.coordination_dir),
                     'jobName': self.jobName,
                     'jobDir': self.localTempDir}
-        (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir)
+        try:
+            (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir)
+        except Exception as e:
+            raise RuntimeError("Could not make state file in " + self.coordination_dir) from e
         with open(fd, 'wb') as fH:
             # Write data
             dill.dump(jobState, fH)
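The try/except above adds context rather than behavior: a bare OSError from mkstemp does not name the directory that was unusable, so the failure is re-raised with the coordination directory in the message, while "from e" keeps the original traceback chained. The same pattern in isolation:

    import os
    import tempfile

    def make_state_file(directory: str) -> str:
        try:
            fd, path = tempfile.mkstemp(suffix=".jobState.tmp", dir=directory)
        except Exception as e:
            # The chained traceback still shows the original OSError.
            raise RuntimeError("Could not make state file in " + directory) from e
        os.close(fd)
        return path

    print(make_state_file(tempfile.gettempdir()))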
toil/job.py CHANGED
@@ -45,6 +45,7 @@ from typing import (TYPE_CHECKING,
 
 from configargparse import ArgParser
 
+from toil.bus import Names
 from toil.lib.compatibility import deprecated
 
 if sys.version_info >= (3, 8):
@@ -710,7 +711,6 @@ class Requirer:
             parts = ['no requirements']
         return ', '.join(parts)
 
-
 class JobDescription(Requirer):
     """
     Stores all the information that the Toil Leader ever needs to know about a Job.
@@ -814,11 +814,14 @@ class JobDescription(Requirer):
         # in the process of being committed.
         self.filesToDelete = []
 
-        # Holds JobStore Job IDs of the jobs that have been chained into this
+        # Holds job names and IDs of the jobs that have been chained into this
         # job, and which should be deleted when this job finally is deleted
         # (but not before). The successor relationships with them will have
-        # been cut, so we need to hold onto them somehow.
-        self.merged_jobs = []
+        # been cut, so we need to hold onto them somehow. Includes each
+        # chained-in job with its original ID, and also this job's ID with its
+        # original names, or is empty if no chaining has happened.
+        # The first job in the chain comes first in the list.
+        self._merged_job_names: List[Names] = []
 
         # The number of direct predecessors of the job. Needs to be stored at
         # the JobDescription to support dynamically-created jobs with multiple
@@ -867,9 +870,26 @@ class JobDescription(Requirer):
         # And we log who made the version (by PID)
         self._job_version_writer = 0
 
-        # Human-readable names of jobs that were run as part of this job's
-        # invocation, starting with this job
-        self.chainedJobs = []
+    def get_names(self) -> Names:
+        """
+        Get the names and ID of this job as a named tuple.
+        """
+        return Names(self.jobName, self.unitName, self.displayName, self.displayName, str(self.jobStoreID))
+
+    def get_chain(self) -> List[Names]:
+        """
+        Get all the jobs that executed in this job's chain, in order.
+
+        For each job, produces a named tuple with its various names and its
+        original job store ID. The jobs in the chain are in execution order.
+
+        If the job hasn't run yet or it didn't chain, produces a one-item list.
+        """
+        if len(self._merged_job_names) == 0:
+            # We haven't merged so we're just ourselves.
+            return [self.get_names()]
+        else:
+            return list(self._merged_job_names)
 
     def serviceHostIDsInBatches(self) -> Iterator[List[str]]:
         """
@@ -1045,8 +1065,23 @@ class JobDescription(Requirer):
         self.successor_phases = old_phases + self.successor_phases
 
         # When deleting, we need to delete the files for our old ID, and also
-        # anything that needed to be deleted for the job we are replacing.
-        self.merged_jobs += [self.jobStoreID] + other.merged_jobs
+        # anything that needed to be deleted for the job we are replacing. And
+        # we need to keep track of all the names of jobs involved for logging.
+
+        # We need first the job we are merging into if nothing has merged into
+        # it yet, then anything that already merged into it (including it),
+        # then us if nothing has yet merged into us, then anything that merged
+        # into us (including us)
+        _merged_job_names = []
+        if len(other._merged_job_names) == 0:
+            _merged_job_names.append(other.get_names())
+        _merged_job_names += other._merged_job_names
+        if len(self._merged_job_names) == 0:
+            _merged_job_names.append(self.get_names())
+        _merged_job_names += self._merged_job_names
+        self._merged_job_names = _merged_job_names
+
+        # Now steal its ID.
         self.jobStoreID = other.jobStoreID
 
         if len(other.filesToDelete) > 0:
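The subtlety in the merge above is keeping execution order regardless of whether either side has chained before: the earlier job (other) contributes its chain first, then the later job (self), and a side that has never merged contributes just its own names. A toy sketch of the same bookkeeping, with plain strings in place of Names tuples and replace() mirroring where this logic appears to live:

    from typing import List

    class Desc:
        # Toy stand-in for JobDescription's chain bookkeeping.
        def __init__(self, name: str) -> None:
            self.name = name
            self.merged: List[str] = []  # stands in for _merged_job_names

        def get_chain(self) -> List[str]:
            return [self.name] if not self.merged else list(self.merged)

        def replace(self, other: "Desc") -> None:
            # Earlier job first, then us; seed either side with its own
            # name if it has never merged before.
            merged: List[str] = []
            if not other.merged:
                merged.append(other.name)
            merged += other.merged
            if not self.merged:
                merged.append(self.name)
            merged += self.merged
            self.merged = merged

    a, b, c = Desc("a"), Desc("b"), Desc("c")
    b.replace(a)          # b chains onto a
    c.replace(b)          # c chains onto the a -> b chain
    print(c.get_chain())  # ['a', 'b', 'c']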
@@ -1263,26 +1298,6 @@ class JobDescription(Requirer):
         self._job_version_writer = os.getpid()
         logger.debug("New job version: %s", self)
 
-    def get_job_kind(self) -> str:
-        """
-        Return an identifying string for the job.
-
-        The result may contain spaces.
-
-        Returns: Either the unit name, job name, or display name, which identifies
-                 the kind of job it is to toil.
-                 Otherwise "Unknown Job" in case no identifier is available
-        """
-        if self.unitName:
-            return self.unitName
-        elif self.jobName:
-            return self.jobName
-        elif self.displayName:
-            return self.displayName
-        else:
-            return "Unknown Job"
-
-
 class ServiceJobDescription(JobDescription):
     """A description of a job that hosts a service."""
 
@@ -2787,7 +2802,8 @@ class Job:
                     clock=str(totalCpuTime - startClock),
                     class_name=self._jobName(),
                     memory=str(totalMemoryUsage),
-                    requested_cores=str(self.cores)
+                    requested_cores=str(self.cores),
+                    disk=str(fileStore.get_disk_usage())
                 )
             )
 
toil/jobStores/abstractJobStore.py CHANGED
@@ -835,16 +835,17 @@ class AbstractJobStore(ABC):
         root_job_description = self.load_root_job()
         reachable_from_root: Set[str] = set()
 
-        # Add first root job outside of the loop below.
-        reachable_from_root.add(str(root_job_description.jobStoreID))
-        # add all of root's linked service jobs as well
-        for service_jobstore_id in root_job_description.services:
-            if haveJob(service_jobstore_id):
-                reachable_from_root.add(service_jobstore_id)
-        for merged_jobstore_id in root_job_description.merged_jobs:
+
+        for merged_in in root_job_description.get_chain():
+            # Add the job itself and any other jobs that chained with it.
             # Keep merged-in jobs around themselves, but don't bother
             # exploring them, since we took their successors.
-            reachable_from_root.add(merged_jobstore_id)
+            reachable_from_root.add(merged_in.job_store_id)
+        # add all of root's linked service jobs as well
+        for service_job_store_id in root_job_description.services:
+            if haveJob(service_job_store_id):
+                reachable_from_root.add(service_job_store_id)
+
 
         # Unprocessed means it might have successor jobs we need to add.
         unprocessed_job_descriptions = [root_job_description]
@@ -852,24 +853,21 @@ class AbstractJobStore(ABC):
         while unprocessed_job_descriptions:
             new_job_descriptions_to_process = []  # Reset.
             for job_description in unprocessed_job_descriptions:
-                for successor_jobstore_id in job_description.allSuccessors():
-                    if successor_jobstore_id not in reachable_from_root and haveJob(successor_jobstore_id):
-                        successor_job_description = getJobDescription(successor_jobstore_id)
-
-                        # Add each successor job.
-                        reachable_from_root.add(
-                            str(successor_job_description.jobStoreID)
-                        )
+                for merged_in in job_description.get_chain():
+                    # Add the job and anything chained with it.
+                    # Keep merged-in jobs around themselves, but don't bother
+                    # exploring them, since we took their successors.
+                    reachable_from_root.add(merged_in.job_store_id)
+                for successor_job_store_id in job_description.allSuccessors():
+                    if successor_job_store_id not in reachable_from_root and haveJob(successor_job_store_id):
+                        successor_job_description = getJobDescription(successor_job_store_id)
+
                         # Add all of the successor's linked service jobs as well.
-                        for service_jobstore_id in successor_job_description.services:
-                            if haveJob(service_jobstore_id):
-                                reachable_from_root.add(service_jobstore_id)
+                        for service_job_store_id in successor_job_description.services:
+                            if haveJob(service_job_store_id):
+                                reachable_from_root.add(service_job_store_id)
 
                         new_job_descriptions_to_process.append(successor_job_description)
-                for merged_jobstore_id in job_description.merged_jobs:
-                    # Keep merged-in jobs around themselves, but don't bother
-                    # exploring them, since we took their successors.
-                    reachable_from_root.add(merged_jobstore_id)
             unprocessed_job_descriptions = new_job_descriptions_to_process
 
         logger.debug(f"{len(reachable_from_root)} jobs reachable from root.")
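The net effect of the two hunks above is that the reachability sweep now marks every member of a traversed job's chain via get_chain(), instead of reading raw IDs out of merged_jobs. A condensed sketch of the traversal over a hypothetical in-memory job table:

    # Hypothetical stand-in for the job store: each entry carries the
    # values that get_chain(), allSuccessors(), and .services would produce.
    jobs = {
        "root": {"chain": ["root"], "successors": ["s1"], "services": []},
        "s1": {"chain": ["s1_old", "s1"], "successors": [], "services": ["svc"]},
        "svc": {"chain": ["svc"], "successors": [], "services": []},
    }

    reachable = set()
    frontier = ["root"]
    while frontier:
        next_frontier = []
        for job_id in frontier:
            # Mark the job and everything chained into it.
            reachable.update(jobs[job_id]["chain"])
            for succ in jobs[job_id]["successors"]:
                if succ not in reachable and succ in jobs:
                    # Services hang off the successor, so mark them too.
                    reachable.update(s for s in jobs[succ]["services"] if s in jobs)
                    next_frontier.append(succ)
        frontier = next_frontier

    print(sorted(reachable))  # ['root', 's1', 's1_old', 'svc']

Chained-away IDs like "s1_old" are kept because each traversed job's chain is consulted, even though chained jobs are never themselves explored for successors.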