toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +22 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/contained_executor.py +4 -5
  6. toil/batchSystems/gridengine.py +1 -1
  7. toil/batchSystems/htcondor.py +5 -5
  8. toil/batchSystems/kubernetes.py +25 -11
  9. toil/batchSystems/local_support.py +3 -3
  10. toil/batchSystems/lsf.py +2 -2
  11. toil/batchSystems/mesos/batchSystem.py +4 -4
  12. toil/batchSystems/mesos/executor.py +3 -2
  13. toil/batchSystems/options.py +9 -0
  14. toil/batchSystems/singleMachine.py +11 -10
  15. toil/batchSystems/slurm.py +64 -22
  16. toil/batchSystems/torque.py +1 -1
  17. toil/bus.py +7 -3
  18. toil/common.py +36 -13
  19. toil/cwl/cwltoil.py +365 -312
  20. toil/deferred.py +1 -1
  21. toil/fileStores/abstractFileStore.py +17 -17
  22. toil/fileStores/cachingFileStore.py +2 -2
  23. toil/fileStores/nonCachingFileStore.py +1 -1
  24. toil/job.py +228 -60
  25. toil/jobStores/abstractJobStore.py +18 -10
  26. toil/jobStores/aws/jobStore.py +280 -218
  27. toil/jobStores/aws/utils.py +57 -29
  28. toil/jobStores/conftest.py +2 -2
  29. toil/jobStores/fileJobStore.py +2 -2
  30. toil/jobStores/googleJobStore.py +3 -4
  31. toil/leader.py +72 -24
  32. toil/lib/aws/__init__.py +26 -10
  33. toil/lib/aws/iam.py +2 -2
  34. toil/lib/aws/session.py +62 -22
  35. toil/lib/aws/utils.py +73 -37
  36. toil/lib/conversions.py +5 -1
  37. toil/lib/ec2.py +118 -69
  38. toil/lib/expando.py +1 -1
  39. toil/lib/io.py +14 -2
  40. toil/lib/misc.py +1 -3
  41. toil/lib/resources.py +55 -21
  42. toil/lib/retry.py +12 -5
  43. toil/lib/threading.py +2 -2
  44. toil/lib/throttle.py +1 -1
  45. toil/options/common.py +27 -24
  46. toil/provisioners/__init__.py +9 -3
  47. toil/provisioners/abstractProvisioner.py +9 -7
  48. toil/provisioners/aws/__init__.py +20 -15
  49. toil/provisioners/aws/awsProvisioner.py +406 -329
  50. toil/provisioners/gceProvisioner.py +2 -2
  51. toil/provisioners/node.py +13 -5
  52. toil/server/app.py +1 -1
  53. toil/statsAndLogging.py +58 -16
  54. toil/test/__init__.py +27 -12
  55. toil/test/batchSystems/batchSystemTest.py +40 -33
  56. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  57. toil/test/batchSystems/test_slurm.py +1 -1
  58. toil/test/cwl/cwlTest.py +8 -91
  59. toil/test/cwl/seqtk_seq.cwl +1 -1
  60. toil/test/docs/scriptsTest.py +10 -13
  61. toil/test/jobStores/jobStoreTest.py +33 -49
  62. toil/test/lib/aws/test_iam.py +2 -2
  63. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  64. toil/test/provisioners/clusterTest.py +90 -8
  65. toil/test/server/serverTest.py +2 -2
  66. toil/test/src/autoDeploymentTest.py +1 -1
  67. toil/test/src/dockerCheckTest.py +2 -1
  68. toil/test/src/environmentTest.py +125 -0
  69. toil/test/src/fileStoreTest.py +1 -1
  70. toil/test/src/jobDescriptionTest.py +18 -8
  71. toil/test/src/jobTest.py +1 -1
  72. toil/test/src/realtimeLoggerTest.py +4 -0
  73. toil/test/src/workerTest.py +52 -19
  74. toil/test/utils/toilDebugTest.py +61 -3
  75. toil/test/utils/utilsTest.py +20 -18
  76. toil/test/wdl/wdltoil_test.py +24 -71
  77. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  78. toil/toilState.py +68 -9
  79. toil/utils/toilDebugJob.py +153 -26
  80. toil/utils/toilLaunchCluster.py +12 -2
  81. toil/utils/toilRsyncCluster.py +7 -2
  82. toil/utils/toilSshCluster.py +7 -3
  83. toil/utils/toilStats.py +2 -1
  84. toil/utils/toilStatus.py +97 -51
  85. toil/version.py +10 -10
  86. toil/wdl/wdltoil.py +318 -51
  87. toil/worker.py +96 -69
  88. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  89. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
  90. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
  91. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  92. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  93. {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/deferred.py CHANGED
@@ -178,7 +178,7 @@ class DeferredFunctionManager:
178
178
 
179
179
  try:
180
180
  def defer(deferredFunction):
181
- # Just serialize defered functions one after the other.
181
+ # Just serialize deferred functions one after the other.
182
182
  # If serializing later ones fails, eariler ones will still be intact.
183
183
  # We trust dill to protect sufficiently against partial reads later.
184
184
  logger.debug("Deferring function %s" % repr(deferredFunction))
toil/fileStores/abstractFileStore.py CHANGED
@@ -39,7 +39,7 @@ import dill
39
39
 
40
40
  from toil.common import Toil, cacheDirName, getDirSizeRecursively
41
41
  from toil.fileStores import FileID
42
- from toil.job import Job, JobDescription
42
+ from toil.job import Job, JobDescription, DebugStoppingPointReached
43
43
  from toil.jobStores.abstractJobStore import AbstractJobStore
44
44
  from toil.lib.compatibility import deprecated
45
45
  from toil.lib.conversions import bytes2human
@@ -113,9 +113,7 @@ class AbstractFileStore(ABC):
113
113
  assert self.jobStore.config.workflowID is not None
114
114
  self.workflow_dir: str = Toil.getLocalWorkflowDir(self.jobStore.config.workflowID, self.jobStore.config.workDir)
115
115
  self.coordination_dir: str =Toil.get_local_workflow_coordination_dir(self.jobStore.config.workflowID, self.jobStore.config.workDir, self.jobStore.config.coordination_dir)
116
- self.jobName: str = (
117
- self.jobDesc.command.split()[1] if self.jobDesc.command else ""
118
- )
116
+ self.jobName: str = str(self.jobDesc)
119
117
  self.waitForPreviousCommit = waitForPreviousCommit
120
118
  self.logging_messages: List[Dict[str, Union[int, str]]] = []
121
119
  self.logging_user_streams: List[dict[str, str]] = []
@@ -191,17 +189,17 @@ class AbstractFileStore(ABC):
191
189
 
192
190
  :param job: The job instance of the toil job to run.
193
191
  """
194
- failed = True
195
192
  job_requested_disk = job.disk
196
193
  try:
197
194
  yield
198
195
  failed = False
199
- finally:
200
- # Do a finally instead of an except/raise because we don't want
201
- # to appear as "another exception occurred" in the stack trace.
202
- if failed:
196
+ except BaseException as e:
197
+ if isinstance(e, DebugStoppingPointReached):
198
+ self._dumpAccessLogs(job_type="Debugged", log_level=logging.INFO)
199
+ else:
203
200
  self._dumpAccessLogs()
204
-
201
+ raise
202
+ finally:
205
203
  # See how much disk space is used at the end of the job.
206
204
  # Not a real peak disk usage, but close enough to be useful for warning the user.
207
205
  self._job_disk_used = getDirSizeRecursively(self.localTempDir)
@@ -363,14 +361,16 @@ class AbstractFileStore(ABC):
363
361
 
364
362
  yield wrappedStream, fileID
365
363
 
366
- def _dumpAccessLogs(self) -> None:
364
+ def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WARNING) -> None:
367
365
  """
368
- When something goes wrong, log a report.
366
+ Log a report of the files accessed.
369
367
 
370
368
  Includes the files that were accessed while the file store was open.
369
+
370
+ :param job_type: Adjective to describe the job in the report.
371
371
  """
372
372
  if len(self._accessLog) > 0:
373
- logger.warning('Failed job accessed files:')
373
+ logger.log(log_level, '%s job accessed files:', job_type)
374
374
 
375
375
  for item in self._accessLog:
376
376
  # For each access record
@@ -379,14 +379,14 @@ class AbstractFileStore(ABC):
379
379
  file_id, dest_path = item
380
380
  if os.path.exists(dest_path):
381
381
  if os.path.islink(dest_path):
382
- logger.warning('Symlinked file \'%s\' to path \'%s\'', file_id, dest_path)
382
+ logger.log(log_level, 'Symlinked file \'%s\' to path \'%s\'', file_id, dest_path)
383
383
  else:
384
- logger.warning('Downloaded file \'%s\' to path \'%s\'', file_id, dest_path)
384
+ logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\'', file_id, dest_path)
385
385
  else:
386
- logger.warning('Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path)
386
+ logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path)
387
387
  else:
388
388
  # Otherwise dump without the name
389
- logger.warning('Streamed file \'%s\'', *item)
389
+ logger.log(log_level, 'Streamed file \'%s\'', *item)
390
390
 
391
391
  def logAccess(
392
392
  self, fileStoreID: Union[FileID, str], destination: Union[str, None] = None
toil/fileStores/cachingFileStore.py CHANGED
@@ -1036,7 +1036,7 @@ class CachingFileStore(AbstractFileStore):
1036
1036
  # Create a working directory for the job
1037
1037
  startingDir = os.getcwd()
1038
1038
  # Move self.localTempDir from the worker directory set up in __init__ to a per-job directory.
1039
- self.localTempDir = make_public_dir(in_directory=self.localTempDir)
1039
+ self.localTempDir = make_public_dir(self.localTempDir, suggested_name="job")
1040
1040
  # Check the status of all jobs on this node. If there are jobs that started and died before
1041
1041
  # cleaning up their presence from the database, clean them up ourselves.
1042
1042
  self._removeDeadJobs(self.coordination_dir, self.con)
@@ -1859,7 +1859,7 @@ class CachingFileStore(AbstractFileStore):
1859
1859
  logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc)
1860
1860
  # Make sure the deep copy isn't summoning ghosts of old job
1861
1861
  # versions. It must be as new or newer at this point.
1862
- self.jobDesc.check_new_version(state_to_commit)
1862
+ self.jobDesc.assert_is_not_newer_than(state_to_commit)
1863
1863
 
1864
1864
  # Bump the original's version since saving will do that too and we
1865
1865
  # don't want duplicate versions.
toil/fileStores/nonCachingFileStore.py CHANGED
@@ -102,7 +102,7 @@ class NonCachingFileStore(AbstractFileStore):
102
102
  @contextmanager
103
103
  def open(self, job: Job) -> Generator[None, None, None]:
104
104
  startingDir = os.getcwd()
105
- self.localTempDir: str = make_public_dir(in_directory=self.localTempDir)
105
+ self.localTempDir: str = make_public_dir(self.localTempDir, suggested_name="job")
106
106
  self._removeDeadJobs(self.coordination_dir)
107
107
  self.jobStateFile = self._createJobStateFile()
108
108
  self.check_for_state_corruption()
toil/job.py CHANGED
@@ -34,6 +34,7 @@ from typing import (TYPE_CHECKING,
34
34
  Iterator,
35
35
  List,
36
36
  Mapping,
37
+ NamedTuple,
37
38
  Optional,
38
39
  Sequence,
39
40
  Set,
@@ -68,8 +69,7 @@ from toil.deferred import DeferredFunction
68
69
  from toil.fileStores import FileID
69
70
  from toil.lib.conversions import bytes2human, human2bytes
70
71
  from toil.lib.expando import Expando
71
- from toil.lib.resources import (get_total_cpu_time,
72
- get_total_cpu_time_and_memory_usage)
72
+ from toil.lib.resources import ResourceMonitor
73
73
  from toil.resource import ModuleDescriptor
74
74
  from toil.statsAndLogging import set_logging_from_options
75
75
 
@@ -122,6 +122,23 @@ class ConflictingPredecessorError(Exception):
122
122
  f'The given job: "{predecessor.description}" is already a predecessor of job: "{successor.description}".'
123
123
  )
124
124
 
125
+ class DebugStoppingPointReached(BaseException):
126
+ """
127
+ Raised when a job reaches a point at which it has been instructed to stop for debugging.
128
+ """
129
+ pass
130
+
131
+ class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
132
+ """
133
+ Raised when a job stops because it was asked to download its files, and the files are downloaded.
134
+ """
135
+
136
+ def __init__(self, message, host_and_job_paths: Optional[List[Tuple[str, str]]] = None):
137
+ super().__init__(message)
138
+
139
+ # Save the host and user-code-visible paths of files, in case we're
140
+ # using a container and they are different.
141
+ self.host_and_job_paths = host_and_job_paths
125
142
 
126
143
  class TemporaryID:
127
144
  """
@@ -227,7 +244,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
227
244
  of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
228
245
  is a brand.
229
246
 
230
- :raises ValueError: if it gets somethign it can't parse
247
+ :raises ValueError: if it gets something it can't parse
231
248
  :raises TypeError: if it gets something it can't parse because it's the wrong type.
232
249
  """
233
250
  KINDS = {'gpu'}
@@ -711,13 +728,24 @@ class Requirer:
711
728
  parts = ['no requirements']
712
729
  return ', '.join(parts)
713
730
 
731
+ class JobBodyReference(NamedTuple):
732
+ """
733
+ Reference from a job description to its body.
734
+ """
735
+ file_store_id: str
736
+ """File ID (or special shared file name for the root job) of the job's body."""
737
+ module_string: str
738
+ """Stringified description of the module needed to load the body."""
739
+
714
740
  class JobDescription(Requirer):
715
741
  """
716
742
  Stores all the information that the Toil Leader ever needs to know about a Job.
717
-
718
- (requirements information, dependency information, commands to issue,
719
- etc.)
720
-
743
+
744
+ This includes:
745
+ * Resource requirements.
746
+ * Which jobs are children or follow-ons or predecessors of this job.
747
+ * A reference to the Job object in the job store.
748
+
721
749
  Can be obtained from an actual (i.e. executable) Job object, and can be
722
750
  used to obtain the Job object from the JobStore.
723
751
 
@@ -732,8 +760,7 @@ class JobDescription(Requirer):
732
760
  requirements: Mapping[str, Union[int, str, bool]],
733
761
  jobName: str,
734
762
  unitName: Optional[str] = "",
735
- displayName: Optional[str] = "",
736
- command: Optional[str] = None,
763
+ displayName: Optional[str] = "",
737
764
  local: Optional[bool] = None
738
765
  ) -> None:
739
766
  """
@@ -780,14 +807,10 @@ class JobDescription(Requirer):
780
807
  # ID of this job description in the JobStore.
781
808
  self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
782
809
 
783
- # Mostly fake, not-really-executable command string that encodes how to
784
- # find the Job body data that this JobDescription describes, and the
785
- # module(s) needed to unpickle it.
786
- #
787
- # Gets replaced with/rewritten into the real, executable command when
788
- # the leader passes the description off to the batch system to be
789
- # executed.
790
- self.command: Optional[str] = command
810
+ # Information that encodes how to find the Job body data that this
811
+ # JobDescription describes, and the module(s) needed to unpickle it.
812
+ # None if no body needs to run.
813
+ self._body: Optional[JobBodyReference] = None
791
814
 
792
815
  # Set scheduling properties that the leader read to think about scheduling.
793
816
 
@@ -882,7 +905,7 @@ class JobDescription(Requirer):
882
905
 
883
906
  For each job, produces a named tuple with its various names and its
884
907
  original job store ID. The jobs in the chain are in execution order.
885
-
908
+
886
909
  If the job hasn't run yet or it didn't chain, produces a one-item list.
887
910
  """
888
911
  if len(self._merged_job_names) == 0:
@@ -955,7 +978,47 @@ class JobDescription(Requirer):
955
978
  """
956
979
  return list(self.serviceTree.keys())
957
980
 
958
- def nextSuccessors(self) -> Set[str]:
981
+ def has_body(self) -> bool:
982
+ """
983
+ Returns True if we have a job body associated, and False otherwise.
984
+ """
985
+ return self._body is not None
986
+
987
+ def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
988
+ """
989
+ Attach a job body to this JobDescription.
990
+
991
+ Takes the file store ID that the body is stored at, and the required
992
+ user script module.
993
+
994
+ The file store ID can also be "firstJob" for the root job, stored as a
995
+ shared file instead.
996
+ """
997
+
998
+ self._body = JobBodyReference(file_store_id, user_script.toCommand())
999
+
1000
+ def detach_body(self) -> None:
1001
+ """
1002
+ Drop the body reference from a JobDescription.
1003
+ """
1004
+ self._body = None
1005
+
1006
+ def get_body(self) -> Tuple[str, ModuleDescriptor]:
1007
+ """
1008
+ Get the information needed to load the job body.
1009
+
1010
+ :returns: a file store ID (or magic shared file name "firstJob") and a
1011
+ user script module.
1012
+
1013
+ Fails if no body is attached; check has_body() first.
1014
+ """
1015
+
1016
+ if not self.has_body():
1017
+ raise RuntimeError(f"Cannot load the body of a job {self} without one")
1018
+
1019
+ return self._body.file_store_id, ModuleDescriptor.fromCommand(self._body.module_string)
1020
+
1021
+ def nextSuccessors(self) -> Optional[Set[str]]:
959
1022
  """
960
1023
  Return the collection of job IDs for the successors of this job that are ready to run.
961
1024
 
@@ -966,7 +1029,7 @@ class JobDescription(Requirer):
966
1029
  empty collection if there are more phases but they can't be entered yet
967
1030
  (e.g. because we are waiting for the job itself to run).
968
1031
  """
969
- if self.command is not None:
1032
+ if self.has_body():
970
1033
  # We ourselves need to run. So there's not nothing to do
971
1034
  # but no successors are ready.
972
1035
  return set()
@@ -1038,7 +1101,7 @@ class JobDescription(Requirer):
1038
1101
  :returns: True if the job appears to be done, and all related child,
1039
1102
  follow-on, and service jobs appear to be finished and removed.
1040
1103
  """
1041
- return self.command == None and next(self.successorsAndServiceHosts(), None) is None
1104
+ return not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
1042
1105
 
1043
1106
  def replace(self, other: "JobDescription") -> None:
1044
1107
  """
@@ -1067,7 +1130,7 @@ class JobDescription(Requirer):
1067
1130
  # When deleting, we need to delete the files for our old ID, and also
1068
1131
  # anything that needed to be deleted for the job we are replacing. And
1069
1132
  # we need to keep track of all the names of jobs involved for logging.
1070
-
1133
+
1071
1134
  # We need first the job we are merging into if nothing has merged into
1072
1135
  # it yet, then anything that already merged into it (including it),
1073
1136
  # then us if nothing has yet merged into us, then anything that merged
@@ -1080,7 +1143,7 @@ class JobDescription(Requirer):
1080
1143
  _merged_job_names.append(self.get_names())
1081
1144
  _merged_job_names += self._merged_job_names
1082
1145
  self._merged_job_names = _merged_job_names
1083
-
1146
+
1084
1147
  # Now steal its ID.
1085
1148
  self.jobStoreID = other.jobStoreID
1086
1149
 
@@ -1092,13 +1155,46 @@ class JobDescription(Requirer):
1092
1155
  self._job_version = other._job_version
1093
1156
  self._job_version_writer = os.getpid()
1094
1157
 
1095
- def check_new_version(self, other: "JobDescription") -> None:
1158
+ def assert_is_not_newer_than(self, other: "JobDescription") -> None:
1096
1159
  """
1097
- Make sure a prospective new version of the JobDescription is actually moving forward in time and not backward.
1160
+ Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
1098
1161
  """
1099
1162
  if other._job_version < self._job_version:
1100
1163
  raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}")
1101
1164
 
1165
+ def is_updated_by(self, other: "JobDescription") -> bool:
1166
+ """
1167
+ Return True if the passed JobDescription is a distinct, newer version of this one.
1168
+ """
1169
+
1170
+ if self.jobStoreID != other.jobStoreID:
1171
+ # Not the same job
1172
+ logger.warning(
1173
+ "Found ID %s in job %s from PID %s but expected ID %s to "
1174
+ "update job %s from PID %s",
1175
+ other.jobStoreID,
1176
+ other,
1177
+ other._job_version_writer,
1178
+ self.jobStoreID,
1179
+ self,
1180
+ self._job_version_writer
1181
+ )
1182
+ return False
1183
+
1184
+ if self._job_version >= other._job_version:
1185
+ # Version isn't strictly newer
1186
+ logger.debug(
1187
+ "Expected newer version in job %s from PID %s but it is no "
1188
+ "newer than job %s from PID %s",
1189
+ other,
1190
+ other._job_version_writer,
1191
+ self,
1192
+ self._job_version_writer
1193
+ )
1194
+ return False
1195
+
1196
+ return True
1197
+
1102
1198
  def addChild(self, childID: str) -> None:
1103
1199
  """Make the job with the given ID a child of the described job."""
1104
1200
  self.childIDs.add(childID)
@@ -1345,12 +1441,29 @@ class CheckpointJobDescription(JobDescription):
1345
1441
 
1346
1442
  # Set checkpoint-specific properties
1347
1443
 
1348
- # None, or a copy of the original command string used to reestablish the job after failure.
1349
- self.checkpoint = None
1444
+ # None, or a copy of the original self._body used to reestablish the job after failure.
1445
+ self.checkpoint: Optional[JobBodyReference] = None
1350
1446
 
1351
1447
  # Files that can not be deleted until the job and its successors have completed
1352
1448
  self.checkpointFilesToDelete = []
1353
1449
 
1450
+ def set_checkpoint(self) -> str:
1451
+ """
1452
+ Save a body checkpoint into self.checkpoint
1453
+ """
1454
+
1455
+ if not self.has_body():
1456
+ raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
1457
+ self.checkpoint = self._body
1458
+
1459
+ def restore_checkpoint(self) -> None:
1460
+ """
1461
+ Restore the body checkpoint from self.checkpoint
1462
+ """
1463
+ if self.checkpoint is None:
1464
+ raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
1465
+ self._body = self.checkpoint
1466
+
1354
1467
  def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]:
1355
1468
  """
1356
1469
  Restart a checkpoint after the total failure of jobs in its subtree.
@@ -1365,13 +1478,13 @@ class CheckpointJobDescription(JobDescription):
1365
1478
  raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.")
1366
1479
  successorsDeleted = []
1367
1480
  all_successors = list(self.allSuccessors())
1368
- if len(all_successors) > 0 or self.serviceTree or self.command is not None:
1369
- if self.command is not None:
1370
- if self.command != self.checkpoint:
1371
- raise RuntimeError("The command and checkpoint are not the same.")
1372
- logger.debug("Checkpoint job already has command set to run")
1481
+ if len(all_successors) > 0 or self.serviceTree or self.has_body():
1482
+ if self.has_body():
1483
+ if self._body != self.checkpoint:
1484
+ raise RuntimeError("The stored body reference and checkpoint are not the same.")
1485
+ logger.debug("Checkpoint job already has body set to run")
1373
1486
  else:
1374
- self.command = self.checkpoint
1487
+ self.restore_checkpoint()
1375
1488
 
1376
1489
  jobStore.update_job(self) # Update immediately to ensure that checkpoint
1377
1490
  # is made before deleting any remaining successors
@@ -1516,6 +1629,9 @@ class Job:
1516
1629
  self._defer = None
1517
1630
  self._tempDir = None
1518
1631
 
1632
+ # Holds flags set by set_debug_flag()
1633
+ self._debug_flags: Set[str] = set()
1634
+
1519
1635
  def __str__(self):
1520
1636
  """
1521
1637
  Produce a useful logging string to identify this Job and distinguish it
@@ -1526,6 +1642,19 @@ class Job:
1526
1642
  else:
1527
1643
  return 'Job(' + str(self.description) + ')'
1528
1644
 
1645
+ def check_initialized(self) -> None:
1646
+ """
1647
+ Ensure that Job.__init__() has been called by any subclass __init__().
1648
+
1649
+ This uses the fact that the self._description instance variable should always
1650
+ be set after __init__().
1651
+
1652
+ If __init__() has not been called, raise an error.
1653
+ """
1654
+ if not hasattr(self, "_description"):
1655
+ raise ValueError(f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
1656
+ f"have been called.")
1657
+
1529
1658
  @property
1530
1659
  def jobStoreID(self) -> Union[str, TemporaryID]:
1531
1660
  """Get the ID of this Job."""
@@ -1656,6 +1785,11 @@ class Job:
1656
1785
  """
1657
1786
  if not isinstance(childJob, Job):
1658
1787
  raise RuntimeError("The type of the child job is not a job.")
1788
+
1789
+ # Check that both jobs have been initialized
1790
+ self.check_initialized()
1791
+ childJob.check_initialized()
1792
+
1659
1793
  # Join the job graphs
1660
1794
  self._jobGraphsJoined(childJob)
1661
1795
  # Remember the child relationship
@@ -1683,6 +1817,11 @@ class Job:
1683
1817
  """
1684
1818
  if not isinstance(followOnJob, Job):
1685
1819
  raise RuntimeError("The type of the follow-on job is not a job.")
1820
+
1821
+ # Check that both jobs have been initialized
1822
+ self.check_initialized()
1823
+ followOnJob.check_initialized()
1824
+
1686
1825
  # Join the job graphs
1687
1826
  self._jobGraphsJoined(followOnJob)
1688
1827
  # Remember the follow-on relationship
@@ -2567,8 +2706,8 @@ class Job:
2567
2706
  # filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
2568
2707
  userScript = self.getUserScript().globalize()
2569
2708
 
2570
- # The command connects the body of the job to the JobDescription
2571
- self._description.command = ' '.join(('_toil', fileStoreID) + userScript.toCommand())
2709
+ # Connect the body of the job to the JobDescription
2710
+ self._description.attach_body(fileStoreID, userScript)
2572
2711
 
2573
2712
  def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None):
2574
2713
  """
@@ -2697,38 +2836,33 @@ class Job:
2697
2836
 
2698
2837
  @classmethod
2699
2838
  def loadJob(
2700
- cls, jobStore: "AbstractJobStore", jobDescription: JobDescription
2839
+ cls, job_store: "AbstractJobStore", job_description: JobDescription
2701
2840
  ) -> "Job":
2702
2841
  """
2703
2842
  Retrieves a :class:`toil.job.Job` instance from a JobStore
2704
2843
 
2705
- :param jobStore: The job store.
2706
- :param jobDescription: the JobDescription of the job to retrieve.
2844
+ :param job_store: The job store.
2845
+ :param job_description: the JobDescription of the job to retrieve.
2707
2846
  :returns: The job referenced by the JobDescription.
2708
2847
  """
2709
- # Grab the command that connects the description to the job body
2710
- command = jobDescription.command
2711
-
2712
- commandTokens = command.split()
2713
- if "_toil" != commandTokens[0]:
2714
- raise RuntimeError("An invalid command was passed into the job.")
2715
- userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
2716
- logger.debug('Loading user module %s.', userModule)
2717
- userModule = cls._loadUserModule(userModule)
2718
- pickleFile = commandTokens[1]
2848
+
2849
+ file_store_id, user_module_descriptor = job_description.get_body()
2850
+ logger.debug('Loading user module %s.', user_module_descriptor)
2851
+ user_module = cls._loadUserModule(user_module_descriptor)
2719
2852
 
2720
2853
  #Loads context manager using file stream
2721
- if pickleFile == "firstJob":
2722
- manager = jobStore.read_shared_file_stream(pickleFile)
2854
+ if file_store_id == "firstJob":
2855
+ # This one is actually a shared file name and not a file ID.
2856
+ manager = job_store.read_shared_file_stream(file_store_id)
2723
2857
  else:
2724
- manager = jobStore.read_file_stream(pickleFile)
2858
+ manager = job_store.read_file_stream(file_store_id)
2725
2859
 
2726
2860
  #Open and unpickle
2727
- with manager as fileHandle:
2861
+ with manager as file_handle:
2728
2862
 
2729
- job = cls._unpickle(userModule, fileHandle, requireInstanceOf=Job)
2863
+ job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
2730
2864
  # Fill in the current description
2731
- job._description = jobDescription
2865
+ job._description = job_description
2732
2866
 
2733
2867
  # Set up the registry again, so children and follow-ons can be added on the worker
2734
2868
  job._registry = {job.jobStoreID: job}
@@ -2771,11 +2905,16 @@ class Job:
2771
2905
  """
2772
2906
  if stats is not None:
2773
2907
  startTime = time.time()
2774
- startClock = get_total_cpu_time()
2908
+ startClock = ResourceMonitor.get_total_cpu_time()
2775
2909
  baseDir = os.getcwd()
2776
2910
 
2777
2911
  yield
2778
2912
 
2913
+ if "download_only" in self._debug_flags:
2914
+ # We should stop right away
2915
+ logger.debug("Job did not stop itself after downloading files; stopping.")
2916
+ raise DebugStoppingPointReached()
2917
+
2779
2918
  # If the job is not a checkpoint job, add the promise files to delete
2780
2919
  # to the list of jobStoreFileIDs to delete
2781
2920
  # TODO: why is Promise holding a global list here???
@@ -2795,7 +2934,7 @@ class Job:
2795
2934
  os.chdir(baseDir)
2796
2935
  # Finish up the stats
2797
2936
  if stats is not None:
2798
- totalCpuTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
2937
+ totalCpuTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
2799
2938
  stats.jobs.append(
2800
2939
  Expando(
2801
2940
  time=str(time.time() - startTime),
@@ -2817,7 +2956,7 @@ class Job:
2817
2956
  """
2818
2957
  Run the job, and serialise the next jobs.
2819
2958
 
2820
- It marks the job as completed (by clearing its command) and creates the
2959
+ It marks the job as completed (by clearing its body) and creates the
2821
2960
  successor relationships to new successors, but it doesn't actually
2822
2961
  commit those updates to the current job into the JobStore.
2823
2962
 
@@ -2852,9 +2991,9 @@ class Job:
2852
2991
  # Serialize the new Jobs defined by the run method to the jobStore
2853
2992
  self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)
2854
2993
 
2855
- # Clear out the command, because the job is done.
2856
- self.description.command = None
2857
-
2994
+ # Clear out the body, because the job is done.
2995
+ self.description.detach_body()
2996
+
2858
2997
  # That and the new child/follow-on relationships will need to be
2859
2998
  # recorded later by an update() of the JobDescription.
2860
2999
 
@@ -2864,6 +3003,35 @@ class Job:
2864
3003
  """
2865
3004
  return self._description.displayName
2866
3005
 
3006
+ def set_debug_flag(self, flag: str) -> None:
3007
+ """
3008
+ Enable the given debug option on the job.
3009
+ """
3010
+ self._debug_flags.add(flag)
3011
+
3012
+ def has_debug_flag(self, flag: str) -> bool:
3013
+ """
3014
+ Return true if the given debug flag is set.
3015
+ """
3016
+
3017
+ return flag in self._debug_flags
3018
+
3019
+ def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str]]] = None) -> None:
3020
+ """
3021
+ Function that subclasses can call when they have downloaded their input files.
3022
+
3023
+ Will abort the job if the "download_only" debug flag is set.
3024
+
3025
+ Can be hinted a list of file path pairs outside and inside the job
3026
+ container, in which case the container environment can be
3027
+ reconstructed.
3028
+ """
3029
+
3030
+ if self.has_debug_flag("download_only"):
3031
+ # Stop the worker!
3032
+ logger.info("Job has downloaded its files. Stopping.")
3033
+ # Send off the path mapping for the debugging wrapper.
3034
+ raise FilesDownloadedStoppingPointReached("Files downloaded", host_and_job_paths=host_and_job_paths)
2867
3035
 
2868
3036
  class JobException(Exception):
2869
3037
  """General job exception."""