toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/job.py CHANGED
@@ -34,6 +34,7 @@ from typing import (TYPE_CHECKING,
34
34
  Iterator,
35
35
  List,
36
36
  Mapping,
37
+ NamedTuple,
37
38
  Optional,
38
39
  Sequence,
39
40
  Set,
@@ -45,6 +46,7 @@ from typing import (TYPE_CHECKING,
45
46
 
46
47
  from configargparse import ArgParser
47
48
 
49
+ from toil.bus import Names
48
50
  from toil.lib.compatibility import deprecated
49
51
 
50
52
  if sys.version_info >= (3, 8):
@@ -67,8 +69,7 @@ from toil.deferred import DeferredFunction
67
69
  from toil.fileStores import FileID
68
70
  from toil.lib.conversions import bytes2human, human2bytes
69
71
  from toil.lib.expando import Expando
70
- from toil.lib.resources import (get_total_cpu_time,
71
- get_total_cpu_time_and_memory_usage)
72
+ from toil.lib.resources import ResourceMonitor
72
73
  from toil.resource import ModuleDescriptor
73
74
  from toil.statsAndLogging import set_logging_from_options
74
75
 
@@ -121,6 +122,23 @@ class ConflictingPredecessorError(Exception):
121
122
  f'The given job: "{predecessor.description}" is already a predecessor of job: "{successor.description}".'
122
123
  )
123
124
 
125
+ class DebugStoppingPointReached(BaseException):
126
+ """
127
+ Raised when a job reaches a point at which it has been instructed to stop for debugging.
128
+ """
129
+ pass
130
+
131
+ class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
132
+ """
133
+ Raised when a job stops because it was asked to download its files, and the files are downloaded.
134
+ """
135
+
136
+ def __init__(self, message, host_and_job_paths: Optional[List[Tuple[str, str]]] = None):
137
+ super().__init__(message)
138
+
139
+ # Save the host and user-code-visible paths of files, in case we're
140
+ # using a container and they are different.
141
+ self.host_and_job_paths = host_and_job_paths
124
142
 
125
143
  class TemporaryID:
126
144
  """
@@ -226,7 +244,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
226
244
  of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
227
245
  is a brand.
228
246
 
229
- :raises ValueError: if it gets somethign it can't parse
247
+ :raises ValueError: if it gets something it can't parse
230
248
  :raises TypeError: if it gets something it can't parse because it's the wrong type.
231
249
  """
232
250
  KINDS = {'gpu'}
@@ -710,14 +728,24 @@ class Requirer:
710
728
  parts = ['no requirements']
711
729
  return ', '.join(parts)
712
730
 
731
+ class JobBodyReference(NamedTuple):
732
+ """
733
+ Reference from a job description to its body.
734
+ """
735
+ file_store_id: str
736
+ """File ID (or special shared file name for the root job) of the job's body."""
737
+ module_string: str
738
+ """Stringified description of the module needed to load the body."""
713
739
 
714
740
  class JobDescription(Requirer):
715
741
  """
716
742
  Stores all the information that the Toil Leader ever needs to know about a Job.
717
-
718
- (requirements information, dependency information, commands to issue,
719
- etc.)
720
-
743
+
744
+ This includes:
745
+ * Resource requirements.
746
+ * Which jobs are children or follow-ons or predecessors of this job.
747
+ * A reference to the Job object in the job store.
748
+
721
749
  Can be obtained from an actual (i.e. executable) Job object, and can be
722
750
  used to obtain the Job object from the JobStore.
723
751
 
@@ -732,8 +760,7 @@ class JobDescription(Requirer):
732
760
  requirements: Mapping[str, Union[int, str, bool]],
733
761
  jobName: str,
734
762
  unitName: Optional[str] = "",
735
- displayName: Optional[str] = "",
736
- command: Optional[str] = None,
763
+ displayName: Optional[str] = "",
737
764
  local: Optional[bool] = None
738
765
  ) -> None:
739
766
  """
@@ -780,14 +807,10 @@ class JobDescription(Requirer):
780
807
  # ID of this job description in the JobStore.
781
808
  self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
782
809
 
783
- # Mostly fake, not-really-executable command string that encodes how to
784
- # find the Job body data that this JobDescription describes, and the
785
- # module(s) needed to unpickle it.
786
- #
787
- # Gets replaced with/rewritten into the real, executable command when
788
- # the leader passes the description off to the batch system to be
789
- # executed.
790
- self.command: Optional[str] = command
810
+ # Information that encodes how to find the Job body data that this
811
+ # JobDescription describes, and the module(s) needed to unpickle it.
812
+ # None if no body needs to run.
813
+ self._body: Optional[JobBodyReference] = None
791
814
 
792
815
  # Set scheduling properties that the leader read to think about scheduling.
793
816
 
@@ -814,11 +837,14 @@ class JobDescription(Requirer):
814
837
  # in the process of being committed.
815
838
  self.filesToDelete = []
816
839
 
817
- # Holds JobStore Job IDs of the jobs that have been chained into this
840
+ # Holds job names and IDs of the jobs that have been chained into this
818
841
  # job, and which should be deleted when this job finally is deleted
819
842
  # (but not before). The successor relationships with them will have
820
- # been cut, so we need to hold onto them somehow.
821
- self.merged_jobs = []
843
+ # been cut, so we need to hold onto them somehow. Includes each
844
+ # chained-in job with its original ID, and also this job's ID with its
845
+ # original names, or is empty if no chaining has happened.
846
+ # The first job in the chain comes first in the list.
847
+ self._merged_job_names: List[Names] = []
822
848
 
823
849
  # The number of direct predecessors of the job. Needs to be stored at
824
850
  # the JobDescription to support dynamically-created jobs with multiple
@@ -867,9 +893,26 @@ class JobDescription(Requirer):
867
893
  # And we log who made the version (by PID)
868
894
  self._job_version_writer = 0
869
895
 
870
- # Human-readable names of jobs that were run as part of this job's
871
- # invocation, starting with this job
872
- self.chainedJobs = []
896
+ def get_names(self) -> Names:
897
+ """
898
+ Get the names and ID of this job as a named tuple.
899
+ """
900
+ return Names(self.jobName, self.unitName, self.displayName, self.displayName, str(self.jobStoreID))
901
+
902
+ def get_chain(self) -> List[Names]:
903
+ """
904
+ Get all the jobs that executed in this job's chain, in order.
905
+
906
+ For each job, produces a named tuple with its various names and its
907
+ original job store ID. The jobs in the chain are in execution order.
908
+
909
+ If the job hasn't run yet or it didn't chain, produces a one-item list.
910
+ """
911
+ if len(self._merged_job_names) == 0:
912
+ # We haven't merged so we're just ourselves.
913
+ return [self.get_names()]
914
+ else:
915
+ return list(self._merged_job_names)
873
916
 
874
917
  def serviceHostIDsInBatches(self) -> Iterator[List[str]]:
875
918
  """
@@ -935,7 +978,47 @@ class JobDescription(Requirer):
935
978
  """
936
979
  return list(self.serviceTree.keys())
937
980
 
938
- def nextSuccessors(self) -> Set[str]:
981
+ def has_body(self) -> bool:
982
+ """
983
+ Returns True if we have a job body associated, and False otherwise.
984
+ """
985
+ return self._body is not None
986
+
987
+ def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
988
+ """
989
+ Attach a job body to this JobDescription.
990
+
991
+ Takes the file store ID that the body is stored at, and the required
992
+ user script module.
993
+
994
+ The file store ID can also be "firstJob" for the root job, stored as a
995
+ shared file instead.
996
+ """
997
+
998
+ self._body = JobBodyReference(file_store_id, user_script.toCommand())
999
+
1000
+ def detach_body(self) -> None:
1001
+ """
1002
+ Drop the body reference from a JobDescription.
1003
+ """
1004
+ self._body = None
1005
+
1006
+ def get_body(self) -> Tuple[str, ModuleDescriptor]:
1007
+ """
1008
+ Get the information needed to load the job body.
1009
+
1010
+ :returns: a file store ID (or magic shared file name "firstJob") and a
1011
+ user script module.
1012
+
1013
+ Fails if no body is attached; check has_body() first.
1014
+ """
1015
+
1016
+ if not self.has_body():
1017
+ raise RuntimeError(f"Cannot load the body of a job {self} without one")
1018
+
1019
+ return self._body.file_store_id, ModuleDescriptor.fromCommand(self._body.module_string)
1020
+
1021
+ def nextSuccessors(self) -> Optional[Set[str]]:
939
1022
  """
940
1023
  Return the collection of job IDs for the successors of this job that are ready to run.
941
1024
 
@@ -946,7 +1029,7 @@ class JobDescription(Requirer):
946
1029
  empty collection if there are more phases but they can't be entered yet
947
1030
  (e.g. because we are waiting for the job itself to run).
948
1031
  """
949
- if self.command is not None:
1032
+ if self.has_body():
950
1033
  # We ourselves need to run. So there's not nothing to do
951
1034
  # but no successors are ready.
952
1035
  return set()
@@ -1018,7 +1101,7 @@ class JobDescription(Requirer):
1018
1101
  :returns: True if the job appears to be done, and all related child,
1019
1102
  follow-on, and service jobs appear to be finished and removed.
1020
1103
  """
1021
- return self.command == None and next(self.successorsAndServiceHosts(), None) is None
1104
+ return not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
1022
1105
 
1023
1106
  def replace(self, other: "JobDescription") -> None:
1024
1107
  """
@@ -1045,8 +1128,23 @@ class JobDescription(Requirer):
1045
1128
  self.successor_phases = old_phases + self.successor_phases
1046
1129
 
1047
1130
  # When deleting, we need to delete the files for our old ID, and also
1048
- # anything that needed to be deleted for the job we are replacing.
1049
- self.merged_jobs += [self.jobStoreID] + other.merged_jobs
1131
+ # anything that needed to be deleted for the job we are replacing. And
1132
+ # we need to keep track of all the names of jobs involved for logging.
1133
+
1134
+ # We need first the job we are merging into if nothing has merged into
1135
+ # it yet, then anything that already merged into it (including it),
1136
+ # then us if nothing has yet merged into us, then anything that merged
1137
+ # into us (inclusing us)
1138
+ _merged_job_names = []
1139
+ if len(other._merged_job_names) == 0:
1140
+ _merged_job_names.append(other.get_names())
1141
+ _merged_job_names += other._merged_job_names
1142
+ if len(self._merged_job_names) == 0:
1143
+ _merged_job_names.append(self.get_names())
1144
+ _merged_job_names += self._merged_job_names
1145
+ self._merged_job_names = _merged_job_names
1146
+
1147
+ # Now steal its ID.
1050
1148
  self.jobStoreID = other.jobStoreID
1051
1149
 
1052
1150
  if len(other.filesToDelete) > 0:
@@ -1057,13 +1155,46 @@ class JobDescription(Requirer):
1057
1155
  self._job_version = other._job_version
1058
1156
  self._job_version_writer = os.getpid()
1059
1157
 
1060
- def check_new_version(self, other: "JobDescription") -> None:
1158
+ def assert_is_not_newer_than(self, other: "JobDescription") -> None:
1061
1159
  """
1062
- Make sure a prospective new version of the JobDescription is actually moving forward in time and not backward.
1160
+ Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
1063
1161
  """
1064
1162
  if other._job_version < self._job_version:
1065
1163
  raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}")
1066
1164
 
1165
+ def is_updated_by(self, other: "JobDescription") -> bool:
1166
+ """
1167
+ Return True if the passed JobDescription is a distinct, newer version of this one.
1168
+ """
1169
+
1170
+ if self.jobStoreID != other.jobStoreID:
1171
+ # Not the same job
1172
+ logger.warning(
1173
+ "Found ID %s in job %s from PID %s but expected ID %s to "
1174
+ "update job %s from PID %s",
1175
+ other.jobStoreID,
1176
+ other,
1177
+ other._job_version_writer,
1178
+ self.jobStoreID,
1179
+ self,
1180
+ self._job_version_writer
1181
+ )
1182
+ return False
1183
+
1184
+ if self._job_version >= other._job_version:
1185
+ # Version isn't strictly newer
1186
+ logger.debug(
1187
+ "Expected newer version in job %s from PID %s but it is no "
1188
+ "newer than job %s from PID %s",
1189
+ other,
1190
+ other._job_version_writer,
1191
+ self,
1192
+ self._job_version_writer
1193
+ )
1194
+ return False
1195
+
1196
+ return True
1197
+
1067
1198
  def addChild(self, childID: str) -> None:
1068
1199
  """Make the job with the given ID a child of the described job."""
1069
1200
  self.childIDs.add(childID)
@@ -1263,26 +1394,6 @@ class JobDescription(Requirer):
1263
1394
  self._job_version_writer = os.getpid()
1264
1395
  logger.debug("New job version: %s", self)
1265
1396
 
1266
- def get_job_kind(self) -> str:
1267
- """
1268
- Return an identifying string for the job.
1269
-
1270
- The result may contain spaces.
1271
-
1272
- Returns: Either the unit name, job name, or display name, which identifies
1273
- the kind of job it is to toil.
1274
- Otherwise "Unknown Job" in case no identifier is available
1275
- """
1276
- if self.unitName:
1277
- return self.unitName
1278
- elif self.jobName:
1279
- return self.jobName
1280
- elif self.displayName:
1281
- return self.displayName
1282
- else:
1283
- return "Unknown Job"
1284
-
1285
-
1286
1397
  class ServiceJobDescription(JobDescription):
1287
1398
  """A description of a job that hosts a service."""
1288
1399
 
@@ -1330,12 +1441,29 @@ class CheckpointJobDescription(JobDescription):
1330
1441
 
1331
1442
  # Set checkpoint-specific properties
1332
1443
 
1333
- # None, or a copy of the original command string used to reestablish the job after failure.
1334
- self.checkpoint = None
1444
+ # None, or a copy of the original self._body used to reestablish the job after failure.
1445
+ self.checkpoint: Optional[JobBodyReference] = None
1335
1446
 
1336
1447
  # Files that can not be deleted until the job and its successors have completed
1337
1448
  self.checkpointFilesToDelete = []
1338
1449
 
1450
+ def set_checkpoint(self) -> str:
1451
+ """
1452
+ Save a body checkpoint into self.checkpoint
1453
+ """
1454
+
1455
+ if not self.has_body():
1456
+ raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
1457
+ self.checkpoint = self._body
1458
+
1459
+ def restore_checkpoint(self) -> None:
1460
+ """
1461
+ Restore the body checkpoint from self.checkpoint
1462
+ """
1463
+ if self.checkpoint is None:
1464
+ raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
1465
+ self._body = self.checkpoint
1466
+
1339
1467
  def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]:
1340
1468
  """
1341
1469
  Restart a checkpoint after the total failure of jobs in its subtree.
@@ -1350,13 +1478,13 @@ class CheckpointJobDescription(JobDescription):
1350
1478
  raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.")
1351
1479
  successorsDeleted = []
1352
1480
  all_successors = list(self.allSuccessors())
1353
- if len(all_successors) > 0 or self.serviceTree or self.command is not None:
1354
- if self.command is not None:
1355
- if self.command != self.checkpoint:
1356
- raise RuntimeError("The command and checkpoint are not the same.")
1357
- logger.debug("Checkpoint job already has command set to run")
1481
+ if len(all_successors) > 0 or self.serviceTree or self.has_body():
1482
+ if self.has_body():
1483
+ if self._body != self.checkpoint:
1484
+ raise RuntimeError("The stored body reference and checkpoint are not the same.")
1485
+ logger.debug("Checkpoint job already has body set to run")
1358
1486
  else:
1359
- self.command = self.checkpoint
1487
+ self.restore_checkpoint()
1360
1488
 
1361
1489
  jobStore.update_job(self) # Update immediately to ensure that checkpoint
1362
1490
  # is made before deleting any remaining successors
@@ -1501,6 +1629,9 @@ class Job:
1501
1629
  self._defer = None
1502
1630
  self._tempDir = None
1503
1631
 
1632
+ # Holds flags set by set_debug_flag()
1633
+ self._debug_flags: Set[str] = set()
1634
+
1504
1635
  def __str__(self):
1505
1636
  """
1506
1637
  Produce a useful logging string to identify this Job and distinguish it
@@ -1511,6 +1642,19 @@ class Job:
1511
1642
  else:
1512
1643
  return 'Job(' + str(self.description) + ')'
1513
1644
 
1645
+ def check_initialized(self) -> None:
1646
+ """
1647
+ Ensure that Job.__init__() has been called by any subclass __init__().
1648
+
1649
+ This uses the fact that the self._description instance variable should always
1650
+ be set after __init__().
1651
+
1652
+ If __init__() has not been called, raise an error.
1653
+ """
1654
+ if not hasattr(self, "_description"):
1655
+ raise ValueError(f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
1656
+ f"have been called.")
1657
+
1514
1658
  @property
1515
1659
  def jobStoreID(self) -> Union[str, TemporaryID]:
1516
1660
  """Get the ID of this Job."""
@@ -1641,6 +1785,11 @@ class Job:
1641
1785
  """
1642
1786
  if not isinstance(childJob, Job):
1643
1787
  raise RuntimeError("The type of the child job is not a job.")
1788
+
1789
+ # Check that both jobs have been initialized
1790
+ self.check_initialized()
1791
+ childJob.check_initialized()
1792
+
1644
1793
  # Join the job graphs
1645
1794
  self._jobGraphsJoined(childJob)
1646
1795
  # Remember the child relationship
@@ -1668,6 +1817,11 @@ class Job:
1668
1817
  """
1669
1818
  if not isinstance(followOnJob, Job):
1670
1819
  raise RuntimeError("The type of the follow-on job is not a job.")
1820
+
1821
+ # Check that both jobs have been initialized
1822
+ self.check_initialized()
1823
+ followOnJob.check_initialized()
1824
+
1671
1825
  # Join the job graphs
1672
1826
  self._jobGraphsJoined(followOnJob)
1673
1827
  # Remember the follow-on relationship
@@ -2552,8 +2706,8 @@ class Job:
2552
2706
  # filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
2553
2707
  userScript = self.getUserScript().globalize()
2554
2708
 
2555
- # The command connects the body of the job to the JobDescription
2556
- self._description.command = ' '.join(('_toil', fileStoreID) + userScript.toCommand())
2709
+ # Connect the body of the job to the JobDescription
2710
+ self._description.attach_body(fileStoreID, userScript)
2557
2711
 
2558
2712
  def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None):
2559
2713
  """
@@ -2682,38 +2836,33 @@ class Job:
2682
2836
 
2683
2837
  @classmethod
2684
2838
  def loadJob(
2685
- cls, jobStore: "AbstractJobStore", jobDescription: JobDescription
2839
+ cls, job_store: "AbstractJobStore", job_description: JobDescription
2686
2840
  ) -> "Job":
2687
2841
  """
2688
2842
  Retrieves a :class:`toil.job.Job` instance from a JobStore
2689
2843
 
2690
- :param jobStore: The job store.
2691
- :param jobDescription: the JobDescription of the job to retrieve.
2844
+ :param job_store: The job store.
2845
+ :param job_description: the JobDescription of the job to retrieve.
2692
2846
  :returns: The job referenced by the JobDescription.
2693
2847
  """
2694
- # Grab the command that connects the description to the job body
2695
- command = jobDescription.command
2696
-
2697
- commandTokens = command.split()
2698
- if "_toil" != commandTokens[0]:
2699
- raise RuntimeError("An invalid command was passed into the job.")
2700
- userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
2701
- logger.debug('Loading user module %s.', userModule)
2702
- userModule = cls._loadUserModule(userModule)
2703
- pickleFile = commandTokens[1]
2848
+
2849
+ file_store_id, user_module_descriptor = job_description.get_body()
2850
+ logger.debug('Loading user module %s.', user_module_descriptor)
2851
+ user_module = cls._loadUserModule(user_module_descriptor)
2704
2852
 
2705
2853
  #Loads context manager using file stream
2706
- if pickleFile == "firstJob":
2707
- manager = jobStore.read_shared_file_stream(pickleFile)
2854
+ if file_store_id == "firstJob":
2855
+ # This one is actually a shared file name and not a file ID.
2856
+ manager = job_store.read_shared_file_stream(file_store_id)
2708
2857
  else:
2709
- manager = jobStore.read_file_stream(pickleFile)
2858
+ manager = job_store.read_file_stream(file_store_id)
2710
2859
 
2711
2860
  #Open and unpickle
2712
- with manager as fileHandle:
2861
+ with manager as file_handle:
2713
2862
 
2714
- job = cls._unpickle(userModule, fileHandle, requireInstanceOf=Job)
2863
+ job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
2715
2864
  # Fill in the current description
2716
- job._description = jobDescription
2865
+ job._description = job_description
2717
2866
 
2718
2867
  # Set up the registry again, so children and follow-ons can be added on the worker
2719
2868
  job._registry = {job.jobStoreID: job}
@@ -2756,11 +2905,16 @@ class Job:
2756
2905
  """
2757
2906
  if stats is not None:
2758
2907
  startTime = time.time()
2759
- startClock = get_total_cpu_time()
2908
+ startClock = ResourceMonitor.get_total_cpu_time()
2760
2909
  baseDir = os.getcwd()
2761
2910
 
2762
2911
  yield
2763
2912
 
2913
+ if "download_only" in self._debug_flags:
2914
+ # We should stop right away
2915
+ logger.debug("Job did not stop itself after downloading files; stopping.")
2916
+ raise DebugStoppingPointReached()
2917
+
2764
2918
  # If the job is not a checkpoint job, add the promise files to delete
2765
2919
  # to the list of jobStoreFileIDs to delete
2766
2920
  # TODO: why is Promise holding a global list here???
@@ -2780,14 +2934,15 @@ class Job:
2780
2934
  os.chdir(baseDir)
2781
2935
  # Finish up the stats
2782
2936
  if stats is not None:
2783
- totalCpuTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
2937
+ totalCpuTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
2784
2938
  stats.jobs.append(
2785
2939
  Expando(
2786
2940
  time=str(time.time() - startTime),
2787
2941
  clock=str(totalCpuTime - startClock),
2788
2942
  class_name=self._jobName(),
2789
2943
  memory=str(totalMemoryUsage),
2790
- requested_cores=str(self.cores)
2944
+ requested_cores=str(self.cores),
2945
+ disk=str(fileStore.get_disk_usage())
2791
2946
  )
2792
2947
  )
2793
2948
 
@@ -2801,7 +2956,7 @@ class Job:
2801
2956
  """
2802
2957
  Run the job, and serialise the next jobs.
2803
2958
 
2804
- It marks the job as completed (by clearing its command) and creates the
2959
+ It marks the job as completed (by clearing its body) and creates the
2805
2960
  successor relationships to new successors, but it doesn't actually
2806
2961
  commit those updates to the current job into the JobStore.
2807
2962
 
@@ -2836,9 +2991,9 @@ class Job:
2836
2991
  # Serialize the new Jobs defined by the run method to the jobStore
2837
2992
  self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)
2838
2993
 
2839
- # Clear out the command, because the job is done.
2840
- self.description.command = None
2841
-
2994
+ # Clear out the body, because the job is done.
2995
+ self.description.detach_body()
2996
+
2842
2997
  # That and the new child/follow-on relationships will need to be
2843
2998
  # recorded later by an update() of the JobDescription.
2844
2999
 
@@ -2848,6 +3003,35 @@ class Job:
2848
3003
  """
2849
3004
  return self._description.displayName
2850
3005
 
3006
+ def set_debug_flag(self, flag: str) -> None:
3007
+ """
3008
+ Enable the given debug option on the job.
3009
+ """
3010
+ self._debug_flags.add(flag)
3011
+
3012
+ def has_debug_flag(self, flag: str) -> bool:
3013
+ """
3014
+ Return true if the given debug flag is set.
3015
+ """
3016
+
3017
+ return flag in self._debug_flags
3018
+
3019
+ def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str]]] = None) -> None:
3020
+ """
3021
+ Function that subclasses can call when they have downloaded their input files.
3022
+
3023
+ Will abort the job if the "download_only" debug flag is set.
3024
+
3025
+ Can be hinted a list of file path pairs outside and inside the job
3026
+ container, in which case the container environment can be
3027
+ reconstructed.
3028
+ """
3029
+
3030
+ if self.has_debug_flag("download_only"):
3031
+ # Stop the worker!
3032
+ logger.info("Job has downloaded its files. Stopping.")
3033
+ # Send off the path mapping for the debugging wrapper.
3034
+ raise FilesDownloadedStoppingPointReached("Files downloaded", host_and_job_paths=host_and_job_paths)
2851
3035
 
2852
3036
  class JobException(Exception):
2853
3037
  """General job exception."""