toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/job.py CHANGED
```diff
@@ -34,6 +34,7 @@ from typing import (TYPE_CHECKING,
                     Iterator,
                     List,
                     Mapping,
+                    NamedTuple,
                     Optional,
                     Sequence,
                     Set,
@@ -45,6 +46,7 @@ from typing import (TYPE_CHECKING,
 
 from configargparse import ArgParser
 
+from toil.bus import Names
 from toil.lib.compatibility import deprecated
 
 if sys.version_info >= (3, 8):
@@ -67,8 +69,7 @@ from toil.deferred import DeferredFunction
 from toil.fileStores import FileID
 from toil.lib.conversions import bytes2human, human2bytes
 from toil.lib.expando import Expando
-from toil.lib.resources import (get_total_cpu_time,
-                                get_total_cpu_time_and_memory_usage)
+from toil.lib.resources import ResourceMonitor
 from toil.resource import ModuleDescriptor
 from toil.statsAndLogging import set_logging_from_options
 
```
```diff
@@ -121,6 +122,23 @@ class ConflictingPredecessorError(Exception):
             f'The given job: "{predecessor.description}" is already a predecessor of job: "{successor.description}".'
         )
 
+class DebugStoppingPointReached(BaseException):
+    """
+    Raised when a job reaches a point at which it has been instructed to stop for debugging.
+    """
+    pass
+
+class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
+    """
+    Raised when a job stops because it was asked to download its files, and the files are downloaded.
+    """
+
+    def __init__(self, message, host_and_job_paths: Optional[List[Tuple[str, str]]] = None):
+        super().__init__(message)
+
+        # Save the host and user-code-visible paths of files, in case we're
+        # using a container and they are different.
+        self.host_and_job_paths = host_and_job_paths
 
 class TemporaryID:
     """
```
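`DebugStoppingPointReached` deliberately derives from `BaseException` rather than `Exception`, so broad `except Exception` handlers in user job code cannot swallow the debugger's stop signal. A minimal sketch of that behavior (pure Python semantics; `user_code` is a hypothetical job body):

```python
class DebugStoppingPointReached(BaseException):
    """Signal that a job should stop for debugging."""

def user_code():
    # Hypothetical user job body that defensively catches Exception.
    try:
        raise DebugStoppingPointReached()
    except Exception:
        # Never taken: BaseException subclasses bypass Exception handlers,
        # so the stop signal propagates out to the worker.
        pass

try:
    user_code()
except BaseException as e:
    print(f"worker saw stop signal: {type(e).__name__}")
```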
```diff
@@ -226,7 +244,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
     of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
     is a brand.
 
-    :raises ValueError: if it gets
+    :raises ValueError: if it gets something it can't parse
     :raises TypeError: if it gets something it can't parse because it's the wrong type.
     """
     KINDS = {'gpu'}
```
```diff
@@ -710,14 +728,24 @@ class Requirer:
         parts = ['no requirements']
         return ', '.join(parts)
 
+class JobBodyReference(NamedTuple):
+    """
+    Reference from a job description to its body.
+    """
+    file_store_id: str
+    """File ID (or special shared file name for the root job) of the job's body."""
+    module_string: str
+    """Stringified description of the module needed to load the body."""
 
 class JobDescription(Requirer):
     """
     Stores all the information that the Toil Leader ever needs to know about a Job.
-
-
-
-
+
+    This includes:
+     * Resource requirements.
+     * Which jobs are children or follow-ons or predecessors of this job.
+     * A reference to the Job object in the job store.
+
     Can be obtained from an actual (i.e. executable) Job object, and can be
     used to obtain the Job object from the JobStore.
 
```
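`JobBodyReference` is a plain `typing.NamedTuple`, so it is immutable and pickles along with the rest of the `JobDescription`. A small illustrative sketch (the class is restated here; the field values are made up):

```python
from typing import NamedTuple

class JobBodyReference(NamedTuple):
    """Reference from a job description to its body."""
    file_store_id: str
    module_string: str

# Hypothetical values; real ones come from the job store and a ModuleDescriptor.
ref = JobBodyReference("files/no-job/file-abc123", "my_workflow_module")
print(ref.file_store_id)    # fields are accessible by name
fs_id, module_string = ref  # or by tuple unpacking
```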
```diff
@@ -732,8 +760,7 @@ class JobDescription(Requirer):
         requirements: Mapping[str, Union[int, str, bool]],
         jobName: str,
         unitName: Optional[str] = "",
-        displayName: Optional[str] = "",
-        command: Optional[str] = None,
+        displayName: Optional[str] = "",
         local: Optional[bool] = None
     ) -> None:
         """
@@ -780,14 +807,10 @@ class JobDescription(Requirer):
         # ID of this job description in the JobStore.
         self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
 
-        #
-        #
-        #
-
-        # Gets replaced with/rewritten into the real, executable command when
-        # the leader passes the description off to the batch system to be
-        # executed.
-        self.command: Optional[str] = command
+        # Information that encodes how to find the Job body data that this
+        # JobDescription describes, and the module(s) needed to unpickle it.
+        # None if no body needs to run.
+        self._body: Optional[JobBodyReference] = None
 
         # Set scheduling properties that the leader read to think about scheduling.
 
```
```diff
@@ -814,11 +837,14 @@ class JobDescription(Requirer):
         # in the process of being committed.
         self.filesToDelete = []
 
-        # Holds
+        # Holds job names and IDs of the jobs that have been chained into this
         # job, and which should be deleted when this job finally is deleted
         # (but not before). The successor relationships with them will have
-        # been cut, so we need to hold onto them somehow.
-
+        # been cut, so we need to hold onto them somehow. Includes each
+        # chained-in job with its original ID, and also this job's ID with its
+        # original names, or is empty if no chaining has happened.
+        # The first job in the chain comes first in the list.
+        self._merged_job_names: List[Names] = []
 
         # The number of direct predecessors of the job. Needs to be stored at
         # the JobDescription to support dynamically-created jobs with multiple
```
```diff
@@ -867,9 +893,26 @@ class JobDescription(Requirer):
         # And we log who made the version (by PID)
         self._job_version_writer = 0
 
-
-
-
+    def get_names(self) -> Names:
+        """
+        Get the names and ID of this job as a named tuple.
+        """
+        return Names(self.jobName, self.unitName, self.displayName, self.displayName, str(self.jobStoreID))
+
+    def get_chain(self) -> List[Names]:
+        """
+        Get all the jobs that executed in this job's chain, in order.
+
+        For each job, produces a named tuple with its various names and its
+        original job store ID. The jobs in the chain are in execution order.
+
+        If the job hasn't run yet or it didn't chain, produces a one-item list.
+        """
+        if len(self._merged_job_names) == 0:
+            # We haven't merged so we're just ourselves.
+            return [self.get_names()]
+        else:
+            return list(self._merged_job_names)
 
     def serviceHostIDsInBatches(self) -> Iterator[List[str]]:
         """
```
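`get_names()` and `get_chain()` give tooling a stable way to report every job identity that took part in one chained execution. A hedged sketch of how a consumer might use them (`desc` is assumed to be a `JobDescription` loaded from the job store):

```python
# Assumes `desc` is a JobDescription loaded from the job store.
for names in desc.get_chain():
    # Each entry is a toil.bus.Names named tuple holding the job's name
    # fields and its original job store ID; the first entry ran first.
    print(tuple(names))
```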
```diff
@@ -935,7 +978,47 @@ class JobDescription(Requirer):
         """
         return list(self.serviceTree.keys())
 
-    def
+    def has_body(self) -> bool:
+        """
+        Returns True if we have a job body associated, and False otherwise.
+        """
+        return self._body is not None
+
+    def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
+        """
+        Attach a job body to this JobDescription.
+
+        Takes the file store ID that the body is stored at, and the required
+        user script module.
+
+        The file store ID can also be "firstJob" for the root job, stored as a
+        shared file instead.
+        """
+
+        self._body = JobBodyReference(file_store_id, user_script.toCommand())
+
+    def detach_body(self) -> None:
+        """
+        Drop the body reference from a JobDescription.
+        """
+        self._body = None
+
+    def get_body(self) -> Tuple[str, ModuleDescriptor]:
+        """
+        Get the information needed to load the job body.
+
+        :returns: a file store ID (or magic shared file name "firstJob") and a
+            user script module.
+
+        Fails if no body is attached; check has_body() first.
+        """
+
+        if not self.has_body():
+            raise RuntimeError(f"Cannot load the body of a job {self} without one")
+
+        return self._body.file_store_id, ModuleDescriptor.fromCommand(self._body.module_string)
+
+    def nextSuccessors(self) -> Optional[Set[str]]:
         """
         Return the collection of job IDs for the successors of this job that are ready to run.
 
```
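Together, `has_body()`, `attach_body()`, `get_body()`, and `detach_body()` replace the old free-form `self.command` string with a structured body lifecycle. A rough sketch of the round trip (hedged: `desc` and `user_script` stand in for a real `JobDescription` and `ModuleDescriptor`):

```python
# Hypothetical stand-ins: `desc` is a JobDescription, `user_script` is the
# ModuleDescriptor for the user's workflow module.
desc.attach_body("file-id-123", user_script)  # record where the pickled body lives

if desc.has_body():
    file_store_id, module_descriptor = desc.get_body()
    # ... unpickle and run the body from file_store_id here ...
    desc.detach_body()  # the body has been consumed

assert not desc.has_body()
```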
```diff
@@ -946,7 +1029,7 @@ class JobDescription(Requirer):
         empty collection if there are more phases but they can't be entered yet
         (e.g. because we are waiting for the job itself to run).
         """
-        if self.
+        if self.has_body():
             # We ourselves need to run. So there's not nothing to do
             # but no successors are ready.
             return set()
@@ -1018,7 +1101,7 @@ class JobDescription(Requirer):
         :returns: True if the job appears to be done, and all related child,
             follow-on, and service jobs appear to be finished and removed.
         """
-        return self.
+        return not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
 
     def replace(self, other: "JobDescription") -> None:
         """
```
```diff
@@ -1045,8 +1128,23 @@ class JobDescription(Requirer):
         self.successor_phases = old_phases + self.successor_phases
 
         # When deleting, we need to delete the files for our old ID, and also
-        # anything that needed to be deleted for the job we are replacing.
-
+        # anything that needed to be deleted for the job we are replacing. And
+        # we need to keep track of all the names of jobs involved for logging.
+
+        # We need first the job we are merging into if nothing has merged into
+        # it yet, then anything that already merged into it (including it),
+        # then us if nothing has yet merged into us, then anything that merged
+        # into us (inclusing us)
+        _merged_job_names = []
+        if len(other._merged_job_names) == 0:
+            _merged_job_names.append(other.get_names())
+        _merged_job_names += other._merged_job_names
+        if len(self._merged_job_names) == 0:
+            _merged_job_names.append(self.get_names())
+        _merged_job_names += self._merged_job_names
+        self._merged_job_names = _merged_job_names
+
+        # Now steal its ID.
         self.jobStoreID = other.jobStoreID
 
         if len(other.filesToDelete) > 0:
```
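The merge ordering in `replace()` is easiest to see with plain lists. A self-contained toy model (illustrative only; the real code stores `Names` tuples, not strings):

```python
from typing import List

def merge_chains(other_chain: List[str], other_name: str,
                 self_chain: List[str], self_name: str) -> List[str]:
    # Mirrors JobDescription.replace(): the job being merged into comes
    # first (with itself prepended if it never chained), then this job's
    # side in the same fashion.
    merged: List[str] = []
    if not other_chain:
        merged.append(other_name)
    merged += other_chain
    if not self_chain:
        merged.append(self_name)
    merged += self_chain
    return merged

# Two fresh jobs: the replaced job "A" leads the chain, then "B".
print(merge_chains([], "A", [], "B"))  # ['A', 'B']
```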
```diff
@@ -1057,13 +1155,46 @@ class JobDescription(Requirer):
         self._job_version = other._job_version
         self._job_version_writer = os.getpid()
 
-    def
+    def assert_is_not_newer_than(self, other: "JobDescription") -> None:
         """
-        Make sure a prospective new version of the JobDescription
+        Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
         """
         if other._job_version < self._job_version:
             raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}")
 
+    def is_updated_by(self, other: "JobDescription") -> bool:
+        """
+        Return True if the passed JobDescription is a distinct, newer version of this one.
+        """
+
+        if self.jobStoreID != other.jobStoreID:
+            # Not the same job
+            logger.warning(
+                "Found ID %s in job %s from PID %s but expected ID %s to "
+                "update job %s from PID %s",
+                other.jobStoreID,
+                other,
+                other._job_version_writer,
+                self.jobStoreID,
+                self,
+                self._job_version_writer
+            )
+            return False
+
+        if self._job_version >= other._job_version:
+            # Version isn't strictly newer
+            logger.debug(
+                "Expected newer version in job %s from PID %s but it is no "
+                "newer than job %s from PID %s",
+                other,
+                other._job_version_writer,
+                self,
+                self._job_version_writer
+            )
+            return False
+
+        return True
+
     def addChild(self, childID: str) -> None:
         """Make the job with the given ID a child of the described job."""
         self.childIDs.add(childID)
```
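`is_updated_by()` is the non-throwing counterpart to `assert_is_not_newer_than()`: code that re-reads a job from the job store can use it to decide whether to accept the re-read copy. A hedged sketch of that pattern (`cached` and `fresh` are assumed to be two reads of the same job):

```python
# `cached` is the in-memory JobDescription; `fresh` was just re-read from
# the job store.
if cached.is_updated_by(fresh):
    cached = fresh  # strictly newer version of the same job: accept it
# otherwise the read was stale or for a different job, and is ignored
```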
```diff
@@ -1263,26 +1394,6 @@ class JobDescription(Requirer):
         self._job_version_writer = os.getpid()
         logger.debug("New job version: %s", self)
 
-    def get_job_kind(self) -> str:
-        """
-        Return an identifying string for the job.
-
-        The result may contain spaces.
-
-        Returns: Either the unit name, job name, or display name, which identifies
-            the kind of job it is to toil.
-            Otherwise "Unknown Job" in case no identifier is available
-        """
-        if self.unitName:
-            return self.unitName
-        elif self.jobName:
-            return self.jobName
-        elif self.displayName:
-            return self.displayName
-        else:
-            return "Unknown Job"
-
-
 class ServiceJobDescription(JobDescription):
     """A description of a job that hosts a service."""
 
```
```diff
@@ -1330,12 +1441,29 @@ class CheckpointJobDescription(JobDescription):
 
         # Set checkpoint-specific properties
 
-        # None, or a copy of the original
-        self.checkpoint = None
+        # None, or a copy of the original self._body used to reestablish the job after failure.
+        self.checkpoint: Optional[JobBodyReference] = None
 
         # Files that can not be deleted until the job and its successors have completed
         self.checkpointFilesToDelete = []
 
+    def set_checkpoint(self) -> str:
+        """
+        Save a body checkpoint into self.checkpoint
+        """
+
+        if not self.has_body():
+            raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
+        self.checkpoint = self._body
+
+    def restore_checkpoint(self) -> None:
+        """
+        Restore the body checkpoint from self.checkpoint
+        """
+        if self.checkpoint is None:
+            raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
+        self._body = self.checkpoint
+
     def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]:
         """
         Restart a checkpoint after the total failure of jobs in its subtree.
@@ -1350,13 +1478,13 @@ class CheckpointJobDescription(JobDescription):
             raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.")
         successorsDeleted = []
         all_successors = list(self.allSuccessors())
-        if len(all_successors) > 0 or self.serviceTree or self.
-            if self.
-                if self.
-                    raise RuntimeError("The
-                logger.debug("Checkpoint job already has
+        if len(all_successors) > 0 or self.serviceTree or self.has_body():
+            if self.has_body():
+                if self._body != self.checkpoint:
+                    raise RuntimeError("The stored body reference and checkpoint are not the same.")
+                logger.debug("Checkpoint job already has body set to run")
             else:
-                self.
+                self.restore_checkpoint()
 
             jobStore.update_job(self)  # Update immediately to ensure that checkpoint
             # is made before deleting any remaining successors
```
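`set_checkpoint()` and `restore_checkpoint()` just copy the body reference into and out of a side slot, which is what lets a checkpoint job re-run its body after its whole subtree fails. A hedged sketch (`cp_desc` stands in for a real `CheckpointJobDescription` that already has a body attached):

```python
# Hypothetical: cp_desc is a CheckpointJobDescription with a body attached.
cp_desc.set_checkpoint()      # snapshot the current body reference

cp_desc.detach_body()         # the job runs and its body is cleared

cp_desc.restore_checkpoint()  # on subtree failure, reinstate the body
assert cp_desc.has_body()
```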
```diff
@@ -1501,6 +1629,9 @@ class Job:
         self._defer = None
         self._tempDir = None
 
+        # Holds flags set by set_debug_flag()
+        self._debug_flags: Set[str] = set()
+
     def __str__(self):
         """
         Produce a useful logging string to identify this Job and distinguish it
```
```diff
@@ -1511,6 +1642,19 @@ class Job:
         else:
             return 'Job(' + str(self.description) + ')'
 
+    def check_initialized(self) -> None:
+        """
+        Ensure that Job.__init__() has been called by any subclass __init__().
+
+        This uses the fact that the self._description instance variable should always
+        be set after __init__().
+
+        If __init__() has not been called, raise an error.
+        """
+        if not hasattr(self, "_description"):
+            raise ValueError(f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
+                             f"have been called.")
+
     @property
     def jobStoreID(self) -> Union[str, TemporaryID]:
         """Get the ID of this Job."""
```
```diff
@@ -1641,6 +1785,11 @@ class Job:
         """
         if not isinstance(childJob, Job):
             raise RuntimeError("The type of the child job is not a job.")
+
+        # Check that both jobs have been initialized
+        self.check_initialized()
+        childJob.check_initialized()
+
         # Join the job graphs
         self._jobGraphsJoined(childJob)
         # Remember the child relationship
@@ -1668,6 +1817,11 @@ class Job:
         """
         if not isinstance(followOnJob, Job):
             raise RuntimeError("The type of the follow-on job is not a job.")
+
+        # Check that both jobs have been initialized
+        self.check_initialized()
+        followOnJob.check_initialized()
+
         # Join the job graphs
         self._jobGraphsJoined(followOnJob)
         # Remember the follow-on relationship
```
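Because `addChild()` and `addFollowOn()` now call `check_initialized()` on both ends of the new edge, forgetting `super().__init__()` in a `Job` subclass fails fast with a `ValueError` instead of an obscure `AttributeError` later. An illustrative reproduction (hedged; the resource values are arbitrary):

```python
from toil.job import Job

class BrokenJob(Job):
    def __init__(self):
        # BUG: never calls super().__init__(), so _description is unset.
        self.payload = 42

root = Job(memory="100M", cores=1, disk="100M")
try:
    root.addChild(BrokenJob())
except ValueError as e:
    print(e)  # names the uninitialized type and the missing super().__init__()
```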
```diff
@@ -2552,8 +2706,8 @@ class Job:
         # filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
         userScript = self.getUserScript().globalize()
 
-        #
-        self._description.
+        # Connect the body of the job to the JobDescription
+        self._description.attach_body(fileStoreID, userScript)
 
     def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None):
         """
@@ -2682,38 +2836,33 @@ class Job:
 
     @classmethod
     def loadJob(
-        cls,
+        cls, job_store: "AbstractJobStore", job_description: JobDescription
     ) -> "Job":
         """
         Retrieves a :class:`toil.job.Job` instance from a JobStore
 
-        :param
-        :param
+        :param job_store: The job store.
+        :param job_description: the JobDescription of the job to retrieve.
         :returns: The job referenced by the JobDescription.
         """
-
-
-
-
-        if "_toil" != commandTokens[0]:
-            raise RuntimeError("An invalid command was passed into the job.")
-        userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
-        logger.debug('Loading user module %s.', userModule)
-        userModule = cls._loadUserModule(userModule)
-        pickleFile = commandTokens[1]
+
+        file_store_id, user_module_descriptor = job_description.get_body()
+        logger.debug('Loading user module %s.', user_module_descriptor)
+        user_module = cls._loadUserModule(user_module_descriptor)
 
         #Loads context manager using file stream
-        if
-
+        if file_store_id == "firstJob":
+            # This one is actually a shared file name and not a file ID.
+            manager = job_store.read_shared_file_stream(file_store_id)
         else:
-            manager =
+            manager = job_store.read_file_stream(file_store_id)
 
         #Open and unpickle
-        with manager as
+        with manager as file_handle:
 
-            job = cls._unpickle(
+            job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
             # Fill in the current description
-            job._description =
+            job._description = job_description
 
         # Set up the registry again, so children and follow-ons can be added on the worker
         job._registry = {job.jobStoreID: job}
```
```diff
@@ -2756,11 +2905,16 @@ class Job:
         """
         if stats is not None:
             startTime = time.time()
-            startClock = get_total_cpu_time()
+            startClock = ResourceMonitor.get_total_cpu_time()
         baseDir = os.getcwd()
 
         yield
 
+        if "download_only" in self._debug_flags:
+            # We should stop right away
+            logger.debug("Job did not stop itself after downloading files; stopping.")
+            raise DebugStoppingPointReached()
+
         # If the job is not a checkpoint job, add the promise files to delete
         # to the list of jobStoreFileIDs to delete
         # TODO: why is Promise holding a global list here???
@@ -2780,14 +2934,15 @@ class Job:
         os.chdir(baseDir)
         # Finish up the stats
         if stats is not None:
-            totalCpuTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
+            totalCpuTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
             stats.jobs.append(
                 Expando(
                     time=str(time.time() - startTime),
                     clock=str(totalCpuTime - startClock),
                     class_name=self._jobName(),
                     memory=str(totalMemoryUsage),
-                    requested_cores=str(self.cores)
+                    requested_cores=str(self.cores),
+                    disk=str(fileStore.get_disk_usage())
                 )
             )
```
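The module-level `get_total_cpu_time()` helpers have moved onto a `ResourceMonitor` class (see `toil/lib/resources.py` in the file list above), but the bookkeeping pattern around a job body is unchanged. A hedged sketch using the call shapes shown in this diff (`run_body` is a placeholder):

```python
import time
from toil.lib.resources import ResourceMonitor

def run_body():
    # Placeholder for the actual job body.
    sum(range(1_000_000))

start_wall = time.time()
start_cpu = ResourceMonitor.get_total_cpu_time()
run_body()
cpu, memory = ResourceMonitor.get_total_cpu_time_and_memory_usage()
print(f"wall={time.time() - start_wall:.2f}s cpu={cpu - start_cpu:.2f}s mem={memory}")
```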
```diff
@@ -2801,7 +2956,7 @@ class Job:
         """
         Run the job, and serialise the next jobs.
 
-        It marks the job as completed (by clearing its
+        It marks the job as completed (by clearing its body) and creates the
         successor relationships to new successors, but it doesn't actually
         commit those updates to the current job into the JobStore.
 
@@ -2836,9 +2991,9 @@ class Job:
         # Serialize the new Jobs defined by the run method to the jobStore
         self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)
 
-        # Clear out the
-        self.description.
-
+        # Clear out the body, because the job is done.
+        self.description.detach_body()
+
         # That and the new child/follow-on relationships will need to be
         # recorded later by an update() of the JobDescription.
 
```
```diff
@@ -2848,6 +3003,35 @@ class Job:
         """
         return self._description.displayName
 
+    def set_debug_flag(self, flag: str) -> None:
+        """
+        Enable the given debug option on the job.
+        """
+        self._debug_flags.add(flag)
+
+    def has_debug_flag(self, flag: str) -> bool:
+        """
+        Return true if the given debug flag is set.
+        """
+
+        return flag in self._debug_flags
+
+    def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str]]] = None) -> None:
+        """
+        Function that subclasses can call when they have downloaded their input files.
+
+        Will abort the job if the "download_only" debug flag is set.
+
+        Can be hinted a list of file path pairs outside and inside the job
+        container, in which case the container environment can be
+        reconstructed.
+        """
+
+        if self.has_debug_flag("download_only"):
+            # Stop the worker!
+            logger.info("Job has downloaded its files. Stopping.")
+            # Send off the path mapping for the debugging wrapper.
+            raise FilesDownloadedStoppingPointReached("Files downloaded", host_and_job_paths=host_and_job_paths)
 
 class JobException(Exception):
     """General job exception."""
```
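These hooks appear to underpin the expanded debugging support elsewhere in this release (note the large `toil/utils/toilDebugJob.py` change above): a debugger sets the `download_only` flag, the job calls `files_downloaded_hook()` once its inputs are staged, and the raised `FilesDownloadedStoppingPointReached` carries the host-to-container path pairs back out. A hedged sketch of the flow (`job` is assumed to be a loaded `toil.job.Job`):

```python
from toil.job import FilesDownloadedStoppingPointReached

# Assumes `job` is a loaded toil.job.Job about to run under a debugger.
job.set_debug_flag("download_only")

try:
    job.files_downloaded_hook(host_and_job_paths=[("/tmp/f1", "/data/f1")])
except FilesDownloadedStoppingPointReached as stop:
    # The exception carries the path mapping needed to reconstruct the
    # job's container environment.
    print(stop.host_and_job_paths)
```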