toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/deferred.py
CHANGED
|
@@ -178,7 +178,7 @@ class DeferredFunctionManager:
|
|
|
178
178
|
|
|
179
179
|
try:
|
|
180
180
|
def defer(deferredFunction):
|
|
181
|
-
# Just serialize
|
|
181
|
+
# Just serialize deferred functions one after the other.
|
|
182
182
|
# If serializing later ones fails, eariler ones will still be intact.
|
|
183
183
|
# We trust dill to protect sufficiently against partial reads later.
|
|
184
184
|
logger.debug("Deferring function %s" % repr(deferredFunction))
|
|
@@ -39,7 +39,7 @@ import dill
|
|
|
39
39
|
|
|
40
40
|
from toil.common import Toil, cacheDirName, getDirSizeRecursively
|
|
41
41
|
from toil.fileStores import FileID
|
|
42
|
-
from toil.job import Job, JobDescription
|
|
42
|
+
from toil.job import Job, JobDescription, DebugStoppingPointReached
|
|
43
43
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
44
44
|
from toil.lib.compatibility import deprecated
|
|
45
45
|
from toil.lib.conversions import bytes2human
|
|
@@ -113,9 +113,7 @@ class AbstractFileStore(ABC):
|
|
|
113
113
|
assert self.jobStore.config.workflowID is not None
|
|
114
114
|
self.workflow_dir: str = Toil.getLocalWorkflowDir(self.jobStore.config.workflowID, self.jobStore.config.workDir)
|
|
115
115
|
self.coordination_dir: str =Toil.get_local_workflow_coordination_dir(self.jobStore.config.workflowID, self.jobStore.config.workDir, self.jobStore.config.coordination_dir)
|
|
116
|
-
self.jobName: str = (
|
|
117
|
-
self.jobDesc.command.split()[1] if self.jobDesc.command else ""
|
|
118
|
-
)
|
|
116
|
+
self.jobName: str = str(self.jobDesc)
|
|
119
117
|
self.waitForPreviousCommit = waitForPreviousCommit
|
|
120
118
|
self.logging_messages: List[Dict[str, Union[int, str]]] = []
|
|
121
119
|
self.logging_user_streams: List[dict[str, str]] = []
|
|
@@ -191,17 +189,17 @@ class AbstractFileStore(ABC):
|
|
|
191
189
|
|
|
192
190
|
:param job: The job instance of the toil job to run.
|
|
193
191
|
"""
|
|
194
|
-
failed = True
|
|
195
192
|
job_requested_disk = job.disk
|
|
196
193
|
try:
|
|
197
194
|
yield
|
|
198
195
|
failed = False
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
196
|
+
except BaseException as e:
|
|
197
|
+
if isinstance(e, DebugStoppingPointReached):
|
|
198
|
+
self._dumpAccessLogs(job_type="Debugged", log_level=logging.INFO)
|
|
199
|
+
else:
|
|
203
200
|
self._dumpAccessLogs()
|
|
204
|
-
|
|
201
|
+
raise
|
|
202
|
+
finally:
|
|
205
203
|
# See how much disk space is used at the end of the job.
|
|
206
204
|
# Not a real peak disk usage, but close enough to be useful for warning the user.
|
|
207
205
|
self._job_disk_used = getDirSizeRecursively(self.localTempDir)
|
|
@@ -363,14 +361,16 @@ class AbstractFileStore(ABC):
|
|
|
363
361
|
|
|
364
362
|
yield wrappedStream, fileID
|
|
365
363
|
|
|
366
|
-
def _dumpAccessLogs(self) -> None:
|
|
364
|
+
def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WARNING) -> None:
|
|
367
365
|
"""
|
|
368
|
-
|
|
366
|
+
Log a report of the files accessed.
|
|
369
367
|
|
|
370
368
|
Includes the files that were accessed while the file store was open.
|
|
369
|
+
|
|
370
|
+
:param job_type: Adjective to describe the job in the report.
|
|
371
371
|
"""
|
|
372
372
|
if len(self._accessLog) > 0:
|
|
373
|
-
logger.
|
|
373
|
+
logger.log(log_level, '%s job accessed files:', job_type)
|
|
374
374
|
|
|
375
375
|
for item in self._accessLog:
|
|
376
376
|
# For each access record
|
|
@@ -379,14 +379,14 @@ class AbstractFileStore(ABC):
|
|
|
379
379
|
file_id, dest_path = item
|
|
380
380
|
if os.path.exists(dest_path):
|
|
381
381
|
if os.path.islink(dest_path):
|
|
382
|
-
logger.
|
|
382
|
+
logger.log(log_level, 'Symlinked file \'%s\' to path \'%s\'', file_id, dest_path)
|
|
383
383
|
else:
|
|
384
|
-
logger.
|
|
384
|
+
logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\'', file_id, dest_path)
|
|
385
385
|
else:
|
|
386
|
-
logger.
|
|
386
|
+
logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path)
|
|
387
387
|
else:
|
|
388
388
|
# Otherwise dump without the name
|
|
389
|
-
logger.
|
|
389
|
+
logger.log(log_level, 'Streamed file \'%s\'', *item)
|
|
390
390
|
|
|
391
391
|
def logAccess(
|
|
392
392
|
self, fileStoreID: Union[FileID, str], destination: Union[str, None] = None
|
|
@@ -1036,7 +1036,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1036
1036
|
# Create a working directory for the job
|
|
1037
1037
|
startingDir = os.getcwd()
|
|
1038
1038
|
# Move self.localTempDir from the worker directory set up in __init__ to a per-job directory.
|
|
1039
|
-
self.localTempDir = make_public_dir(
|
|
1039
|
+
self.localTempDir = make_public_dir(self.localTempDir, suggested_name="job")
|
|
1040
1040
|
# Check the status of all jobs on this node. If there are jobs that started and died before
|
|
1041
1041
|
# cleaning up their presence from the database, clean them up ourselves.
|
|
1042
1042
|
self._removeDeadJobs(self.coordination_dir, self.con)
|
|
@@ -1859,7 +1859,7 @@ class CachingFileStore(AbstractFileStore):
|
|
|
1859
1859
|
logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc)
|
|
1860
1860
|
# Make sure the deep copy isn't summoning ghosts of old job
|
|
1861
1861
|
# versions. It must be as new or newer at this point.
|
|
1862
|
-
self.jobDesc.
|
|
1862
|
+
self.jobDesc.assert_is_not_newer_than(state_to_commit)
|
|
1863
1863
|
|
|
1864
1864
|
# Bump the original's version since saving will do that too and we
|
|
1865
1865
|
# don't want duplicate versions.
|
|
@@ -102,7 +102,7 @@ class NonCachingFileStore(AbstractFileStore):
|
|
|
102
102
|
@contextmanager
|
|
103
103
|
def open(self, job: Job) -> Generator[None, None, None]:
|
|
104
104
|
startingDir = os.getcwd()
|
|
105
|
-
self.localTempDir: str = make_public_dir(
|
|
105
|
+
self.localTempDir: str = make_public_dir(self.localTempDir, suggested_name="job")
|
|
106
106
|
self._removeDeadJobs(self.coordination_dir)
|
|
107
107
|
self.jobStateFile = self._createJobStateFile()
|
|
108
108
|
self.check_for_state_corruption()
|
toil/job.py
CHANGED
|
@@ -34,6 +34,7 @@ from typing import (TYPE_CHECKING,
|
|
|
34
34
|
Iterator,
|
|
35
35
|
List,
|
|
36
36
|
Mapping,
|
|
37
|
+
NamedTuple,
|
|
37
38
|
Optional,
|
|
38
39
|
Sequence,
|
|
39
40
|
Set,
|
|
@@ -68,8 +69,7 @@ from toil.deferred import DeferredFunction
|
|
|
68
69
|
from toil.fileStores import FileID
|
|
69
70
|
from toil.lib.conversions import bytes2human, human2bytes
|
|
70
71
|
from toil.lib.expando import Expando
|
|
71
|
-
from toil.lib.resources import
|
|
72
|
-
get_total_cpu_time_and_memory_usage)
|
|
72
|
+
from toil.lib.resources import ResourceMonitor
|
|
73
73
|
from toil.resource import ModuleDescriptor
|
|
74
74
|
from toil.statsAndLogging import set_logging_from_options
|
|
75
75
|
|
|
@@ -122,6 +122,23 @@ class ConflictingPredecessorError(Exception):
|
|
|
122
122
|
f'The given job: "{predecessor.description}" is already a predecessor of job: "{successor.description}".'
|
|
123
123
|
)
|
|
124
124
|
|
|
125
|
+
class DebugStoppingPointReached(BaseException):
|
|
126
|
+
"""
|
|
127
|
+
Raised when a job reaches a point at which it has been instructed to stop for debugging.
|
|
128
|
+
"""
|
|
129
|
+
pass
|
|
130
|
+
|
|
131
|
+
class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
|
|
132
|
+
"""
|
|
133
|
+
Raised when a job stops because it was asked to download its files, and the files are downloaded.
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
def __init__(self, message, host_and_job_paths: Optional[List[Tuple[str, str]]] = None):
|
|
137
|
+
super().__init__(message)
|
|
138
|
+
|
|
139
|
+
# Save the host and user-code-visible paths of files, in case we're
|
|
140
|
+
# using a container and they are different.
|
|
141
|
+
self.host_and_job_paths = host_and_job_paths
|
|
125
142
|
|
|
126
143
|
class TemporaryID:
|
|
127
144
|
"""
|
|
@@ -227,7 +244,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
|
|
|
227
244
|
of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
|
|
228
245
|
is a brand.
|
|
229
246
|
|
|
230
|
-
:raises ValueError: if it gets
|
|
247
|
+
:raises ValueError: if it gets something it can't parse
|
|
231
248
|
:raises TypeError: if it gets something it can't parse because it's the wrong type.
|
|
232
249
|
"""
|
|
233
250
|
KINDS = {'gpu'}
|
|
@@ -711,13 +728,24 @@ class Requirer:
|
|
|
711
728
|
parts = ['no requirements']
|
|
712
729
|
return ', '.join(parts)
|
|
713
730
|
|
|
731
|
+
class JobBodyReference(NamedTuple):
|
|
732
|
+
"""
|
|
733
|
+
Reference from a job description to its body.
|
|
734
|
+
"""
|
|
735
|
+
file_store_id: str
|
|
736
|
+
"""File ID (or special shared file name for the root job) of the job's body."""
|
|
737
|
+
module_string: str
|
|
738
|
+
"""Stringified description of the module needed to load the body."""
|
|
739
|
+
|
|
714
740
|
class JobDescription(Requirer):
|
|
715
741
|
"""
|
|
716
742
|
Stores all the information that the Toil Leader ever needs to know about a Job.
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
743
|
+
|
|
744
|
+
This includes:
|
|
745
|
+
* Resource requirements.
|
|
746
|
+
* Which jobs are children or follow-ons or predecessors of this job.
|
|
747
|
+
* A reference to the Job object in the job store.
|
|
748
|
+
|
|
721
749
|
Can be obtained from an actual (i.e. executable) Job object, and can be
|
|
722
750
|
used to obtain the Job object from the JobStore.
|
|
723
751
|
|
|
@@ -732,8 +760,7 @@ class JobDescription(Requirer):
|
|
|
732
760
|
requirements: Mapping[str, Union[int, str, bool]],
|
|
733
761
|
jobName: str,
|
|
734
762
|
unitName: Optional[str] = "",
|
|
735
|
-
displayName: Optional[str] = "",
|
|
736
|
-
command: Optional[str] = None,
|
|
763
|
+
displayName: Optional[str] = "",
|
|
737
764
|
local: Optional[bool] = None
|
|
738
765
|
) -> None:
|
|
739
766
|
"""
|
|
@@ -780,14 +807,10 @@ class JobDescription(Requirer):
|
|
|
780
807
|
# ID of this job description in the JobStore.
|
|
781
808
|
self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
|
|
782
809
|
|
|
783
|
-
#
|
|
784
|
-
#
|
|
785
|
-
#
|
|
786
|
-
|
|
787
|
-
# Gets replaced with/rewritten into the real, executable command when
|
|
788
|
-
# the leader passes the description off to the batch system to be
|
|
789
|
-
# executed.
|
|
790
|
-
self.command: Optional[str] = command
|
|
810
|
+
# Information that encodes how to find the Job body data that this
|
|
811
|
+
# JobDescription describes, and the module(s) needed to unpickle it.
|
|
812
|
+
# None if no body needs to run.
|
|
813
|
+
self._body: Optional[JobBodyReference] = None
|
|
791
814
|
|
|
792
815
|
# Set scheduling properties that the leader read to think about scheduling.
|
|
793
816
|
|
|
@@ -882,7 +905,7 @@ class JobDescription(Requirer):
|
|
|
882
905
|
|
|
883
906
|
For each job, produces a named tuple with its various names and its
|
|
884
907
|
original job store ID. The jobs in the chain are in execution order.
|
|
885
|
-
|
|
908
|
+
|
|
886
909
|
If the job hasn't run yet or it didn't chain, produces a one-item list.
|
|
887
910
|
"""
|
|
888
911
|
if len(self._merged_job_names) == 0:
|
|
@@ -955,7 +978,47 @@ class JobDescription(Requirer):
|
|
|
955
978
|
"""
|
|
956
979
|
return list(self.serviceTree.keys())
|
|
957
980
|
|
|
958
|
-
def
|
|
981
|
+
def has_body(self) -> bool:
|
|
982
|
+
"""
|
|
983
|
+
Returns True if we have a job body associated, and False otherwise.
|
|
984
|
+
"""
|
|
985
|
+
return self._body is not None
|
|
986
|
+
|
|
987
|
+
def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
|
|
988
|
+
"""
|
|
989
|
+
Attach a job body to this JobDescription.
|
|
990
|
+
|
|
991
|
+
Takes the file store ID that the body is stored at, and the required
|
|
992
|
+
user script module.
|
|
993
|
+
|
|
994
|
+
The file store ID can also be "firstJob" for the root job, stored as a
|
|
995
|
+
shared file instead.
|
|
996
|
+
"""
|
|
997
|
+
|
|
998
|
+
self._body = JobBodyReference(file_store_id, user_script.toCommand())
|
|
999
|
+
|
|
1000
|
+
def detach_body(self) -> None:
|
|
1001
|
+
"""
|
|
1002
|
+
Drop the body reference from a JobDescription.
|
|
1003
|
+
"""
|
|
1004
|
+
self._body = None
|
|
1005
|
+
|
|
1006
|
+
def get_body(self) -> Tuple[str, ModuleDescriptor]:
|
|
1007
|
+
"""
|
|
1008
|
+
Get the information needed to load the job body.
|
|
1009
|
+
|
|
1010
|
+
:returns: a file store ID (or magic shared file name "firstJob") and a
|
|
1011
|
+
user script module.
|
|
1012
|
+
|
|
1013
|
+
Fails if no body is attached; check has_body() first.
|
|
1014
|
+
"""
|
|
1015
|
+
|
|
1016
|
+
if not self.has_body():
|
|
1017
|
+
raise RuntimeError(f"Cannot load the body of a job {self} without one")
|
|
1018
|
+
|
|
1019
|
+
return self._body.file_store_id, ModuleDescriptor.fromCommand(self._body.module_string)
|
|
1020
|
+
|
|
1021
|
+
def nextSuccessors(self) -> Optional[Set[str]]:
|
|
959
1022
|
"""
|
|
960
1023
|
Return the collection of job IDs for the successors of this job that are ready to run.
|
|
961
1024
|
|
|
@@ -966,7 +1029,7 @@ class JobDescription(Requirer):
|
|
|
966
1029
|
empty collection if there are more phases but they can't be entered yet
|
|
967
1030
|
(e.g. because we are waiting for the job itself to run).
|
|
968
1031
|
"""
|
|
969
|
-
if self.
|
|
1032
|
+
if self.has_body():
|
|
970
1033
|
# We ourselves need to run. So there's not nothing to do
|
|
971
1034
|
# but no successors are ready.
|
|
972
1035
|
return set()
|
|
@@ -1038,7 +1101,7 @@ class JobDescription(Requirer):
|
|
|
1038
1101
|
:returns: True if the job appears to be done, and all related child,
|
|
1039
1102
|
follow-on, and service jobs appear to be finished and removed.
|
|
1040
1103
|
"""
|
|
1041
|
-
return self.
|
|
1104
|
+
return not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
|
|
1042
1105
|
|
|
1043
1106
|
def replace(self, other: "JobDescription") -> None:
|
|
1044
1107
|
"""
|
|
@@ -1067,7 +1130,7 @@ class JobDescription(Requirer):
|
|
|
1067
1130
|
# When deleting, we need to delete the files for our old ID, and also
|
|
1068
1131
|
# anything that needed to be deleted for the job we are replacing. And
|
|
1069
1132
|
# we need to keep track of all the names of jobs involved for logging.
|
|
1070
|
-
|
|
1133
|
+
|
|
1071
1134
|
# We need first the job we are merging into if nothing has merged into
|
|
1072
1135
|
# it yet, then anything that already merged into it (including it),
|
|
1073
1136
|
# then us if nothing has yet merged into us, then anything that merged
|
|
@@ -1080,7 +1143,7 @@ class JobDescription(Requirer):
|
|
|
1080
1143
|
_merged_job_names.append(self.get_names())
|
|
1081
1144
|
_merged_job_names += self._merged_job_names
|
|
1082
1145
|
self._merged_job_names = _merged_job_names
|
|
1083
|
-
|
|
1146
|
+
|
|
1084
1147
|
# Now steal its ID.
|
|
1085
1148
|
self.jobStoreID = other.jobStoreID
|
|
1086
1149
|
|
|
@@ -1092,13 +1155,46 @@ class JobDescription(Requirer):
|
|
|
1092
1155
|
self._job_version = other._job_version
|
|
1093
1156
|
self._job_version_writer = os.getpid()
|
|
1094
1157
|
|
|
1095
|
-
def
|
|
1158
|
+
def assert_is_not_newer_than(self, other: "JobDescription") -> None:
|
|
1096
1159
|
"""
|
|
1097
|
-
Make sure a prospective new version of the JobDescription
|
|
1160
|
+
Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
|
|
1098
1161
|
"""
|
|
1099
1162
|
if other._job_version < self._job_version:
|
|
1100
1163
|
raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}")
|
|
1101
1164
|
|
|
1165
|
+
def is_updated_by(self, other: "JobDescription") -> bool:
|
|
1166
|
+
"""
|
|
1167
|
+
Return True if the passed JobDescription is a distinct, newer version of this one.
|
|
1168
|
+
"""
|
|
1169
|
+
|
|
1170
|
+
if self.jobStoreID != other.jobStoreID:
|
|
1171
|
+
# Not the same job
|
|
1172
|
+
logger.warning(
|
|
1173
|
+
"Found ID %s in job %s from PID %s but expected ID %s to "
|
|
1174
|
+
"update job %s from PID %s",
|
|
1175
|
+
other.jobStoreID,
|
|
1176
|
+
other,
|
|
1177
|
+
other._job_version_writer,
|
|
1178
|
+
self.jobStoreID,
|
|
1179
|
+
self,
|
|
1180
|
+
self._job_version_writer
|
|
1181
|
+
)
|
|
1182
|
+
return False
|
|
1183
|
+
|
|
1184
|
+
if self._job_version >= other._job_version:
|
|
1185
|
+
# Version isn't strictly newer
|
|
1186
|
+
logger.debug(
|
|
1187
|
+
"Expected newer version in job %s from PID %s but it is no "
|
|
1188
|
+
"newer than job %s from PID %s",
|
|
1189
|
+
other,
|
|
1190
|
+
other._job_version_writer,
|
|
1191
|
+
self,
|
|
1192
|
+
self._job_version_writer
|
|
1193
|
+
)
|
|
1194
|
+
return False
|
|
1195
|
+
|
|
1196
|
+
return True
|
|
1197
|
+
|
|
1102
1198
|
def addChild(self, childID: str) -> None:
|
|
1103
1199
|
"""Make the job with the given ID a child of the described job."""
|
|
1104
1200
|
self.childIDs.add(childID)
|
|
@@ -1345,12 +1441,29 @@ class CheckpointJobDescription(JobDescription):
|
|
|
1345
1441
|
|
|
1346
1442
|
# Set checkpoint-specific properties
|
|
1347
1443
|
|
|
1348
|
-
# None, or a copy of the original
|
|
1349
|
-
self.checkpoint = None
|
|
1444
|
+
# None, or a copy of the original self._body used to reestablish the job after failure.
|
|
1445
|
+
self.checkpoint: Optional[JobBodyReference] = None
|
|
1350
1446
|
|
|
1351
1447
|
# Files that can not be deleted until the job and its successors have completed
|
|
1352
1448
|
self.checkpointFilesToDelete = []
|
|
1353
1449
|
|
|
1450
|
+
def set_checkpoint(self) -> str:
|
|
1451
|
+
"""
|
|
1452
|
+
Save a body checkpoint into self.checkpoint
|
|
1453
|
+
"""
|
|
1454
|
+
|
|
1455
|
+
if not self.has_body():
|
|
1456
|
+
raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
|
|
1457
|
+
self.checkpoint = self._body
|
|
1458
|
+
|
|
1459
|
+
def restore_checkpoint(self) -> None:
|
|
1460
|
+
"""
|
|
1461
|
+
Restore the body checkpoint from self.checkpoint
|
|
1462
|
+
"""
|
|
1463
|
+
if self.checkpoint is None:
|
|
1464
|
+
raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
|
|
1465
|
+
self._body = self.checkpoint
|
|
1466
|
+
|
|
1354
1467
|
def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]:
|
|
1355
1468
|
"""
|
|
1356
1469
|
Restart a checkpoint after the total failure of jobs in its subtree.
|
|
@@ -1365,13 +1478,13 @@ class CheckpointJobDescription(JobDescription):
|
|
|
1365
1478
|
raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.")
|
|
1366
1479
|
successorsDeleted = []
|
|
1367
1480
|
all_successors = list(self.allSuccessors())
|
|
1368
|
-
if len(all_successors) > 0 or self.serviceTree or self.
|
|
1369
|
-
if self.
|
|
1370
|
-
if self.
|
|
1371
|
-
raise RuntimeError("The
|
|
1372
|
-
logger.debug("Checkpoint job already has
|
|
1481
|
+
if len(all_successors) > 0 or self.serviceTree or self.has_body():
|
|
1482
|
+
if self.has_body():
|
|
1483
|
+
if self._body != self.checkpoint:
|
|
1484
|
+
raise RuntimeError("The stored body reference and checkpoint are not the same.")
|
|
1485
|
+
logger.debug("Checkpoint job already has body set to run")
|
|
1373
1486
|
else:
|
|
1374
|
-
self.
|
|
1487
|
+
self.restore_checkpoint()
|
|
1375
1488
|
|
|
1376
1489
|
jobStore.update_job(self) # Update immediately to ensure that checkpoint
|
|
1377
1490
|
# is made before deleting any remaining successors
|
|
@@ -1516,6 +1629,9 @@ class Job:
|
|
|
1516
1629
|
self._defer = None
|
|
1517
1630
|
self._tempDir = None
|
|
1518
1631
|
|
|
1632
|
+
# Holds flags set by set_debug_flag()
|
|
1633
|
+
self._debug_flags: Set[str] = set()
|
|
1634
|
+
|
|
1519
1635
|
def __str__(self):
|
|
1520
1636
|
"""
|
|
1521
1637
|
Produce a useful logging string to identify this Job and distinguish it
|
|
@@ -1526,6 +1642,19 @@ class Job:
|
|
|
1526
1642
|
else:
|
|
1527
1643
|
return 'Job(' + str(self.description) + ')'
|
|
1528
1644
|
|
|
1645
|
+
def check_initialized(self) -> None:
|
|
1646
|
+
"""
|
|
1647
|
+
Ensure that Job.__init__() has been called by any subclass __init__().
|
|
1648
|
+
|
|
1649
|
+
This uses the fact that the self._description instance variable should always
|
|
1650
|
+
be set after __init__().
|
|
1651
|
+
|
|
1652
|
+
If __init__() has not been called, raise an error.
|
|
1653
|
+
"""
|
|
1654
|
+
if not hasattr(self, "_description"):
|
|
1655
|
+
raise ValueError(f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
|
|
1656
|
+
f"have been called.")
|
|
1657
|
+
|
|
1529
1658
|
@property
|
|
1530
1659
|
def jobStoreID(self) -> Union[str, TemporaryID]:
|
|
1531
1660
|
"""Get the ID of this Job."""
|
|
@@ -1656,6 +1785,11 @@ class Job:
|
|
|
1656
1785
|
"""
|
|
1657
1786
|
if not isinstance(childJob, Job):
|
|
1658
1787
|
raise RuntimeError("The type of the child job is not a job.")
|
|
1788
|
+
|
|
1789
|
+
# Check that both jobs have been initialized
|
|
1790
|
+
self.check_initialized()
|
|
1791
|
+
childJob.check_initialized()
|
|
1792
|
+
|
|
1659
1793
|
# Join the job graphs
|
|
1660
1794
|
self._jobGraphsJoined(childJob)
|
|
1661
1795
|
# Remember the child relationship
|
|
@@ -1683,6 +1817,11 @@ class Job:
|
|
|
1683
1817
|
"""
|
|
1684
1818
|
if not isinstance(followOnJob, Job):
|
|
1685
1819
|
raise RuntimeError("The type of the follow-on job is not a job.")
|
|
1820
|
+
|
|
1821
|
+
# Check that both jobs have been initialized
|
|
1822
|
+
self.check_initialized()
|
|
1823
|
+
followOnJob.check_initialized()
|
|
1824
|
+
|
|
1686
1825
|
# Join the job graphs
|
|
1687
1826
|
self._jobGraphsJoined(followOnJob)
|
|
1688
1827
|
# Remember the follow-on relationship
|
|
@@ -2567,8 +2706,8 @@ class Job:
|
|
|
2567
2706
|
# filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
|
|
2568
2707
|
userScript = self.getUserScript().globalize()
|
|
2569
2708
|
|
|
2570
|
-
#
|
|
2571
|
-
self._description.
|
|
2709
|
+
# Connect the body of the job to the JobDescription
|
|
2710
|
+
self._description.attach_body(fileStoreID, userScript)
|
|
2572
2711
|
|
|
2573
2712
|
def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None):
|
|
2574
2713
|
"""
|
|
@@ -2697,38 +2836,33 @@ class Job:
|
|
|
2697
2836
|
|
|
2698
2837
|
@classmethod
|
|
2699
2838
|
def loadJob(
|
|
2700
|
-
cls,
|
|
2839
|
+
cls, job_store: "AbstractJobStore", job_description: JobDescription
|
|
2701
2840
|
) -> "Job":
|
|
2702
2841
|
"""
|
|
2703
2842
|
Retrieves a :class:`toil.job.Job` instance from a JobStore
|
|
2704
2843
|
|
|
2705
|
-
:param
|
|
2706
|
-
:param
|
|
2844
|
+
:param job_store: The job store.
|
|
2845
|
+
:param job_description: the JobDescription of the job to retrieve.
|
|
2707
2846
|
:returns: The job referenced by the JobDescription.
|
|
2708
2847
|
"""
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
|
|
2712
|
-
|
|
2713
|
-
if "_toil" != commandTokens[0]:
|
|
2714
|
-
raise RuntimeError("An invalid command was passed into the job.")
|
|
2715
|
-
userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
|
|
2716
|
-
logger.debug('Loading user module %s.', userModule)
|
|
2717
|
-
userModule = cls._loadUserModule(userModule)
|
|
2718
|
-
pickleFile = commandTokens[1]
|
|
2848
|
+
|
|
2849
|
+
file_store_id, user_module_descriptor = job_description.get_body()
|
|
2850
|
+
logger.debug('Loading user module %s.', user_module_descriptor)
|
|
2851
|
+
user_module = cls._loadUserModule(user_module_descriptor)
|
|
2719
2852
|
|
|
2720
2853
|
#Loads context manager using file stream
|
|
2721
|
-
if
|
|
2722
|
-
|
|
2854
|
+
if file_store_id == "firstJob":
|
|
2855
|
+
# This one is actually a shared file name and not a file ID.
|
|
2856
|
+
manager = job_store.read_shared_file_stream(file_store_id)
|
|
2723
2857
|
else:
|
|
2724
|
-
manager =
|
|
2858
|
+
manager = job_store.read_file_stream(file_store_id)
|
|
2725
2859
|
|
|
2726
2860
|
#Open and unpickle
|
|
2727
|
-
with manager as
|
|
2861
|
+
with manager as file_handle:
|
|
2728
2862
|
|
|
2729
|
-
job = cls._unpickle(
|
|
2863
|
+
job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
|
|
2730
2864
|
# Fill in the current description
|
|
2731
|
-
job._description =
|
|
2865
|
+
job._description = job_description
|
|
2732
2866
|
|
|
2733
2867
|
# Set up the registry again, so children and follow-ons can be added on the worker
|
|
2734
2868
|
job._registry = {job.jobStoreID: job}
|
|
@@ -2771,11 +2905,16 @@ class Job:
|
|
|
2771
2905
|
"""
|
|
2772
2906
|
if stats is not None:
|
|
2773
2907
|
startTime = time.time()
|
|
2774
|
-
startClock = get_total_cpu_time()
|
|
2908
|
+
startClock = ResourceMonitor.get_total_cpu_time()
|
|
2775
2909
|
baseDir = os.getcwd()
|
|
2776
2910
|
|
|
2777
2911
|
yield
|
|
2778
2912
|
|
|
2913
|
+
if "download_only" in self._debug_flags:
|
|
2914
|
+
# We should stop right away
|
|
2915
|
+
logger.debug("Job did not stop itself after downloading files; stopping.")
|
|
2916
|
+
raise DebugStoppingPointReached()
|
|
2917
|
+
|
|
2779
2918
|
# If the job is not a checkpoint job, add the promise files to delete
|
|
2780
2919
|
# to the list of jobStoreFileIDs to delete
|
|
2781
2920
|
# TODO: why is Promise holding a global list here???
|
|
@@ -2795,7 +2934,7 @@ class Job:
|
|
|
2795
2934
|
os.chdir(baseDir)
|
|
2796
2935
|
# Finish up the stats
|
|
2797
2936
|
if stats is not None:
|
|
2798
|
-
totalCpuTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
|
|
2937
|
+
totalCpuTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
|
|
2799
2938
|
stats.jobs.append(
|
|
2800
2939
|
Expando(
|
|
2801
2940
|
time=str(time.time() - startTime),
|
|
@@ -2817,7 +2956,7 @@ class Job:
|
|
|
2817
2956
|
"""
|
|
2818
2957
|
Run the job, and serialise the next jobs.
|
|
2819
2958
|
|
|
2820
|
-
It marks the job as completed (by clearing its
|
|
2959
|
+
It marks the job as completed (by clearing its body) and creates the
|
|
2821
2960
|
successor relationships to new successors, but it doesn't actually
|
|
2822
2961
|
commit those updates to the current job into the JobStore.
|
|
2823
2962
|
|
|
@@ -2852,9 +2991,9 @@ class Job:
|
|
|
2852
2991
|
# Serialize the new Jobs defined by the run method to the jobStore
|
|
2853
2992
|
self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)
|
|
2854
2993
|
|
|
2855
|
-
# Clear out the
|
|
2856
|
-
self.description.
|
|
2857
|
-
|
|
2994
|
+
# Clear out the body, because the job is done.
|
|
2995
|
+
self.description.detach_body()
|
|
2996
|
+
|
|
2858
2997
|
# That and the new child/follow-on relationships will need to be
|
|
2859
2998
|
# recorded later by an update() of the JobDescription.
|
|
2860
2999
|
|
|
@@ -2864,6 +3003,35 @@ class Job:
|
|
|
2864
3003
|
"""
|
|
2865
3004
|
return self._description.displayName
|
|
2866
3005
|
|
|
3006
|
+
def set_debug_flag(self, flag: str) -> None:
|
|
3007
|
+
"""
|
|
3008
|
+
Enable the given debug option on the job.
|
|
3009
|
+
"""
|
|
3010
|
+
self._debug_flags.add(flag)
|
|
3011
|
+
|
|
3012
|
+
def has_debug_flag(self, flag: str) -> bool:
|
|
3013
|
+
"""
|
|
3014
|
+
Return true if the given debug flag is set.
|
|
3015
|
+
"""
|
|
3016
|
+
|
|
3017
|
+
return flag in self._debug_flags
|
|
3018
|
+
|
|
3019
|
+
def files_downloaded_hook(self, host_and_job_paths: Optional[List[Tuple[str, str]]] = None) -> None:
|
|
3020
|
+
"""
|
|
3021
|
+
Function that subclasses can call when they have downloaded their input files.
|
|
3022
|
+
|
|
3023
|
+
Will abort the job if the "download_only" debug flag is set.
|
|
3024
|
+
|
|
3025
|
+
Can be hinted a list of file path pairs outside and inside the job
|
|
3026
|
+
container, in which case the container environment can be
|
|
3027
|
+
reconstructed.
|
|
3028
|
+
"""
|
|
3029
|
+
|
|
3030
|
+
if self.has_debug_flag("download_only"):
|
|
3031
|
+
# Stop the worker!
|
|
3032
|
+
logger.info("Job has downloaded its files. Stopping.")
|
|
3033
|
+
# Send off the path mapping for the debugging wrapper.
|
|
3034
|
+
raise FilesDownloadedStoppingPointReached("Files downloaded", host_and_job_paths=host_and_job_paths)
|
|
2867
3035
|
|
|
2868
3036
|
class JobException(Exception):
|
|
2869
3037
|
"""General job exception."""
|