toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/leader.py
CHANGED
|
@@ -21,30 +21,36 @@ import os
|
|
|
21
21
|
import pickle
|
|
22
22
|
import sys
|
|
23
23
|
import time
|
|
24
|
-
from typing import Any,
|
|
24
|
+
from typing import Any, Optional, Union
|
|
25
25
|
|
|
26
26
|
import enlighten
|
|
27
27
|
|
|
28
28
|
from toil import resolveEntryPoint
|
|
29
29
|
from toil.batchSystems import DeadlockException
|
|
30
|
-
from toil.batchSystems.abstractBatchSystem import (
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
30
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
31
|
+
EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
32
|
+
AbstractBatchSystem,
|
|
33
|
+
BatchJobExitReason,
|
|
34
|
+
)
|
|
35
|
+
from toil.bus import (
|
|
36
|
+
JobCompletedMessage,
|
|
37
|
+
JobFailedMessage,
|
|
38
|
+
JobIssuedMessage,
|
|
39
|
+
JobMissingMessage,
|
|
40
|
+
JobUpdatedMessage,
|
|
41
|
+
QueueSizeMessage,
|
|
42
|
+
get_job_kind,
|
|
43
|
+
)
|
|
39
44
|
from toil.common import Config, ToilMetrics
|
|
40
45
|
from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
41
46
|
from toil.exceptions import FailedJobsException
|
|
42
|
-
from toil.job import (
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
47
|
+
from toil.job import (
|
|
48
|
+
CheckpointJobDescription,
|
|
49
|
+
JobDescription,
|
|
50
|
+
ServiceJobDescription,
|
|
51
|
+
TemporaryID,
|
|
52
|
+
)
|
|
53
|
+
from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException
|
|
48
54
|
from toil.lib.throttle import LocalThrottle
|
|
49
55
|
from toil.provisioners.abstractProvisioner import AbstractProvisioner
|
|
50
56
|
from toil.provisioners.clusterScaler import ScalerThread
|
|
@@ -78,13 +84,15 @@ class Leader:
|
|
|
78
84
|
consulting the job store, and issuing them in the batch system.
|
|
79
85
|
"""
|
|
80
86
|
|
|
81
|
-
def __init__(
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
def __init__(
|
|
88
|
+
self,
|
|
89
|
+
config: Config,
|
|
90
|
+
batchSystem: AbstractBatchSystem,
|
|
91
|
+
provisioner: Optional[AbstractProvisioner],
|
|
92
|
+
jobStore: AbstractJobStore,
|
|
93
|
+
rootJob: JobDescription,
|
|
94
|
+
jobCache: Optional[dict[Union[str, TemporaryID], JobDescription]] = None,
|
|
95
|
+
) -> None:
|
|
88
96
|
"""
|
|
89
97
|
Create a Toil Leader object.
|
|
90
98
|
|
|
@@ -114,14 +122,11 @@ class Leader:
|
|
|
114
122
|
# state change information about jobs.
|
|
115
123
|
self.toilState = ToilState(self.jobStore)
|
|
116
124
|
|
|
117
|
-
if self.config.write_messages is None:
|
|
118
|
-
# The user hasn't specified a place for the message bus so we
|
|
119
|
-
# should make one.
|
|
120
|
-
self.config.write_messages = gen_message_bus_path()
|
|
121
|
-
|
|
122
125
|
# Message bus messages need to go to the given file.
|
|
123
126
|
# Keep a reference to the return value so the listener stays alive.
|
|
124
|
-
self._message_subscription = self.toilState.bus.connect_output_file(
|
|
127
|
+
self._message_subscription = self.toilState.bus.connect_output_file(
|
|
128
|
+
self.config.write_messages
|
|
129
|
+
)
|
|
125
130
|
|
|
126
131
|
# Connect to the message bus, so we will get all the messages of these
|
|
127
132
|
# types in an inbox.
|
|
@@ -136,17 +141,22 @@ class Leader:
|
|
|
136
141
|
# this, somehow, so they can also see messages from this?
|
|
137
142
|
self.toilState.load_workflow(rootJob, jobCache=jobCache)
|
|
138
143
|
|
|
139
|
-
logger.debug(
|
|
140
|
-
|
|
144
|
+
logger.debug(
|
|
145
|
+
"Found %s jobs to start and %i jobs with successors to run",
|
|
146
|
+
self._messages.count(JobUpdatedMessage),
|
|
147
|
+
len(self.toilState.successorCounts),
|
|
148
|
+
)
|
|
141
149
|
|
|
142
150
|
# Batch system
|
|
143
151
|
self.batchSystem = batchSystem
|
|
144
152
|
if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
|
|
145
|
-
raise RuntimeError(
|
|
153
|
+
raise RuntimeError(
|
|
154
|
+
"The initialized batchsystem did not start with 0 active jobs."
|
|
155
|
+
)
|
|
146
156
|
logger.debug("Checked batch system has no running jobs and no updated jobs")
|
|
147
157
|
|
|
148
158
|
# Map of batch system IDs to job store IDs
|
|
149
|
-
self.issued_jobs_by_batch_system_id:
|
|
159
|
+
self.issued_jobs_by_batch_system_id: dict[int, str] = {}
|
|
150
160
|
|
|
151
161
|
# Number of preemptible jobs currently being run by batch system
|
|
152
162
|
self.preemptibleJobsIssued = 0
|
|
@@ -154,10 +164,12 @@ class Leader:
|
|
|
154
164
|
# Tracking the number service jobs issued,
|
|
155
165
|
# this is used limit the number of services issued to the batch system
|
|
156
166
|
self.serviceJobsIssued = 0
|
|
157
|
-
self.serviceJobsToBeIssued:
|
|
167
|
+
self.serviceJobsToBeIssued: list[str] = (
|
|
168
|
+
[]
|
|
169
|
+
) # A queue of IDs of service jobs that await scheduling
|
|
158
170
|
# Equivalents for service jobs to be run on preemptible nodes
|
|
159
171
|
self.preemptibleServiceJobsIssued = 0
|
|
160
|
-
self.preemptibleServiceJobsToBeIssued:
|
|
172
|
+
self.preemptibleServiceJobsToBeIssued: list[str] = []
|
|
161
173
|
|
|
162
174
|
# Timing of the rescuing method
|
|
163
175
|
self.timeSinceJobsLastRescued = None
|
|
@@ -165,7 +177,7 @@ class Leader:
|
|
|
165
177
|
# For each issued job's batch system ID, how many times did we not see
|
|
166
178
|
# it when we should have? If this hits a threshold, the job is declared
|
|
167
179
|
# missing and killed and possibly retried.
|
|
168
|
-
self.reissueMissingJobs_missingHash:
|
|
180
|
+
self.reissueMissingJobs_missingHash: dict[int, int] = {}
|
|
169
181
|
|
|
170
182
|
# Class used to create/destroy nodes in the cluster, may be None if
|
|
171
183
|
# using a statically defined cluster
|
|
@@ -183,7 +195,7 @@ class Leader:
|
|
|
183
195
|
self.statsAndLogging = StatsAndLogging(self.jobStore, self.config)
|
|
184
196
|
|
|
185
197
|
# Set used to monitor deadlocked jobs
|
|
186
|
-
self.potentialDeadlockedJobs:
|
|
198
|
+
self.potentialDeadlockedJobs: set[str] = set()
|
|
187
199
|
self.potentialDeadlockTime = 0
|
|
188
200
|
|
|
189
201
|
# A dashboard that runs on the leader node in AWS clusters to track the state
|
|
@@ -191,8 +203,13 @@ class Leader:
|
|
|
191
203
|
self.toilMetrics: Optional[ToilMetrics] = None
|
|
192
204
|
|
|
193
205
|
# internal jobs we should not expose at top level debugging
|
|
194
|
-
self.debugJobNames = (
|
|
195
|
-
|
|
206
|
+
self.debugJobNames = (
|
|
207
|
+
"CWLJob",
|
|
208
|
+
"CWLWorkflow",
|
|
209
|
+
"CWLScatter",
|
|
210
|
+
"CWLGather",
|
|
211
|
+
"ResolveIndirect",
|
|
212
|
+
)
|
|
196
213
|
|
|
197
214
|
self.deadlockThrottler = LocalThrottle(self.config.deadlockCheckInterval)
|
|
198
215
|
|
|
@@ -210,8 +227,10 @@ class Leader:
|
|
|
210
227
|
self.GOOD_COLOR = (0, 60, 108)
|
|
211
228
|
self.BAD_COLOR = (253, 199, 0)
|
|
212
229
|
# And set a format that shows failures
|
|
213
|
-
self.PROGRESS_BAR_FORMAT = (
|
|
214
|
-
|
|
230
|
+
self.PROGRESS_BAR_FORMAT = (
|
|
231
|
+
"{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} "
|
|
232
|
+
"({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]"
|
|
233
|
+
)
|
|
215
234
|
# TODO: No way to set background color on the terminal for the bar.
|
|
216
235
|
|
|
217
236
|
# What exit code should the process use if the workflow failed?
|
|
@@ -229,16 +248,25 @@ class Leader:
|
|
|
229
248
|
"""
|
|
230
249
|
self.jobStore.write_kill_flag(kill=False)
|
|
231
250
|
|
|
232
|
-
with enlighten.get_manager(
|
|
251
|
+
with enlighten.get_manager(
|
|
252
|
+
stream=sys.stderr, enabled=not self.config.disableProgress
|
|
253
|
+
) as manager:
|
|
233
254
|
# Set up the fancy console UI if desirable
|
|
234
|
-
self.progress_overall = manager.counter(
|
|
235
|
-
|
|
255
|
+
self.progress_overall = manager.counter(
|
|
256
|
+
total=0,
|
|
257
|
+
desc="Workflow Progress",
|
|
258
|
+
unit="jobs",
|
|
259
|
+
color=self.GOOD_COLOR,
|
|
260
|
+
bar_format=self.PROGRESS_BAR_FORMAT,
|
|
261
|
+
)
|
|
236
262
|
self.progress_failed = self.progress_overall.add_subcounter(self.BAD_COLOR)
|
|
237
263
|
|
|
238
264
|
# Start the stats/logging aggregation thread
|
|
239
265
|
self.statsAndLogging.start()
|
|
240
266
|
if self.config.metrics:
|
|
241
|
-
self.toilMetrics = ToilMetrics(
|
|
267
|
+
self.toilMetrics = ToilMetrics(
|
|
268
|
+
self.toilState.bus, provisioner=self.provisioner
|
|
269
|
+
)
|
|
242
270
|
|
|
243
271
|
try:
|
|
244
272
|
|
|
@@ -255,10 +283,13 @@ class Leader:
|
|
|
255
283
|
self.innerLoop()
|
|
256
284
|
finally:
|
|
257
285
|
if self.clusterScaler is not None:
|
|
258
|
-
logger.debug(
|
|
286
|
+
logger.debug("Waiting for workers to shutdown.")
|
|
259
287
|
startTime = time.time()
|
|
260
288
|
self.clusterScaler.shutdown()
|
|
261
|
-
logger.debug(
|
|
289
|
+
logger.debug(
|
|
290
|
+
"Worker shutdown complete in %s seconds.",
|
|
291
|
+
time.time() - startTime,
|
|
292
|
+
)
|
|
262
293
|
|
|
263
294
|
finally:
|
|
264
295
|
# Ensure service manager thread is properly shutdown
|
|
@@ -271,37 +302,59 @@ class Leader:
|
|
|
271
302
|
self.toilMetrics.shutdown()
|
|
272
303
|
|
|
273
304
|
# Filter the failed jobs
|
|
274
|
-
self.toilState.totalFailedJobs = [
|
|
305
|
+
self.toilState.totalFailedJobs = [
|
|
306
|
+
j
|
|
307
|
+
for j in self.toilState.totalFailedJobs
|
|
308
|
+
if self.toilState.job_exists(j)
|
|
309
|
+
]
|
|
275
310
|
|
|
276
311
|
try:
|
|
277
312
|
self.create_status_sentinel_file(self.toilState.totalFailedJobs)
|
|
278
313
|
except OSError as e:
|
|
279
|
-
logger.debug(f
|
|
314
|
+
logger.debug(f"Error from importFile with hardlink=True: {e}")
|
|
280
315
|
|
|
281
|
-
logger.info(
|
|
282
|
-
|
|
283
|
-
|
|
316
|
+
logger.info(
|
|
317
|
+
"Finished toil run %s"
|
|
318
|
+
% (
|
|
319
|
+
"successfully."
|
|
320
|
+
if not self.toilState.totalFailedJobs
|
|
321
|
+
else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))
|
|
322
|
+
)
|
|
323
|
+
)
|
|
284
324
|
|
|
285
325
|
if len(self.toilState.totalFailedJobs):
|
|
286
326
|
failed_jobs = []
|
|
287
327
|
for job_id in self.toilState.totalFailedJobs:
|
|
288
328
|
# Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
|
|
289
329
|
self.toilState.reset_job(job_id)
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
330
|
+
try:
|
|
331
|
+
failed_jobs.append(self.toilState.get_job(job_id))
|
|
332
|
+
except NoSuchJobException:
|
|
333
|
+
# Job actually finished and was removed
|
|
334
|
+
pass
|
|
335
|
+
|
|
336
|
+
logger.info(
|
|
337
|
+
"Failed jobs at end of the run: %s",
|
|
338
|
+
" ".join(str(j) for j in failed_jobs),
|
|
339
|
+
)
|
|
340
|
+
raise FailedJobsException(
|
|
341
|
+
self.jobStore,
|
|
342
|
+
failed_jobs,
|
|
343
|
+
exit_code=self.recommended_fail_exit_code,
|
|
344
|
+
)
|
|
294
345
|
|
|
295
346
|
return self.jobStore.get_root_job_return_value()
|
|
296
347
|
|
|
297
348
|
def create_status_sentinel_file(self, fail: bool) -> None:
|
|
298
349
|
"""Create a file in the jobstore indicating failure or success."""
|
|
299
|
-
logName =
|
|
350
|
+
logName = "failed.log" if fail else "succeeded.log"
|
|
300
351
|
localLog = os.path.join(os.getcwd(), logName)
|
|
301
|
-
open(localLog,
|
|
302
|
-
self.jobStore.import_file(
|
|
352
|
+
open(localLog, "w").close()
|
|
353
|
+
self.jobStore.import_file("file://" + localLog, logName, hardlink=True)
|
|
303
354
|
|
|
304
|
-
if os.path.exists(
|
|
355
|
+
if os.path.exists(
|
|
356
|
+
localLog
|
|
357
|
+
): # Bandaid for Jenkins tests failing stochastically and unexplainably.
|
|
305
358
|
os.remove(localLog)
|
|
306
359
|
|
|
307
360
|
def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> bool:
|
|
@@ -313,8 +366,11 @@ class Leader:
|
|
|
313
366
|
:returns: True if there are still active successors.
|
|
314
367
|
False if all successors have failed and the job is queued to run to handle the failed successors.
|
|
315
368
|
"""
|
|
316
|
-
logger.debug(
|
|
317
|
-
|
|
369
|
+
logger.debug(
|
|
370
|
+
"Successor job: %s of job: %s has failed " "" "predecessors",
|
|
371
|
+
self.toilState.get_job(successor_id),
|
|
372
|
+
self.toilState.get_job(predecessor_id),
|
|
373
|
+
)
|
|
318
374
|
|
|
319
375
|
# Add the job to the set having failed successors
|
|
320
376
|
self.toilState.hasFailedSuccessors.add(predecessor_id)
|
|
@@ -328,9 +384,12 @@ class Leader:
|
|
|
328
384
|
# If the job now has no active successors, add to active jobs
|
|
329
385
|
# so it can be processed as a job with failed successors.
|
|
330
386
|
if self.toilState.count_pending_successors(predecessor_id) == 0:
|
|
331
|
-
logger.debug(
|
|
332
|
-
|
|
333
|
-
|
|
387
|
+
logger.debug(
|
|
388
|
+
"Job: %s has no successors to run "
|
|
389
|
+
"and some are failed, adding to list of jobs "
|
|
390
|
+
"with failed successors",
|
|
391
|
+
self.toilState.get_job(predecessor_id),
|
|
392
|
+
)
|
|
334
393
|
self._messages.publish(JobUpdatedMessage(predecessor_id, 0))
|
|
335
394
|
# Report no successors are running
|
|
336
395
|
return False
|
|
@@ -338,7 +397,9 @@ class Leader:
|
|
|
338
397
|
# Some successors are still active
|
|
339
398
|
return True
|
|
340
399
|
|
|
341
|
-
def _checkSuccessorReadyToRunMultiplePredecessors(
|
|
400
|
+
def _checkSuccessorReadyToRunMultiplePredecessors(
|
|
401
|
+
self, successor_id: str, predecessor_id: str
|
|
402
|
+
) -> bool:
|
|
342
403
|
"""
|
|
343
404
|
Check if a successor job is ready to run when there are multiple predecessors.
|
|
344
405
|
|
|
@@ -359,8 +420,11 @@ class Leader:
|
|
|
359
420
|
# Grab the predecessor for reporting
|
|
360
421
|
predecessor = self.toilState.get_job(predecessor_id)
|
|
361
422
|
|
|
362
|
-
logger.debug(
|
|
363
|
-
|
|
423
|
+
logger.debug(
|
|
424
|
+
"Successor job: %s of job: %s has multiple " "predecessors",
|
|
425
|
+
successor,
|
|
426
|
+
predecessor,
|
|
427
|
+
)
|
|
364
428
|
|
|
365
429
|
# Add the predecessor as a finished predecessor to the successor
|
|
366
430
|
successor.predecessorsFinished.add(predecessor_id)
|
|
@@ -379,13 +443,17 @@ class Leader:
|
|
|
379
443
|
if len(successor.predecessorsFinished) == successor.predecessorNumber:
|
|
380
444
|
# All the successor's predecessors are done now.
|
|
381
445
|
# Remove the successor job from the set of waiting multi-predecessor jobs.
|
|
382
|
-
self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
|
|
446
|
+
self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
|
|
447
|
+
successor_id
|
|
448
|
+
)
|
|
383
449
|
return True
|
|
384
450
|
else:
|
|
385
451
|
# The job is not ready to run
|
|
386
452
|
return False
|
|
387
453
|
|
|
388
|
-
def _makeJobSuccessorReadyToRun(
|
|
454
|
+
def _makeJobSuccessorReadyToRun(
|
|
455
|
+
self, successor_id: str, predecessor_id: str
|
|
456
|
+
) -> bool:
|
|
389
457
|
"""
|
|
390
458
|
Make a successor job ready to run if possible.
|
|
391
459
|
|
|
@@ -393,7 +461,7 @@ class Leader:
|
|
|
393
461
|
:param predecessor_id: The job which the successor comes after.
|
|
394
462
|
:returns: False if the successor job should not yet be run or True otherwise.
|
|
395
463
|
"""
|
|
396
|
-
#Build map from successor to predecessors.
|
|
464
|
+
# Build map from successor to predecessors.
|
|
397
465
|
if successor_id not in self.toilState.successor_to_predecessors:
|
|
398
466
|
self.toilState.successor_to_predecessors[successor_id] = set()
|
|
399
467
|
if not isinstance(successor_id, str):
|
|
@@ -404,9 +472,15 @@ class Leader:
|
|
|
404
472
|
|
|
405
473
|
# Grab the successor
|
|
406
474
|
successor = self.toilState.get_job(successor_id)
|
|
407
|
-
logger.debug(
|
|
475
|
+
logger.debug(
|
|
476
|
+
"Added job %s as coming after job %s",
|
|
477
|
+
successor,
|
|
478
|
+
self.toilState.get_job(predecessor_id),
|
|
479
|
+
)
|
|
408
480
|
if successor.predecessorNumber > 1:
|
|
409
|
-
return self._checkSuccessorReadyToRunMultiplePredecessors(
|
|
481
|
+
return self._checkSuccessorReadyToRunMultiplePredecessors(
|
|
482
|
+
successor_id, predecessor_id
|
|
483
|
+
)
|
|
410
484
|
else:
|
|
411
485
|
return True
|
|
412
486
|
|
|
@@ -425,13 +499,20 @@ class Leader:
|
|
|
425
499
|
next_successors = predecessor.nextSuccessors()
|
|
426
500
|
|
|
427
501
|
if next_successors is None or len(next_successors) == 0:
|
|
428
|
-
raise RuntimeError(
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
502
|
+
raise RuntimeError(
|
|
503
|
+
f"Job {self} trying to run successors, but it doesn't have any"
|
|
504
|
+
)
|
|
505
|
+
logger.debug(
|
|
506
|
+
"Job: %s has %i successors to schedule",
|
|
507
|
+
predecessor_id,
|
|
508
|
+
len(next_successors),
|
|
509
|
+
)
|
|
510
|
+
# Record the number of successors that must be completed before
|
|
511
|
+
# the job can be considered again
|
|
433
512
|
if self.toilState.count_pending_successors(predecessor_id) != 0:
|
|
434
|
-
raise RuntimeError(
|
|
513
|
+
raise RuntimeError(
|
|
514
|
+
"Attempted to schedule successors of the same job twice!"
|
|
515
|
+
)
|
|
435
516
|
self.toilState.successors_pending(predecessor_id, len(next_successors))
|
|
436
517
|
|
|
437
518
|
# For each successor schedule if all predecessors have been completed
|
|
@@ -442,7 +523,11 @@ class Leader:
|
|
|
442
523
|
except NoSuchJobException:
|
|
443
524
|
# Job already done and gone, but probably shouldn't be. Or maybe isn't visible yet.
|
|
444
525
|
# TODO: Shouldn't this be an error?
|
|
445
|
-
logger.warning(
|
|
526
|
+
logger.warning(
|
|
527
|
+
"Job %s is a successor of %s but is already done and gone.",
|
|
528
|
+
successor_id,
|
|
529
|
+
predecessor_id,
|
|
530
|
+
)
|
|
446
531
|
# Don't try and run it
|
|
447
532
|
continue
|
|
448
533
|
if self._makeJobSuccessorReadyToRun(successor_id, predecessor_id):
|
|
@@ -464,46 +549,62 @@ class Leader:
|
|
|
464
549
|
# The job has services running; signal for them to be killed.
|
|
465
550
|
# Once they are killed, then the job will be updated again and then
|
|
466
551
|
# scheduled to be removed.
|
|
467
|
-
logger.warning(
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
552
|
+
logger.warning(
|
|
553
|
+
"Telling job %s to terminate its services due to successor failure",
|
|
554
|
+
predecessor,
|
|
555
|
+
)
|
|
556
|
+
self.serviceManager.kill_services(
|
|
557
|
+
self.toilState.servicesIssued[predecessor_id], error=True
|
|
558
|
+
)
|
|
471
559
|
elif self.toilState.count_pending_successors(predecessor_id) > 0:
|
|
472
560
|
# The job has non-service jobs running; wait for them to finish.
|
|
473
561
|
# the job will be re-added to the updated jobs when these jobs
|
|
474
562
|
# are done
|
|
475
|
-
logger.debug(
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
563
|
+
logger.debug(
|
|
564
|
+
"Job %s with ID: %s with failed successors still has successor jobs running",
|
|
565
|
+
predecessor,
|
|
566
|
+
predecessor_id,
|
|
567
|
+
)
|
|
568
|
+
elif (
|
|
569
|
+
isinstance(predecessor, CheckpointJobDescription)
|
|
570
|
+
and predecessor.checkpoint is not None
|
|
571
|
+
and predecessor.remainingTryCount > 1
|
|
572
|
+
):
|
|
480
573
|
# If the job is a checkpoint and has remaining retries...
|
|
481
574
|
# The logic behind using > 1 rather than > 0 here: Since this job has
|
|
482
575
|
# been tried once (without decreasing its try count as the job
|
|
483
576
|
# itself was successful), and its subtree failed, it shouldn't be retried
|
|
484
577
|
# unless it has more than 1 try.
|
|
485
578
|
if predecessor_id in self.toilState.jobs_issued:
|
|
486
|
-
logger.debug(
|
|
579
|
+
logger.debug(
|
|
580
|
+
"Checkpoint job %s was updated while issued", predecessor_id
|
|
581
|
+
)
|
|
487
582
|
else:
|
|
488
583
|
# It hasn't already been reissued.
|
|
489
584
|
# This check lets us be robust against repeated job update
|
|
490
585
|
# messages (such as from services starting *and* failing), by
|
|
491
586
|
# making sure that we don't stay in a state that where we
|
|
492
587
|
# reissue the job every time we get one.
|
|
493
|
-
logger.warning(
|
|
494
|
-
|
|
588
|
+
logger.warning(
|
|
589
|
+
"Job: %s is being restarted as a checkpoint after the total "
|
|
590
|
+
"failure of jobs in its subtree.",
|
|
591
|
+
predecessor_id,
|
|
592
|
+
)
|
|
495
593
|
self.issueJob(predecessor)
|
|
496
594
|
else:
|
|
497
595
|
# Mark it totally failed
|
|
498
|
-
logger.debug(
|
|
596
|
+
logger.debug(
|
|
597
|
+
"Job %s is being processed as completely failed", predecessor_id
|
|
598
|
+
)
|
|
499
599
|
self.processTotallyFailedJob(predecessor_id)
|
|
500
600
|
|
|
501
601
|
def _processReadyJob(self, job_id: str, result_status: int):
|
|
502
602
|
# We operate on the JobDescription mostly.
|
|
503
603
|
readyJob = self.toilState.get_job(job_id)
|
|
504
604
|
|
|
505
|
-
logger.debug(
|
|
506
|
-
|
|
605
|
+
logger.debug(
|
|
606
|
+
"Updating status of job %s with result status: %s", readyJob, result_status
|
|
607
|
+
)
|
|
507
608
|
|
|
508
609
|
# TODO: Filter out nonexistent successors/services now, so we can tell
|
|
509
610
|
# if they are all done and the job needs deleting?
|
|
@@ -516,14 +617,17 @@ class Leader:
|
|
|
516
617
|
# want to act on it; we want to wait until it gets the update it
|
|
517
618
|
# gets when the service manager is done trying to start its
|
|
518
619
|
# services.
|
|
519
|
-
logger.debug(
|
|
520
|
-
|
|
620
|
+
logger.debug(
|
|
621
|
+
"Got a job to update which is still owned by the service "
|
|
622
|
+
"manager: %s",
|
|
623
|
+
readyJob.jobStoreID,
|
|
624
|
+
)
|
|
521
625
|
elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
|
|
522
626
|
self._processFailedSuccessors(job_id)
|
|
523
|
-
elif readyJob.
|
|
524
|
-
# The job has a
|
|
627
|
+
elif readyJob.has_body() or result_status != 0:
|
|
628
|
+
# The job has a body it must be run before any successors.
|
|
525
629
|
# Similarly, if the job previously failed we rerun it, even if it doesn't have a
|
|
526
|
-
#
|
|
630
|
+
# body to run, to eliminate any parts of the stack now completed.
|
|
527
631
|
isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client
|
|
528
632
|
|
|
529
633
|
# We want to run the job, and expend one of its "tries" (possibly
|
|
@@ -531,8 +635,9 @@ class Leader:
|
|
|
531
635
|
|
|
532
636
|
# If the job has run out of tries or is a service job whose error flag has
|
|
533
637
|
# been indicated, fail the job.
|
|
534
|
-
if
|
|
535
|
-
|
|
638
|
+
if readyJob.remainingTryCount == 0 or (
|
|
639
|
+
isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID)
|
|
640
|
+
):
|
|
536
641
|
self.processTotallyFailedJob(job_id)
|
|
537
642
|
logger.warning("Job %s is completely failed", readyJob)
|
|
538
643
|
else:
|
|
@@ -543,28 +648,39 @@ class Leader:
|
|
|
543
648
|
# Build a map from the service jobs to the job and a map
|
|
544
649
|
# of the services created for the job
|
|
545
650
|
if readyJob.jobStoreID in self.toilState.servicesIssued:
|
|
546
|
-
raise RuntimeError(
|
|
651
|
+
raise RuntimeError(
|
|
652
|
+
f"The ready job: {readyJob.jobStoreID} was already issued."
|
|
653
|
+
)
|
|
547
654
|
self.toilState.servicesIssued[readyJob.jobStoreID] = set()
|
|
548
655
|
for serviceJobList in readyJob.serviceHostIDsInBatches():
|
|
549
656
|
for serviceID in serviceJobList:
|
|
550
657
|
if serviceID in self.toilState.service_to_client:
|
|
551
|
-
raise RuntimeError(
|
|
658
|
+
raise RuntimeError(
|
|
659
|
+
f"The ready service ID: {serviceID} was already added."
|
|
660
|
+
)
|
|
661
|
+
# TODO: Why do we refresh here?
|
|
552
662
|
self.toilState.reset_job(serviceID)
|
|
553
663
|
serviceHost = self.toilState.get_job(serviceID)
|
|
554
664
|
self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
|
|
555
665
|
self.toilState.servicesIssued[readyJob.jobStoreID].add(serviceID)
|
|
556
666
|
|
|
557
|
-
logger.debug(
|
|
667
|
+
logger.debug(
|
|
668
|
+
"Giving job: %s to service manager to schedule its jobs", readyJob
|
|
669
|
+
)
|
|
558
670
|
# Use the service manager to start the services
|
|
559
671
|
self.serviceManager.put_client(job_id)
|
|
560
672
|
elif readyJob.nextSuccessors() is not None:
|
|
561
673
|
# There are successors to run
|
|
562
674
|
self._runJobSuccessors(job_id)
|
|
563
675
|
elif readyJob.jobStoreID in self.toilState.servicesIssued:
|
|
564
|
-
logger.debug(
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
676
|
+
logger.debug(
|
|
677
|
+
"Telling job: %s to terminate its services due to the "
|
|
678
|
+
"successful completion of its successor jobs",
|
|
679
|
+
readyJob,
|
|
680
|
+
)
|
|
681
|
+
self.serviceManager.kill_services(
|
|
682
|
+
self.toilState.servicesIssued[readyJob.jobStoreID], error=False
|
|
683
|
+
)
|
|
568
684
|
else:
|
|
569
685
|
# There are no remaining tasks to schedule within the job.
|
|
570
686
|
#
|
|
@@ -593,7 +709,10 @@ class Leader:
|
|
|
593
709
|
try:
|
|
594
710
|
self.toilState.delete_job(readyJob.jobStoreID)
|
|
595
711
|
except Exception as e:
|
|
596
|
-
logger.exception(
|
|
712
|
+
logger.exception(
|
|
713
|
+
"Re-processing success for job we could not remove: %s",
|
|
714
|
+
readyJob,
|
|
715
|
+
)
|
|
597
716
|
# Kick it back to being handled as succeeded again. We
|
|
598
717
|
# don't want to have a failure here cause a Toil-level
|
|
599
718
|
# retry which causes more actual jobs to try to run.
|
|
@@ -605,12 +724,18 @@ class Leader:
|
|
|
605
724
|
self.processRemovedJob(readyJob, 0)
|
|
606
725
|
else:
|
|
607
726
|
self.processTotallyFailedJob(job_id)
|
|
608
|
-
logger.error(
|
|
727
|
+
logger.error(
|
|
728
|
+
"Job: %s is empty but completely failed - something is very wrong",
|
|
729
|
+
readyJob.jobStoreID,
|
|
730
|
+
)
|
|
609
731
|
|
|
610
732
|
def _processReadyJobs(self):
|
|
611
733
|
"""Process jobs that are ready to be scheduled/have successors to schedule."""
|
|
612
|
-
logger.debug(
|
|
613
|
-
|
|
734
|
+
logger.debug(
|
|
735
|
+
"Built the jobs list, currently have %i jobs to update and %i jobs issued",
|
|
736
|
+
self._messages.count(JobUpdatedMessage),
|
|
737
|
+
self.getNumberOfJobsIssued(),
|
|
738
|
+
)
|
|
614
739
|
|
|
615
740
|
# Now go through and, for each job that has updated this tick, process it.
|
|
616
741
|
|
|
@@ -625,9 +750,13 @@ class Leader:
|
|
|
625
750
|
if message.job_id in handled_with_status:
|
|
626
751
|
if handled_with_status[message.job_id] == message.result_status:
|
|
627
752
|
# This is a harmless duplicate
|
|
628
|
-
logger.debug(
|
|
629
|
-
|
|
630
|
-
|
|
753
|
+
logger.debug(
|
|
754
|
+
"Job %s already updated this tick with status %s and "
|
|
755
|
+
"we've received duplicate message %s",
|
|
756
|
+
message.job_id,
|
|
757
|
+
handled_with_status[message.job_id],
|
|
758
|
+
message,
|
|
759
|
+
)
|
|
631
760
|
else:
|
|
632
761
|
# This is a conflicting update. We may have already treated
|
|
633
762
|
# a job as succeeding but now we've heard it's failed, or
|
|
@@ -635,9 +764,13 @@ class Leader:
|
|
|
635
764
|
# This probably shouldn't happen, but does because the
|
|
636
765
|
# scheduler is not correct somehow and hasn't been for a
|
|
637
766
|
# long time. Complain about it.
|
|
638
|
-
logger.warning(
|
|
639
|
-
|
|
640
|
-
|
|
767
|
+
logger.warning(
|
|
768
|
+
"Job %s already updated this tick with status %s "
|
|
769
|
+
"but we've now received %s",
|
|
770
|
+
message.job_id,
|
|
771
|
+
handled_with_status[message.job_id],
|
|
772
|
+
message,
|
|
773
|
+
)
|
|
641
774
|
# Either way, we only want to handle one update per tick, like
|
|
642
775
|
# the old dict-based implementation.
|
|
643
776
|
continue
|
|
@@ -655,16 +788,21 @@ class Leader:
|
|
|
655
788
|
if service_id is None:
|
|
656
789
|
break
|
|
657
790
|
|
|
658
|
-
logger.debug(
|
|
791
|
+
logger.debug(
|
|
792
|
+
"Launching service job: %s", self.toilState.get_job(service_id)
|
|
793
|
+
)
|
|
659
794
|
self.issueServiceJob(service_id)
|
|
660
795
|
|
|
661
796
|
def _processJobsWithRunningServices(self):
|
|
662
797
|
"""Get jobs whose services have started."""
|
|
663
798
|
while True:
|
|
664
799
|
client_id = self.serviceManager.get_ready_client(0)
|
|
665
|
-
if client_id is None:
|
|
800
|
+
if client_id is None: # Stop trying to get jobs when function returns None
|
|
666
801
|
break
|
|
667
|
-
logger.debug(
|
|
802
|
+
logger.debug(
|
|
803
|
+
"Job: %s has established its services; all services are running",
|
|
804
|
+
client_id,
|
|
805
|
+
)
|
|
668
806
|
|
|
669
807
|
# Grab the client job description
|
|
670
808
|
client = self.toilState.get_job(client_id)
|
|
@@ -677,9 +815,9 @@ class Leader:
|
|
|
677
815
|
"""Get jobs whose services have failed to start."""
|
|
678
816
|
while True:
|
|
679
817
|
client_id = self.serviceManager.get_unservable_client(0)
|
|
680
|
-
if client_id is None:
|
|
818
|
+
if client_id is None: # Stop trying to get jobs when function returns None
|
|
681
819
|
break
|
|
682
|
-
logger.debug(
|
|
820
|
+
logger.debug("Job: %s has failed to establish its services.", client_id)
|
|
683
821
|
|
|
684
822
|
# Grab the client job description
|
|
685
823
|
client = self.toilState.get_job(client_id)
|
|
@@ -694,29 +832,56 @@ class Leader:
|
|
|
694
832
|
def _gatherUpdatedJobs(self, updatedJobTuple):
|
|
695
833
|
"""Gather any new, updated JobDescriptions from the batch system."""
|
|
696
834
|
bsID, exitStatus, exitReason, wallTime = (
|
|
697
|
-
updatedJobTuple.jobID,
|
|
698
|
-
updatedJobTuple.
|
|
835
|
+
updatedJobTuple.jobID,
|
|
836
|
+
updatedJobTuple.exitStatus,
|
|
837
|
+
updatedJobTuple.exitReason,
|
|
838
|
+
updatedJobTuple.wallTime,
|
|
839
|
+
)
|
|
699
840
|
# easy, track different state
|
|
700
841
|
try:
|
|
701
|
-
updatedJob = self.toilState.get_job(
|
|
842
|
+
updatedJob = self.toilState.get_job(
|
|
843
|
+
self.issued_jobs_by_batch_system_id[bsID]
|
|
844
|
+
)
|
|
702
845
|
except KeyError:
|
|
703
|
-
logger.warning(
|
|
846
|
+
logger.warning(
|
|
847
|
+
"A result seems to already have been processed for job %s", bsID
|
|
848
|
+
)
|
|
704
849
|
else:
|
|
705
850
|
if exitStatus == 0:
|
|
706
|
-
logger.debug(
|
|
851
|
+
logger.debug("Job ended: %s", updatedJob)
|
|
707
852
|
else:
|
|
708
|
-
|
|
709
|
-
|
|
853
|
+
status_string = (
|
|
854
|
+
str(exitStatus)
|
|
855
|
+
if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE
|
|
856
|
+
else "<UNAVAILABLE>"
|
|
857
|
+
)
|
|
858
|
+
logger.warning(
|
|
859
|
+
f"Job failed with exit value {status_string}: {updatedJob}\n"
|
|
860
|
+
f"Exit reason: {BatchJobExitReason.to_string(exitReason)}"
|
|
861
|
+
)
|
|
862
|
+
# This logic is undefined for which of the failing jobs will send its exit code
|
|
863
|
+
# when there are multiple failing jobs with different exit statuses
|
|
864
|
+
self.recommended_fail_exit_code = exitStatus
|
|
710
865
|
if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
|
|
711
866
|
# This is a CWL job informing us that the workflow is
|
|
712
867
|
# asking things of us that Toil can't do. When we raise an
|
|
713
868
|
# exception because of this, make sure to forward along
|
|
714
869
|
# this exit code.
|
|
715
870
|
logger.warning("This indicates an unsupported CWL requirement!")
|
|
716
|
-
self.recommended_fail_exit_code =
|
|
871
|
+
self.recommended_fail_exit_code = (
|
|
872
|
+
CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
873
|
+
)
|
|
717
874
|
# Tell everyone it stopped running.
|
|
718
|
-
self._messages.publish(
|
|
719
|
-
|
|
875
|
+
self._messages.publish(
|
|
876
|
+
JobCompletedMessage(
|
|
877
|
+
get_job_kind(updatedJob.get_names()),
|
|
878
|
+
updatedJob.jobStoreID,
|
|
879
|
+
exitStatus,
|
|
880
|
+
)
|
|
881
|
+
)
|
|
882
|
+
self.process_finished_job(
|
|
883
|
+
bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason
|
|
884
|
+
)
|
|
720
885
|
|
|
721
886
|
def _processLostJobs(self):
|
|
722
887
|
"""Process jobs that have gone awry."""
|
|
@@ -724,7 +889,9 @@ class Leader:
|
|
|
724
889
|
# gather for rescueJobsFrequency seconds) check if there are any jobs
|
|
725
890
|
# that have run too long (see self.reissueOverLongJobs) or which have
|
|
726
891
|
# gone missing from the batch system (see self.reissueMissingJobs)
|
|
727
|
-
if (
|
|
892
|
+
if (
|
|
893
|
+
time.time() - self.timeSinceJobsLastRescued
|
|
894
|
+
) >= self.config.rescueJobsFrequency:
|
|
728
895
|
# We only rescue jobs every N seconds, and when we have apparently
|
|
729
896
|
# exhausted the current job supply
|
|
730
897
|
self.reissueOverLongJobs()
|
|
@@ -744,9 +911,11 @@ class Leader:
|
|
|
744
911
|
"""
|
|
745
912
|
self.timeSinceJobsLastRescued = time.time()
|
|
746
913
|
|
|
747
|
-
while
|
|
748
|
-
|
|
749
|
-
|
|
914
|
+
while (
|
|
915
|
+
self._messages.count(JobUpdatedMessage) > 0
|
|
916
|
+
or self.getNumberOfJobsIssued()
|
|
917
|
+
or self.serviceManager.get_job_count()
|
|
918
|
+
):
|
|
750
919
|
|
|
751
920
|
if self._messages.count(JobUpdatedMessage) > 0:
|
|
752
921
|
self._processReadyJobs()
|
|
@@ -798,13 +967,21 @@ class Leader:
|
|
|
798
967
|
if not self._messages.empty():
|
|
799
968
|
raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
|
|
800
969
|
if self.toilState.successorCounts != {}:
|
|
801
|
-
raise RuntimeError(
|
|
970
|
+
raise RuntimeError(
|
|
971
|
+
f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
|
|
972
|
+
)
|
|
802
973
|
if self.toilState.successor_to_predecessors != {}:
|
|
803
|
-
raise RuntimeError(
|
|
974
|
+
raise RuntimeError(
|
|
975
|
+
f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
|
|
976
|
+
)
|
|
804
977
|
if self.toilState.service_to_client != {}:
|
|
805
|
-
raise RuntimeError(
|
|
978
|
+
raise RuntimeError(
|
|
979
|
+
f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
|
|
980
|
+
)
|
|
806
981
|
if self.toilState.servicesIssued != {}:
|
|
807
|
-
raise RuntimeError(
|
|
982
|
+
raise RuntimeError(
|
|
983
|
+
f"Services running at shutdown: {self.toilState.servicesIssued}"
|
|
984
|
+
)
|
|
808
985
|
|
|
809
986
|
def checkForDeadlocks(self):
|
|
810
987
|
"""Check if the system is deadlocked running service jobs."""
|
|
@@ -814,18 +991,22 @@ class Leader:
|
|
|
814
991
|
# If there are no updated jobs and at least some jobs running
|
|
815
992
|
if totalServicesIssued >= totalRunningJobs and totalRunningJobs > 0:
|
|
816
993
|
# Collect all running service job store IDs into a set to compare with the deadlock set
|
|
817
|
-
running_service_ids:
|
|
994
|
+
running_service_ids: set[str] = set()
|
|
818
995
|
for js_id in self.issued_jobs_by_batch_system_id.values():
|
|
819
996
|
job = self.toilState.get_job(js_id)
|
|
820
|
-
if isinstance(
|
|
997
|
+
if isinstance(
|
|
998
|
+
job, ServiceJobDescription
|
|
999
|
+
) and self.serviceManager.is_running(js_id):
|
|
821
1000
|
running_service_ids.add(js_id)
|
|
822
1001
|
|
|
823
1002
|
if len(running_service_ids) > totalRunningJobs:
|
|
824
1003
|
# This is too many services.
|
|
825
1004
|
# TODO: couldn't more jobs have started since we polled the
|
|
826
1005
|
# running job count?
|
|
827
|
-
raise RuntimeError(
|
|
828
|
-
|
|
1006
|
+
raise RuntimeError(
|
|
1007
|
+
f"Supposedly running {len(running_service_ids)} services, which is"
|
|
1008
|
+
f"more than the {totalRunningJobs} currently running jobs overall."
|
|
1009
|
+
)
|
|
829
1010
|
|
|
830
1011
|
# If all the running jobs are active services then we have a potential deadlock
|
|
831
1012
|
if len(running_service_ids) == totalRunningJobs:
|
|
@@ -839,27 +1020,49 @@ class Leader:
|
|
|
839
1020
|
# Use a generic message if none is available
|
|
840
1021
|
message = "Cluster may be too small."
|
|
841
1022
|
|
|
842
|
-
|
|
843
1023
|
# See if this is a new potential deadlock
|
|
844
1024
|
if self.potentialDeadlockedJobs != running_service_ids:
|
|
845
|
-
logger.warning(
|
|
846
|
-
|
|
1025
|
+
logger.warning(
|
|
1026
|
+
(
|
|
1027
|
+
"Potential deadlock detected! All %s running jobs are service jobs, "
|
|
1028
|
+
"with no normal jobs to use them! %s"
|
|
1029
|
+
),
|
|
1030
|
+
totalRunningJobs,
|
|
1031
|
+
message,
|
|
1032
|
+
)
|
|
847
1033
|
self.potentialDeadlockedJobs = running_service_ids
|
|
848
1034
|
self.potentialDeadlockTime = time.time()
|
|
849
1035
|
else:
|
|
850
1036
|
# We wait self.config.deadlockWait seconds before declaring the system deadlocked
|
|
851
1037
|
stuckFor = time.time() - self.potentialDeadlockTime
|
|
852
1038
|
if stuckFor >= self.config.deadlockWait:
|
|
853
|
-
logger.error(
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
1039
|
+
logger.error(
|
|
1040
|
+
"We have been deadlocked since %s on these service jobs: %s",
|
|
1041
|
+
self.potentialDeadlockTime,
|
|
1042
|
+
self.potentialDeadlockedJobs,
|
|
1043
|
+
)
|
|
1044
|
+
raise DeadlockException(
|
|
1045
|
+
(
|
|
1046
|
+
"The workflow is service deadlocked - all %d running jobs "
|
|
1047
|
+
"have been the same active services for at least %s seconds"
|
|
1048
|
+
)
|
|
1049
|
+
% (totalRunningJobs, self.config.deadlockWait)
|
|
1050
|
+
)
|
|
857
1051
|
else:
|
|
858
1052
|
# Complain that we are still stuck.
|
|
859
|
-
waitingNormalJobs =
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
1053
|
+
waitingNormalJobs = (
|
|
1054
|
+
self.getNumberOfJobsIssued() - totalServicesIssued
|
|
1055
|
+
)
|
|
1056
|
+
logger.warning(
|
|
1057
|
+
(
|
|
1058
|
+
"Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
|
|
1059
|
+
"for any of %d issued non-service jobs to schedule and start. %s"
|
|
1060
|
+
),
|
|
1061
|
+
stuckFor,
|
|
1062
|
+
self.config.deadlockWait - stuckFor,
|
|
1063
|
+
waitingNormalJobs,
|
|
1064
|
+
message,
|
|
1065
|
+
)
|
|
863
1066
|
else:
|
|
864
1067
|
# We have observed non-service jobs running, so reset the potential deadlock
|
|
865
1068
|
self.feed_deadlock_watchdog()
|
|
@@ -880,34 +1083,38 @@ class Leader:
|
|
|
880
1083
|
"""Add a job to the queue of jobs currently trying to run."""
|
|
881
1084
|
# Never issue the same job multiple times simultaneously
|
|
882
1085
|
if jobNode.jobStoreID in self.toilState.jobs_issued:
|
|
883
|
-
raise RuntimeError(
|
|
1086
|
+
raise RuntimeError(
|
|
1087
|
+
f"Attempted to issue {jobNode} multiple times simultaneously!"
|
|
1088
|
+
)
|
|
884
1089
|
|
|
885
|
-
workerCommand = [
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
1090
|
+
workerCommand = [
|
|
1091
|
+
resolveEntryPoint("_toil_worker"),
|
|
1092
|
+
jobNode.jobName,
|
|
1093
|
+
self.jobStoreLocator,
|
|
1094
|
+
jobNode.jobStoreID,
|
|
1095
|
+
]
|
|
889
1096
|
|
|
890
1097
|
for context in self.batchSystem.getWorkerContexts():
|
|
891
1098
|
# For each context manager hook the batch system wants to run in
|
|
892
1099
|
# the worker, serialize and send it.
|
|
893
|
-
workerCommand.append(
|
|
894
|
-
workerCommand.append(
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
# the job store, or we will detach the job body from the job
|
|
898
|
-
# description. TODO: Don't do it this way! It's weird!
|
|
899
|
-
jobNode.command = ' '.join(workerCommand)
|
|
1100
|
+
workerCommand.append("--context")
|
|
1101
|
+
workerCommand.append(
|
|
1102
|
+
base64.b64encode(pickle.dumps(context)).decode("utf-8")
|
|
1103
|
+
)
|
|
900
1104
|
|
|
901
|
-
omp_threads = os.environ.get(
|
|
902
|
-
|
|
1105
|
+
omp_threads = os.environ.get("OMP_NUM_THREADS") or str(
|
|
1106
|
+
max(1, int(jobNode.cores))
|
|
1107
|
+
) # make sure OMP_NUM_THREADS is a positive integer
|
|
903
1108
|
|
|
904
1109
|
job_environment = {
|
|
905
1110
|
# Set the number of cores used by OpenMP applications
|
|
906
|
-
|
|
1111
|
+
"OMP_NUM_THREADS": omp_threads,
|
|
907
1112
|
}
|
|
908
1113
|
|
|
909
1114
|
# jobBatchSystemID is an int for each job
|
|
910
|
-
jobBatchSystemID = self.batchSystem.issueBatchJob(
|
|
1115
|
+
jobBatchSystemID = self.batchSystem.issueBatchJob(
|
|
1116
|
+
" ".join(workerCommand), jobNode, job_environment=job_environment
|
|
1117
|
+
)
|
|
911
1118
|
# Record the job by the ID the batch system will use to talk about it with us
|
|
912
1119
|
self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
|
|
913
1120
|
# Record that this job is issued right now and shouldn't e.g. be issued again.
|
|
@@ -917,11 +1124,18 @@ class Leader:
|
|
|
917
1124
|
# so increment this value after the job is added to the issuedJob dict
|
|
918
1125
|
self.preemptibleJobsIssued += 1
|
|
919
1126
|
cur_logger = logger.debug if jobNode.local else logger.info
|
|
920
|
-
cur_logger(
|
|
921
|
-
|
|
922
|
-
|
|
1127
|
+
cur_logger(
|
|
1128
|
+
"Issued job %s with job batch system ID: " "%s and %s",
|
|
1129
|
+
jobNode,
|
|
1130
|
+
str(jobBatchSystemID),
|
|
1131
|
+
jobNode.requirements_string(),
|
|
1132
|
+
)
|
|
923
1133
|
# Tell everyone it is issued and the queue size changed
|
|
924
|
-
self._messages.publish(
|
|
1134
|
+
self._messages.publish(
|
|
1135
|
+
JobIssuedMessage(
|
|
1136
|
+
get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID
|
|
1137
|
+
)
|
|
1138
|
+
)
|
|
925
1139
|
self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
|
|
926
1140
|
# Tell the user there's another job to do
|
|
927
1141
|
self.progress_overall.total += 1
|
|
@@ -941,7 +1155,9 @@ class Leader:
|
|
|
941
1155
|
# Grab the service job description
|
|
942
1156
|
service = self.toilState.get_job(service_id)
|
|
943
1157
|
if not isinstance(service, ServiceJobDescription):
|
|
944
|
-
raise RuntimeError(
|
|
1158
|
+
raise RuntimeError(
|
|
1159
|
+
"The grabbed service job description is not the right type."
|
|
1160
|
+
)
|
|
945
1161
|
|
|
946
1162
|
if service.preemptible:
|
|
947
1163
|
self.preemptibleServiceJobsToBeIssued.append(service_id)
|
|
@@ -951,14 +1167,23 @@ class Leader:
|
|
|
951
1167
|
|
|
952
1168
|
def issueQueingServiceJobs(self):
|
|
953
1169
|
"""Issues any queuing service jobs up to the limit of the maximum allowed."""
|
|
954
|
-
while
|
|
1170
|
+
while (
|
|
1171
|
+
len(self.serviceJobsToBeIssued) > 0
|
|
1172
|
+
and self.serviceJobsIssued < self.config.maxServiceJobs
|
|
1173
|
+
):
|
|
955
1174
|
self.issueJob(self.toilState.get_job(self.serviceJobsToBeIssued.pop()))
|
|
956
1175
|
self.serviceJobsIssued += 1
|
|
957
|
-
while
|
|
958
|
-
|
|
1176
|
+
while (
|
|
1177
|
+
len(self.preemptibleServiceJobsToBeIssued) > 0
|
|
1178
|
+
and self.preemptibleServiceJobsIssued
|
|
1179
|
+
< self.config.maxPreemptibleServiceJobs
|
|
1180
|
+
):
|
|
1181
|
+
self.issueJob(
|
|
1182
|
+
self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop())
|
|
1183
|
+
)
|
|
959
1184
|
self.preemptibleServiceJobsIssued += 1
|
|
960
1185
|
|
|
961
|
-
def getNumberOfJobsIssued(self, preemptible: Optional[bool]=None) -> int:
|
|
1186
|
+
def getNumberOfJobsIssued(self, preemptible: Optional[bool] = None) -> int:
|
|
962
1187
|
"""
|
|
963
1188
|
Get number of jobs that have been added by issueJob(s) and not removed by removeJob.
|
|
964
1189
|
|
|
@@ -1008,12 +1233,16 @@ class Leader:
|
|
|
1008
1233
|
"""
|
|
1009
1234
|
if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
|
|
1010
1235
|
raise RuntimeError("Job was already removed or was never issued.")
|
|
1011
|
-
issuedDesc = self.toilState.get_job(
|
|
1236
|
+
issuedDesc = self.toilState.get_job(
|
|
1237
|
+
self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
1238
|
+
)
|
|
1012
1239
|
if issuedDesc.preemptible:
|
|
1013
1240
|
# len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
|
|
1014
1241
|
# so decrement this value before removing the job from the issuedJob map
|
|
1015
1242
|
if self.preemptibleJobsIssued <= 0:
|
|
1016
|
-
raise RuntimeError(
|
|
1243
|
+
raise RuntimeError(
|
|
1244
|
+
"The number of preemptive issued jobs cannot be negative."
|
|
1245
|
+
)
|
|
1017
1246
|
self.preemptibleJobsIssued -= 1
|
|
1018
1247
|
# It's not issued anymore.
|
|
1019
1248
|
del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
@@ -1033,19 +1262,24 @@ class Leader:
|
|
|
1033
1262
|
|
|
1034
1263
|
return issuedDesc
|
|
1035
1264
|
|
|
1036
|
-
def getJobs(self, preemptible: Optional[bool] = None) ->
|
|
1265
|
+
def getJobs(self, preemptible: Optional[bool] = None) -> list[JobDescription]:
|
|
1037
1266
|
"""
|
|
1038
1267
|
Get all issued jobs.
|
|
1039
1268
|
|
|
1040
1269
|
:param preemptible: If specified, select only preemptible or only non-preemptible jobs.
|
|
1041
1270
|
"""
|
|
1042
1271
|
|
|
1043
|
-
jobs = [
|
|
1272
|
+
jobs = [
|
|
1273
|
+
self.toilState.get_job(job_store_id)
|
|
1274
|
+
for job_store_id in self.issued_jobs_by_batch_system_id.values()
|
|
1275
|
+
]
|
|
1044
1276
|
if preemptible is not None:
|
|
1045
1277
|
jobs = [job for job in jobs if job.preemptible == preemptible]
|
|
1046
1278
|
return jobs
|
|
1047
1279
|
|
|
1048
|
-
def killJobs(
|
|
1280
|
+
def killJobs(
|
|
1281
|
+
self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED
|
|
1282
|
+
):
|
|
1049
1283
|
"""
|
|
1050
1284
|
Kills the given set of jobs and then sends them for processing.
|
|
1051
1285
|
|
|
@@ -1059,7 +1293,9 @@ class Leader:
|
|
|
1059
1293
|
self.batchSystem.killBatchJobs(jobsToKill)
|
|
1060
1294
|
for jobBatchSystemID in jobsToKill:
|
|
1061
1295
|
# Reissue immediately, noting that we killed the job
|
|
1062
|
-
willRerun = self.process_finished_job(
|
|
1296
|
+
willRerun = self.process_finished_job(
|
|
1297
|
+
jobBatchSystemID, 1, exit_reason=exit_reason
|
|
1298
|
+
)
|
|
1063
1299
|
|
|
1064
1300
|
if willRerun:
|
|
1065
1301
|
# Compose a list of all the jobs that will run again
|
|
@@ -1067,8 +1303,7 @@ class Leader:
|
|
|
1067
1303
|
|
|
1068
1304
|
return jobsRerunning
|
|
1069
1305
|
|
|
1070
|
-
|
|
1071
|
-
#Following functions handle error cases for when jobs have gone awry with the batch system.
|
|
1306
|
+
# Following functions handle error cases for when jobs have gone awry with the batch system.
|
|
1072
1307
|
|
|
1073
1308
|
def reissueOverLongJobs(self) -> None:
|
|
1074
1309
|
"""
|
|
@@ -1079,20 +1314,30 @@ class Leader:
|
|
|
1079
1314
|
"""
|
|
1080
1315
|
maxJobDuration = self.config.maxJobDuration
|
|
1081
1316
|
jobsToKill = []
|
|
1082
|
-
if
|
|
1317
|
+
if (
|
|
1318
|
+
maxJobDuration < 10000000
|
|
1319
|
+
): # We won't bother doing anything if rescue time > 16 weeks.
|
|
1083
1320
|
runningJobs = self.batchSystem.getRunningBatchJobIDs()
|
|
1084
1321
|
for jobBatchSystemID in list(runningJobs.keys()):
|
|
1085
1322
|
if runningJobs[jobBatchSystemID] > maxJobDuration:
|
|
1086
|
-
logger.warning(
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1323
|
+
logger.warning(
|
|
1324
|
+
"The job: %s has been running for: %s seconds, more than the "
|
|
1325
|
+
"max job duration: %s, we'll kill it",
|
|
1326
|
+
self.issued_jobs_by_batch_system_id[jobBatchSystemID],
|
|
1327
|
+
str(runningJobs[jobBatchSystemID]),
|
|
1328
|
+
str(maxJobDuration),
|
|
1329
|
+
)
|
|
1091
1330
|
jobsToKill.append(jobBatchSystemID)
|
|
1092
|
-
reissued = self.killJobs(
|
|
1331
|
+
reissued = self.killJobs(
|
|
1332
|
+
jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION
|
|
1333
|
+
)
|
|
1093
1334
|
if len(jobsToKill) > 0:
|
|
1094
1335
|
# Summarize our actions
|
|
1095
|
-
logger.info(
|
|
1336
|
+
logger.info(
|
|
1337
|
+
"Killed %d over long jobs and reissued %d of them",
|
|
1338
|
+
len(jobsToKill),
|
|
1339
|
+
len(reissued),
|
|
1340
|
+
)
|
|
1096
1341
|
|
|
1097
1342
|
def reissueMissingJobs(self, killAfterNTimesMissing=3):
|
|
1098
1343
|
"""
|
|
```diff
@@ -1104,11 +1349,13 @@ class Leader:
         """
         issuedJobs = set(self.batchSystem.getIssuedBatchJobIDs())
         jobBatchSystemIDsSet = set(list(self.issued_jobs_by_batch_system_id.keys()))
-        #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
+        # Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
         missingJobIDsSet = set(list(self.reissueMissingJobs_missingHash.keys()))
         for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
             self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
-            logger.warning(
+            logger.warning(
+                "Batch system id: %s is no longer missing", str(jobBatchSystemID)
+            )
         # checks we have no unexpected jobs running
         if not issuedJobs.issubset(jobBatchSystemIDsSet):
             raise RuntimeError("An unexpected job is still running.")
@@ -1120,24 +1367,33 @@ class Leader:
             else:
                 self.reissueMissingJobs_missingHash[jobBatchSystemID] = 1
             timesMissing = self.reissueMissingJobs_missingHash[jobBatchSystemID]
-            logger.warning(
-
+            logger.warning(
+                "Job store ID %s with batch system id %s is missing for the %i time",
+                jobStoreID,
+                str(jobBatchSystemID),
+                timesMissing,
+            )
             # Tell everyone it is missing
             self._messages.publish(JobMissingMessage(jobStoreID))
             if timesMissing == killAfterNTimesMissing:
                 self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
                 jobsToKill.append(jobBatchSystemID)
-        self.killJobs(jobsToKill)
-        return len(
-        #if there are missing jobs
+        self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
+        return len(self.reissueMissingJobs_missingHash) == 0  # We use this to inform
+        # if there are missing jobs

     def processRemovedJob(self, issuedJob, result_status):
         if result_status != 0:
-            logger.warning(
-
+            logger.warning(
+                "Despite the batch system claiming failure the "
+                "job %s seems to have finished and been removed",
+                issuedJob,
+            )
         self._updatePredecessorStatus(issuedJob.jobStoreID)

-    def process_finished_job(
+    def process_finished_job(
+        self, batch_system_id, result_status, wall_time=None, exit_reason=None
+    ) -> bool:
         """
         Process finished jobs.

```
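`reissueMissingJobs` counts how many consecutive scans a job has been absent from the batch system's issued list and, once the count reaches `killAfterNTimesMissing`, kills it (now with a `MISSING` exit reason). A toy version of that bookkeeping, with the leader and batch system reduced to plain sets:

```python
import logging

logger = logging.getLogger(__name__)


def find_jobs_to_kill(
    issued_by_leader: set[int],
    issued_by_batch_system: set[int],
    missing_counts: dict[int, int],
    kill_after_n_times_missing: int = 3,
) -> list[int]:
    """Update missing_counts in place and return the job IDs to give up on.

    A job the leader believes is issued but that the batch system no longer
    reports is "missing"; after kill_after_n_times_missing consecutive scans
    it is killed. These arguments are stand-ins for Toil's leader state.
    """
    # Clean up the hash, getting rid of jobs that have turned up again.
    for job_id in list(missing_counts):
        if job_id in issued_by_batch_system:
            logger.warning("Batch system id: %s is no longer missing", job_id)
            del missing_counts[job_id]

    jobs_to_kill = []
    for job_id in issued_by_leader - issued_by_batch_system:
        missing_counts[job_id] = missing_counts.get(job_id, 0) + 1
        logger.warning(
            "Job with batch system id %s is missing for the %i time",
            job_id,
            missing_counts[job_id],
        )
        if missing_counts[job_id] == kill_after_n_times_missing:
            del missing_counts[job_id]
            jobs_to_kill.append(job_id)
    return jobs_to_kill


counts: dict[int, int] = {}
for _ in range(3):
    doomed = find_jobs_to_kill({1, 2}, {2}, counts)
print(doomed)  # [1]: job 1 was missing on three consecutive scans
```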
```diff
@@ -1157,15 +1413,21 @@ class Leader:
             self.progress_overall.update(incr=-1)
             self.progress_failed.update(incr=1)

-        # Delegate to the
-        return self.process_finished_job_description(
-
-
-
-
-
+        # Delegate to the version that uses a JobDescription
+        return self.process_finished_job_description(
+            issued_job, result_status, wall_time, exit_reason, batch_system_id
+        )
+
+    def process_finished_job_description(
+        self,
+        finished_job: JobDescription,
+        result_status: int,
+        wall_time: Optional[float] = None,
+        exit_reason: Optional[BatchJobExitReason] = None,
+        batch_system_id: Optional[int] = None,
+    ) -> bool:
         """
-        Process a finished JobDescription based upon its
+        Process a finished JobDescription based upon its success or failure.

         If wall-clock time is available, informs the cluster scaler about the
         job finishing.
@@ -1185,22 +1447,67 @@ class Leader:
             # TODO: Use message bus?
             self.clusterScaler.addCompletedJob(finished_job, wall_time)
         if self.toilState.job_exists(job_store_id):
-            logger.debug(
+            logger.debug(
+                "Job %s continues to exist (i.e. has more to do)", finished_job
+            )
             try:
                 # Reload the job as modified by the worker
-
-
+                if finished_job.has_body():
+                    # The worker was expected to do some work. We expect the
+                    # worker to have updated the job description.
+
+                    # If the job succeeded, we wait around to see the update
+                    # and fail the job if we don't see it.
+                    if result_status == 0:
+                        timeout = self.config.job_store_timeout
+                        complaint = (
+                            f"has no new version available after {timeout} "
+                            "seconds. Either worker updates to "
+                            "the job store are delayed longer than your "
+                            "--jobStoreTimeout, or the worker trying to run the "
+                            "job was killed (or never started)."
+                        )
+                    else:
+                        timeout = 0
+                        complaint = (
+                            "has no new version available immediately. The "
+                            "batch system may have killed (or never started) "
+                            "the Toil worker."
+                        )
+                    change_detected = self.toilState.reset_job_expecting_change(
+                        job_store_id, timeout
+                    )
+                    replacement_job = self.toilState.get_job(job_store_id)
+
+                    if not change_detected:
+                        logger.warning("Job %s %s", replacement_job, complaint)
+                        if result_status == 0:
+                            # Make the job fail because we ran it and it finished
+                            # and we never heard back.
+                            logger.error(
+                                "Marking ostensibly successful job %s that did "
+                                "not report in to the job store before "
+                                "--jobStoreTimeout as having been partitioned "
+                                "from us.",
+                                replacement_job,
+                            )
+                            result_status = EXIT_STATUS_UNAVAILABLE_VALUE
+                            exit_reason = BatchJobExitReason.PARTITION
+                else:
+                    # If there was no body sent, the worker won't commit any
+                    # changes to the job description. So don't wait around for
+                    # any and don't complain if we don't see them.
+                    self.toilState.reset_job(job_store_id)
+                    replacement_job = self.toilState.get_job(job_store_id)
+
             except NoSuchJobException:
                 # We have a ghost job - the job has been deleted but a stale
                 # read from e.g. a non-POSIX-compliant filesystem gave us a
                 # false positive when we checked for its existence. Process the
                 # job from here as any other job removed from the job store.
-
-
-
-                logger.warning('Got a stale read for job %s; caught its '
-                               'completion in time, but other jobs may try to run twice! Fix '
-                               'the consistency of your job store storage!', finished_job)
+                logger.debug(
+                    "Job %s is actually complete upon closer inspection", finished_job
+                )
                 self.processRemovedJob(finished_job, result_status)
                 return False
             if replacement_job.logJobStoreFileID is not None:
```
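The largest addition above covers the case where the batch system says a job finished but the job store never sees the worker's update: after waiting up to `--jobStoreTimeout` for a new version, an ostensibly successful job is reclassified as failed with a `PARTITION` exit reason. A toy sketch of that wait-then-reclassify step, with a hypothetical `poll_version` callable and constants standing in for Toil's job store machinery:

```python
import time
from typing import Callable, Optional

# Illustrative stand-ins for Toil's EXIT_STATUS_UNAVAILABLE_VALUE and
# BatchJobExitReason.PARTITION.
EXIT_STATUS_UNAVAILABLE = 255
REASON_PARTITION = "PARTITION"


def reconcile_finished_job(
    reported_status: int,
    poll_version: Callable[[], int],
    last_seen_version: int,
    timeout: float,
) -> tuple[int, Optional[str]]:
    """Return a possibly corrected (status, reason) for a finished job.

    poll_version returns the job's current version in the job store. If the
    batch system reported success but no new version appears within `timeout`
    seconds, assume the worker was partitioned from us and fail the job.
    """
    # Only a "successful" job is worth waiting for; a failed one gets no grace period.
    deadline = time.monotonic() + (timeout if reported_status == 0 else 0)
    while True:
        if poll_version() != last_seen_version:
            return reported_status, None  # the worker's update arrived
        if time.monotonic() >= deadline:
            break
        time.sleep(0.05)
    if reported_status == 0:
        # Claimed success but never reported in: reclassify as a partition.
        return EXIT_STATUS_UNAVAILABLE, REASON_PARTITION
    return reported_status, None


# A job that "succeeded" but whose job store record never changes is failed.
print(reconcile_finished_job(0, lambda: 4, last_seen_version=4, timeout=0.2))
# -> (255, 'PARTITION')
```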
```diff
@@ -1208,17 +1515,31 @@ class Leader:
                     # more memory efficient than read().striplines() while leaving off the
                     # trailing \n left when using readlines()
                     # http://stackoverflow.com/a/15233739
-                    StatsAndLogging.logWithFormatting(
-
+                    StatsAndLogging.logWithFormatting(
+                        f'Log from job "{job_store_id}"',
+                        log_stream,
+                        method=logger.warning,
+                        message="The job seems to have left a log file, indicating failure: %s"
+                        % replacement_job,
+                    )
                 if self.config.writeLogs or self.config.writeLogsGzip:
                     with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
-
+                        # Send log data from the job store to each per-job log file involved.
+                        StatsAndLogging.writeLogFiles(
+                            [names.stats_name for names in replacement_job.get_chain()],
+                            log_stream,
+                            self.config,
+                            failed=True,
+                        )
             if result_status != 0:
                 # If the batch system returned a non-zero exit code then the worker
                 # is assumed not to have captured the failure of the job, so we
                 # reduce the try count here.
                 if replacement_job.logJobStoreFileID is None:
-                    logger.warning(
+                    logger.warning(
+                        "No log file is present, despite job failing: %s",
+                        replacement_job,
+                    )

                 if batch_system_id is not None:
                     # Look for any standard output/error files created by the batch system.
@@ -1227,31 +1548,60 @@ class Leader:
                     # --workDir / TOIL_WORKDIR is on a shared file system.
                     # They live directly in the Toil work directory because that is
                     # guaranteed to exist on the leader and workers.
-                    file_list = glob.glob(
+                    file_list = glob.glob(
+                        self.batchSystem.format_std_out_err_glob(batch_system_id)
+                    )
                     for log_file in file_list:
                         try:
-                            log_stream = open(log_file,
+                            log_stream = open(log_file, "rb")
                         except:
-                            logger.warning(
+                            logger.warning(
+                                "The batch system left a file %s, but it could not be opened"
+                                % log_file
+                            )
                         else:
                             with log_stream:
                                 if os.path.getsize(log_file) > 0:
-                                    StatsAndLogging.logWithFormatting(
-
-
-
-
-
-
-
+                                    StatsAndLogging.logWithFormatting(
+                                        f'Log from job "{job_store_id}"',
+                                        log_stream,
+                                        method=logger.warning,
+                                        message="The batch system left a non-empty file %s:"
+                                        % log_file,
+                                    )
+                                    if (
+                                        self.config.writeLogs
+                                        or self.config.writeLogsGzip
+                                    ):
+                                        file_root, _ = os.path.splitext(
+                                            os.path.basename(log_file)
+                                        )
+                                        job_names = [
+                                            names.stats_name
+                                            for names in replacement_job.get_chain()
+                                        ]
+                                        # Tack the batch system log file name onto each job's name
+                                        job_names = [
+                                            j + "_" + file_root for j in job_names
+                                        ]
                                         log_stream.seek(0)
-                                        StatsAndLogging.writeLogFiles(
+                                        StatsAndLogging.writeLogFiles(
+                                            job_names,
+                                            log_stream,
+                                            self.config,
+                                            failed=True,
+                                        )
                                 else:
-                                    logger.warning(
+                                    logger.warning(
+                                        "The batch system left an empty file %s"
+                                        % log_file
+                                    )

                 # Tell the job to reset itself after a failure.
                 # It needs to know the failure reason if available; some are handled specially.
-                replacement_job.setupJobAfterFailure(
+                replacement_job.setupJobAfterFailure(
+                    exit_status=result_status, exit_reason=exit_reason
+                )
             self.toilState.commit_job(job_store_id)

         elif job_store_id in self.toilState.hasFailedSuccessors:
```
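On failure the new code also globs for any per-job standard output/error files the batch system left in the Toil work directory, logs non-empty ones, and (when `--writeLogs` is requested) saves them under names that combine each job's name with the log file's basename. A small sketch of that glob-and-suffix step; the file name pattern below is an assumption for illustration, not Toil's real `format_std_out_err_glob()`:

```python
import glob
import logging
import os

logger = logging.getLogger(__name__)


def collect_batch_system_logs(
    work_dir: str, batch_system_id: int, job_names: list[str]
) -> dict[str, str]:
    """Read any std out/err files the batch system left for one job.

    Returns a mapping from a per-job, per-file name (the job name with the log
    file's basename tacked on) to the file's text. The glob pattern is a
    hypothetical example of where such files might live.
    """
    collected: dict[str, str] = {}
    pattern = os.path.join(work_dir, f"toil_job_{batch_system_id}_std_*.log")
    for log_file in glob.glob(pattern):
        try:
            with open(log_file, "rb") as log_stream:
                data = log_stream.read()
        except OSError:
            logger.warning(
                "The batch system left a file %s, but it could not be opened", log_file
            )
            continue
        if not data:
            logger.warning("The batch system left an empty file %s", log_file)
            continue
        file_root, _ = os.path.splitext(os.path.basename(log_file))
        for name in job_names:
            # Tack the log file's name onto each job's name, as the diff does.
            collected[name + "_" + file_root] = data.decode(errors="replace")
    return collected
```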
```diff
@@ -1259,18 +1609,20 @@ class Leader:
             self.toilState.hasFailedSuccessors.remove(job_store_id)

             # Now that we know the job is done we can add it to the list of updated jobs
-            self._messages.publish(
+            self._messages.publish(
+                JobUpdatedMessage(replacement_job.jobStoreID, result_status)
+            )
             logger.debug("Added job: %s to updated jobs", replacement_job)

             # Return True if it will rerun (still has retries) and false if it
             # is completely failed.
             return replacement_job.remainingTryCount > 0
-        else: #The job is done
+        else:  # The job is done
             self.processRemovedJob(finished_job, result_status)
             # Being done, it won't run again.
             return False

-    def getSuccessors(self, job_id: str, alreadySeenSuccessors:
+    def getSuccessors(self, job_id: str, alreadySeenSuccessors: set[str]) -> set[str]:
         """
         Get successors of the given job by walking the job graph recursively.

@@ -1278,6 +1630,7 @@ class Leader:
         :returns: The set of found successors. This set is added to alreadySeenSuccessors.
         """
         successors = set()
+
         def successorRecursion(job_id: str) -> None:
             # TODO: do we need to reload from the job store here, or is the cache OK?
             jobDesc = self.toilState.get_job(job_id)
```
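`getSuccessors` now carries a full type signature (`set[str]` in, `set[str]` out) and walks the job graph recursively, skipping anything already in `alreadySeenSuccessors` and adding everything it finds to that set. The same traversal over a plain adjacency mapping looks roughly like this:

```python
def get_successors(
    job_id: str, graph: dict[str, list[str]], already_seen: set[str]
) -> set[str]:
    """Collect all transitive successors of job_id that are not in already_seen.

    graph maps a job ID to its direct successors; found successors are also
    added to already_seen, mirroring the in-place behaviour described above.
    """
    found: set[str] = set()

    def successor_recursion(current: str) -> None:
        for successor in graph.get(current, []):
            if successor in already_seen:
                continue
            found.add(successor)
            already_seen.add(successor)
            successor_recursion(successor)

    successor_recursion(job_id)
    return found


graph = {"root": ["a", "b"], "a": ["c"], "b": ["c"], "c": []}
seen: set[str] = set()
print(sorted(get_successors("root", graph, seen)))  # ['a', 'b', 'c']
```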
```diff
@@ -1309,12 +1662,15 @@ class Leader:

         # Tell everyone it failed

-        self._messages.publish(
+        self._messages.publish(
+            JobFailedMessage(get_job_kind(job_desc.get_names()), job_id)
+        )

         if job_id in self.toilState.service_to_client:
             # Is a service job
-            logger.debug(
-
+            logger.debug(
+                "Service job is being processed as a totally failed job: %s", job_desc
+            )

             if not isinstance(job_desc, ServiceJobDescription):
                 raise RuntimeError("The service job description type is incorrect.")
@@ -1338,8 +1694,13 @@ class Leader:
             # properly, and to remember that this service failed with an error
             # and possibly never started.
             if client_id in self.toilState.servicesIssued:
-                self.serviceManager.kill_services(
-
+                self.serviceManager.kill_services(
+                    self.toilState.servicesIssued[client_id], error=True
+                )
+                logger.warning(
+                    "Job: %s is instructing all other services of its parent job to quit",
+                    job_desc,
+                )

             # This ensures that the job will not attempt to run any of it's
             # successors on the stack
```
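When a service host job fails for good, the change above makes the leader kill every other service issued for the same client job (now passing `error=True`) and log that it is doing so. Reduced to plain dictionaries, with `kill_services` standing in for Toil's service manager call, the bookkeeping looks like this:

```python
import logging
from typing import Callable

logger = logging.getLogger(__name__)


def fail_service(
    failed_service_id: str,
    service_to_client: dict[str, str],
    services_issued: dict[str, set[str]],
    kill_services: Callable[[set[str]], None],
) -> None:
    """Tell the sibling services of a totally failed service job to quit.

    service_to_client maps each service job to the client job that asked for
    it; services_issued maps a client job to all services issued for it.
    Both mappings and kill_services are illustrative stand-ins.
    """
    client_id = service_to_client[failed_service_id]
    if client_id in services_issued:
        kill_services(services_issued[client_id])
        logger.warning(
            "Job: %s is instructing all other services of its parent job to quit",
            failed_service_id,
        )


fail_service(
    "svc-1",
    {"svc-1": "client-A", "svc-2": "client-A"},
    {"client-A": {"svc-1", "svc-2"}},
    kill_services=lambda ids: print("killing", sorted(ids)),
)
```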
```diff
@@ -1363,9 +1724,14 @@ class Leader:
         # Any successor already in toilState.failedSuccessors will not be traversed
         # All successors traversed will be added to toilState.failedSuccessors and returned
         # as a set (unseenSuccessors).
-        unseenSuccessors = self.getSuccessors(
-
-
+        unseenSuccessors = self.getSuccessors(
+            job_id, self.toilState.failedSuccessors
+        )
+        logger.debug(
+            "Found new failed successors: %s of job: %s",
+            " ".join(unseenSuccessors),
+            job_desc,
+        )

         # For each newly found successor
         for successorJobStoreID in unseenSuccessors:
@@ -1376,7 +1742,9 @@ class Leader:
                 # For each such predecessor job
                 # (we remove the successor from toilState.successor_to_predecessors to avoid doing
                 # this multiple times for each failed predecessor)
-                for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                    successorJobStoreID
+                ):

                     predecessor = self.toilState.get_job(predecessor_id)

@@ -1385,8 +1753,11 @@ class Leader:

                     # Indicate that it has failed jobs.
                     self.toilState.hasFailedSuccessors.add(predecessor_id)
-                    logger.debug(
-
+                    logger.debug(
+                        "Marking job: %s as having failed successors (found by "
+                        "reading successors failed job)",
+                        predecessor,
+                    )

                     # If the predecessor has no remaining successors, add to list of updated jobs
                     if self.toilState.count_pending_successors(predecessor_id) == 0:
@@ -1400,8 +1771,12 @@ class Leader:

                 # Mark the predecessor as failed
                 self.toilState.hasFailedSuccessors.add(predecessor_id)
-                logger.debug(
-
+                logger.debug(
+                    "Totally failed job: %s is marking direct predecessor: %s "
+                    "as having failed jobs",
+                    job_desc,
+                    self.toilState.get_job(predecessor_id),
+                )

         self._updatePredecessorStatus(job_id)

@@ -1411,38 +1786,59 @@ class Leader:
             # Is a service host job, so its predecessor is its client
             client_id = self.toilState.service_to_client.pop(jobStoreID)
             self.toilState.servicesIssued[client_id].remove(jobStoreID)
-            if
+            if (
+                len(self.toilState.servicesIssued[client_id]) == 0
+            ):  # Predecessor job has
                 # all its services terminated
-                self.toilState.servicesIssued.pop(
+                self.toilState.servicesIssued.pop(
+                    client_id
+                )  # The job has no running services

-                logger.debug(
+                logger.debug(
+                    "Job %s is no longer waiting on services; all services have stopped",
+                    self.toilState.get_job(client_id),
+                )

                 # Now we know the job is done we can add it to the list of
                 # updated job files
                 self._messages.publish(JobUpdatedMessage(client_id, 0))
             else:
-                logger.debug(
-
-
+                logger.debug(
+                    "Job %s is still waiting on %d services",
+                    self.toilState.get_job(client_id),
+                    len(self.toilState.servicesIssued[client_id]),
+                )
         elif jobStoreID not in self.toilState.successor_to_predecessors:
-            #We have reach the root job
+            # We have reach the root job
             if self._messages.count(JobUpdatedMessage) != 0:
                 raise RuntimeError("Root job is done but other jobs are still updated")
             if len(self.toilState.successor_to_predecessors) != 0:
-                raise RuntimeError(
-
+                raise RuntimeError(
+                    "Job {} is finished and had no predecessor, but we have other outstanding jobs "
+                    "with predecessors: {}".format(
+                        jobStoreID, self.toilState.successor_to_predecessors.keys()
+                    )
+                )
             if len(self.toilState.successorCounts) != 0:
-                raise RuntimeError(
-
+                raise RuntimeError(
+                    "Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
+                )
+            logger.debug(
+                "Reached root job %s so no predecessors to clean up" % jobStoreID
+            )

         else:
             # Is a non-root, non-service job
             logger.debug("Cleaning the predecessors of %s" % jobStoreID)

             # For each predecessor
-            for predecessor_id in self.toilState.successor_to_predecessors.pop(
+            for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                jobStoreID
+            ):
                 if not isinstance(predecessor_id, str):
-                    raise RuntimeError(
+                    raise RuntimeError(
+                        "Predecessor ID should be str but is {type(predecessor_id)}"
+                    )
                 predecessor = self.toilState.get_job(predecessor_id)

                 # Tell the predecessor that this job is done (keep only other successor jobs)
```