toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/leader.py
CHANGED
@@ -21,32 +21,36 @@ import os
 import pickle
 import sys
 import time
-from typing import Any,
+from typing import Any, Optional, Union

 import enlighten

 from toil import resolveEntryPoint
 from toil.batchSystems import DeadlockException
-from toil.batchSystems.abstractBatchSystem import (
+from toil.batchSystems.abstractBatchSystem import (
+    EXIT_STATUS_UNAVAILABLE_VALUE,
+    AbstractBatchSystem,
+    BatchJobExitReason,
+)
+from toil.bus import (
+    JobCompletedMessage,
+    JobFailedMessage,
+    JobIssuedMessage,
+    JobMissingMessage,
+    JobUpdatedMessage,
+    QueueSizeMessage,
+    get_job_kind,
+)
 from toil.common import Config, ToilMetrics
 from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
 from toil.exceptions import FailedJobsException
-from toil.job import (
+from toil.job import (
+    CheckpointJobDescription,
+    JobDescription,
+    ServiceJobDescription,
+    TemporaryID,
+)
+from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException
 from toil.lib.throttle import LocalThrottle
 from toil.provisioners.abstractProvisioner import AbstractProvisioner
 from toil.provisioners.clusterScaler import ScalerThread
@@ -80,13 +84,15 @@ class Leader:
     consulting the job store, and issuing them in the batch system.
     """

-    def __init__(
+    def __init__(
+        self,
+        config: Config,
+        batchSystem: AbstractBatchSystem,
+        provisioner: Optional[AbstractProvisioner],
+        jobStore: AbstractJobStore,
+        rootJob: JobDescription,
+        jobCache: Optional[dict[Union[str, TemporaryID], JobDescription]] = None,
+    ) -> None:
         """
         Create a Toil Leader object.

@@ -116,19 +122,11 @@ class Leader:
         # state change information about jobs.
         self.toilState = ToilState(self.jobStore)

-        if self.config.write_messages is None:
-            # The user hasn't specified a place for the message bus so we
-            # should make one.
-            # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
-            # from cwltool and we change the coordination_dir when detected. we don't want
-            # to make another config attribute so put the message bus in the already prefixed dir
-            # if a coordination_dir is provided normally, we can still put the bus in there
-            # as the coordination dir should serve a similar purpose to the tmp directory
-            self.config.write_messages = gen_message_bus_path(config.coordination_dir)
-
         # Message bus messages need to go to the given file.
         # Keep a reference to the return value so the listener stays alive.
-        self._message_subscription = self.toilState.bus.connect_output_file(
+        self._message_subscription = self.toilState.bus.connect_output_file(
+            self.config.write_messages
+        )

         # Connect to the message bus, so we will get all the messages of these
         # types in an inbox.
@@ -143,17 +141,22 @@ class Leader:
         # this, somehow, so they can also see messages from this?
         self.toilState.load_workflow(rootJob, jobCache=jobCache)

-        logger.debug(
+        logger.debug(
+            "Found %s jobs to start and %i jobs with successors to run",
+            self._messages.count(JobUpdatedMessage),
+            len(self.toilState.successorCounts),
+        )

         # Batch system
         self.batchSystem = batchSystem
         if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
-            raise RuntimeError(
+            raise RuntimeError(
+                "The initialized batchsystem did not start with 0 active jobs."
+            )
         logger.debug("Checked batch system has no running jobs and no updated jobs")

         # Map of batch system IDs to job store IDs
-        self.issued_jobs_by_batch_system_id:
+        self.issued_jobs_by_batch_system_id: dict[int, str] = {}

         # Number of preemptible jobs currently being run by batch system
         self.preemptibleJobsIssued = 0
@@ -161,10 +164,12 @@ class Leader:
         # Tracking the number service jobs issued,
         # this is used limit the number of services issued to the batch system
         self.serviceJobsIssued = 0
-        self.serviceJobsToBeIssued:
+        self.serviceJobsToBeIssued: list[str] = (
+            []
+        )  # A queue of IDs of service jobs that await scheduling
         # Equivalents for service jobs to be run on preemptible nodes
         self.preemptibleServiceJobsIssued = 0
-        self.preemptibleServiceJobsToBeIssued:
+        self.preemptibleServiceJobsToBeIssued: list[str] = []

         # Timing of the rescuing method
         self.timeSinceJobsLastRescued = None
@@ -172,7 +177,7 @@ class Leader:
         # For each issued job's batch system ID, how many times did we not see
         # it when we should have? If this hits a threshold, the job is declared
         # missing and killed and possibly retried.
-        self.reissueMissingJobs_missingHash:
+        self.reissueMissingJobs_missingHash: dict[int, int] = {}

         # Class used to create/destroy nodes in the cluster, may be None if
         # using a statically defined cluster
@@ -190,7 +195,7 @@ class Leader:
         self.statsAndLogging = StatsAndLogging(self.jobStore, self.config)

         # Set used to monitor deadlocked jobs
-        self.potentialDeadlockedJobs:
+        self.potentialDeadlockedJobs: set[str] = set()
         self.potentialDeadlockTime = 0

         # A dashboard that runs on the leader node in AWS clusters to track the state
@@ -198,8 +203,13 @@ class Leader:
         self.toilMetrics: Optional[ToilMetrics] = None

         # internal jobs we should not expose at top level debugging
-        self.debugJobNames = (
+        self.debugJobNames = (
+            "CWLJob",
+            "CWLWorkflow",
+            "CWLScatter",
+            "CWLGather",
+            "ResolveIndirect",
+        )

         self.deadlockThrottler = LocalThrottle(self.config.deadlockCheckInterval)

@@ -217,8 +227,10 @@ class Leader:
         self.GOOD_COLOR = (0, 60, 108)
         self.BAD_COLOR = (253, 199, 0)
         # And set a format that shows failures
-        self.PROGRESS_BAR_FORMAT = (
+        self.PROGRESS_BAR_FORMAT = (
+            "{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} "
+            "({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]"
+        )
         # TODO: No way to set background color on the terminal for the bar.

         # What exit code should the process use if the workflow failed?
@@ -236,16 +248,25 @@ class Leader:
         """
         self.jobStore.write_kill_flag(kill=False)

-        with enlighten.get_manager(
+        with enlighten.get_manager(
+            stream=sys.stderr, enabled=not self.config.disableProgress
+        ) as manager:
             # Set up the fancy console UI if desirable
-            self.progress_overall = manager.counter(
+            self.progress_overall = manager.counter(
+                total=0,
+                desc="Workflow Progress",
+                unit="jobs",
+                color=self.GOOD_COLOR,
+                bar_format=self.PROGRESS_BAR_FORMAT,
+            )
             self.progress_failed = self.progress_overall.add_subcounter(self.BAD_COLOR)

             # Start the stats/logging aggregation thread
             self.statsAndLogging.start()
             if self.config.metrics:
-                self.toilMetrics = ToilMetrics(
+                self.toilMetrics = ToilMetrics(
+                    self.toilState.bus, provisioner=self.provisioner
+                )

             try:

@@ -262,10 +283,13 @@ class Leader:
                     self.innerLoop()
                 finally:
                     if self.clusterScaler is not None:
-                        logger.debug(
+                        logger.debug("Waiting for workers to shutdown.")
                         startTime = time.time()
                         self.clusterScaler.shutdown()
-                        logger.debug(
+                        logger.debug(
+                            "Worker shutdown complete in %s seconds.",
+                            time.time() - startTime,
+                        )

             finally:
                 # Ensure service manager thread is properly shutdown
@@ -278,16 +302,25 @@ class Leader:
                     self.toilMetrics.shutdown()

             # Filter the failed jobs
-            self.toilState.totalFailedJobs = [
+            self.toilState.totalFailedJobs = [
+                j
+                for j in self.toilState.totalFailedJobs
+                if self.toilState.job_exists(j)
+            ]

             try:
                 self.create_status_sentinel_file(self.toilState.totalFailedJobs)
             except OSError as e:
-                logger.debug(f
+                logger.debug(f"Error from importFile with hardlink=True: {e}")

-            logger.info(
+            logger.info(
+                "Finished toil run %s"
+                % (
+                    "successfully."
+                    if not self.toilState.totalFailedJobs
+                    else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))
+                )
+            )

             if len(self.toilState.totalFailedJobs):
                 failed_jobs = []
@@ -300,19 +333,28 @@ class Leader:
                         # Job actually finished and was removed
                         pass

-                logger.info(
+                logger.info(
+                    "Failed jobs at end of the run: %s",
+                    " ".join(str(j) for j in failed_jobs),
+                )
+                raise FailedJobsException(
+                    self.jobStore,
+                    failed_jobs,
+                    exit_code=self.recommended_fail_exit_code,
+                )

         return self.jobStore.get_root_job_return_value()

     def create_status_sentinel_file(self, fail: bool) -> None:
         """Create a file in the jobstore indicating failure or success."""
-        logName =
+        logName = "failed.log" if fail else "succeeded.log"
         localLog = os.path.join(os.getcwd(), logName)
-        open(localLog,
-        self.jobStore.import_file(
+        open(localLog, "w").close()
+        self.jobStore.import_file("file://" + localLog, logName, hardlink=True)

-        if os.path.exists(
+        if os.path.exists(
+            localLog
+        ):  # Bandaid for Jenkins tests failing stochastically and unexplainably.
             os.remove(localLog)

     def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> bool:
@@ -324,8 +366,11 @@ class Leader:
         :returns: True if there are still active successors.
                   False if all successors have failed and the job is queued to run to handle the failed successors.
         """
-        logger.debug(
+        logger.debug(
+            "Successor job: %s of job: %s has failed " "" "predecessors",
+            self.toilState.get_job(successor_id),
+            self.toilState.get_job(predecessor_id),
+        )

         # Add the job to the set having failed successors
         self.toilState.hasFailedSuccessors.add(predecessor_id)
@@ -339,9 +384,12 @@ class Leader:
         # If the job now has no active successors, add to active jobs
         # so it can be processed as a job with failed successors.
         if self.toilState.count_pending_successors(predecessor_id) == 0:
-            logger.debug(
+            logger.debug(
+                "Job: %s has no successors to run "
+                "and some are failed, adding to list of jobs "
+                "with failed successors",
+                self.toilState.get_job(predecessor_id),
+            )
             self._messages.publish(JobUpdatedMessage(predecessor_id, 0))
             # Report no successors are running
             return False
@@ -349,7 +397,9 @@ class Leader:
         # Some successors are still active
         return True

-    def _checkSuccessorReadyToRunMultiplePredecessors(
+    def _checkSuccessorReadyToRunMultiplePredecessors(
+        self, successor_id: str, predecessor_id: str
+    ) -> bool:
         """
         Check if a successor job is ready to run when there are multiple predecessors.

@@ -370,8 +420,11 @@ class Leader:
         # Grab the predecessor for reporting
         predecessor = self.toilState.get_job(predecessor_id)

-        logger.debug(
+        logger.debug(
+            "Successor job: %s of job: %s has multiple " "predecessors",
+            successor,
+            predecessor,
+        )

         # Add the predecessor as a finished predecessor to the successor
         successor.predecessorsFinished.add(predecessor_id)
@@ -390,13 +443,17 @@ class Leader:
         if len(successor.predecessorsFinished) == successor.predecessorNumber:
             # All the successor's predecessors are done now.
             # Remove the successor job from the set of waiting multi-predecessor jobs.
-            self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
+            self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
+                successor_id
+            )
             return True
         else:
             # The job is not ready to run
             return False

-    def _makeJobSuccessorReadyToRun(
+    def _makeJobSuccessorReadyToRun(
+        self, successor_id: str, predecessor_id: str
+    ) -> bool:
         """
         Make a successor job ready to run if possible.

@@ -404,7 +461,7 @@ class Leader:
         :param predecessor_id: The job which the successor comes after.
         :returns: False if the successor job should not yet be run or True otherwise.
         """
-        #Build map from successor to predecessors.
+        # Build map from successor to predecessors.
         if successor_id not in self.toilState.successor_to_predecessors:
             self.toilState.successor_to_predecessors[successor_id] = set()
         if not isinstance(successor_id, str):
@@ -415,9 +472,15 @@ class Leader:

         # Grab the successor
         successor = self.toilState.get_job(successor_id)
-        logger.debug(
+        logger.debug(
+            "Added job %s as coming after job %s",
+            successor,
+            self.toilState.get_job(predecessor_id),
+        )
         if successor.predecessorNumber > 1:
-            return self._checkSuccessorReadyToRunMultiplePredecessors(
+            return self._checkSuccessorReadyToRunMultiplePredecessors(
+                successor_id, predecessor_id
+            )
         else:
             return True

@@ -436,13 +499,20 @@ class Leader:
         next_successors = predecessor.nextSuccessors()

         if next_successors is None or len(next_successors) == 0:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Job {self} trying to run successors, but it doesn't have any"
+            )
+        logger.debug(
+            "Job: %s has %i successors to schedule",
+            predecessor_id,
+            len(next_successors),
+        )
+        # Record the number of successors that must be completed before
+        # the job can be considered again
         if self.toilState.count_pending_successors(predecessor_id) != 0:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Attempted to schedule successors of the same job twice!"
+            )
         self.toilState.successors_pending(predecessor_id, len(next_successors))

         # For each successor schedule if all predecessors have been completed
@@ -453,7 +523,11 @@ class Leader:
             except NoSuchJobException:
                 # Job already done and gone, but probably shouldn't be. Or maybe isn't visible yet.
                 # TODO: Shouldn't this be an error?
-                logger.warning(
+                logger.warning(
+                    "Job %s is a successor of %s but is already done and gone.",
+                    successor_id,
+                    predecessor_id,
+                )
                 # Don't try and run it
                 continue
             if self._makeJobSuccessorReadyToRun(successor_id, predecessor_id):
@@ -475,46 +549,62 @@ class Leader:
             # The job has services running; signal for them to be killed.
             # Once they are killed, then the job will be updated again and then
             # scheduled to be removed.
-            logger.warning(
+            logger.warning(
+                "Telling job %s to terminate its services due to successor failure",
+                predecessor,
+            )
+            self.serviceManager.kill_services(
+                self.toilState.servicesIssued[predecessor_id], error=True
+            )
         elif self.toilState.count_pending_successors(predecessor_id) > 0:
             # The job has non-service jobs running; wait for them to finish.
             # the job will be re-added to the updated jobs when these jobs
             # are done
-            logger.debug(
+            logger.debug(
+                "Job %s with ID: %s with failed successors still has successor jobs running",
+                predecessor,
+                predecessor_id,
+            )
+        elif (
+            isinstance(predecessor, CheckpointJobDescription)
+            and predecessor.checkpoint is not None
+            and predecessor.remainingTryCount > 1
+        ):
             # If the job is a checkpoint and has remaining retries...
             # The logic behind using > 1 rather than > 0 here: Since this job has
             # been tried once (without decreasing its try count as the job
             # itself was successful), and its subtree failed, it shouldn't be retried
             # unless it has more than 1 try.
             if predecessor_id in self.toilState.jobs_issued:
-                logger.debug(
+                logger.debug(
+                    "Checkpoint job %s was updated while issued", predecessor_id
+                )
             else:
                 # It hasn't already been reissued.
                 # This check lets us be robust against repeated job update
                 # messages (such as from services starting *and* failing), by
                 # making sure that we don't stay in a state that where we
                 # reissue the job every time we get one.
-                logger.warning(
+                logger.warning(
+                    "Job: %s is being restarted as a checkpoint after the total "
+                    "failure of jobs in its subtree.",
+                    predecessor_id,
+                )
                 self.issueJob(predecessor)
         else:
             # Mark it totally failed
-            logger.debug(
+            logger.debug(
+                "Job %s is being processed as completely failed", predecessor_id
+            )
             self.processTotallyFailedJob(predecessor_id)

     def _processReadyJob(self, job_id: str, result_status: int):
         # We operate on the JobDescription mostly.
         readyJob = self.toilState.get_job(job_id)

-        logger.debug(
+        logger.debug(
+            "Updating status of job %s with result status: %s", readyJob, result_status
+        )

         # TODO: Filter out nonexistent successors/services now, so we can tell
         # if they are all done and the job needs deleting?
@@ -527,8 +617,11 @@ class Leader:
             # want to act on it; we want to wait until it gets the update it
             # gets when the service manager is done trying to start its
             # services.
-            logger.debug(
+            logger.debug(
+                "Got a job to update which is still owned by the service "
+                "manager: %s",
+                readyJob.jobStoreID,
+            )
         elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
             self._processFailedSuccessors(job_id)
         elif readyJob.has_body() or result_status != 0:
@@ -542,8 +635,9 @@ class Leader:

             # If the job has run out of tries or is a service job whose error flag has
             # been indicated, fail the job.
-            if
+            if readyJob.remainingTryCount == 0 or (
+                isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID)
+            ):
                 self.processTotallyFailedJob(job_id)
                 logger.warning("Job %s is completely failed", readyJob)
             else:
@@ -554,29 +648,39 @@ class Leader:
             # Build a map from the service jobs to the job and a map
             # of the services created for the job
             if readyJob.jobStoreID in self.toilState.servicesIssued:
-                raise RuntimeError(
+                raise RuntimeError(
+                    f"The ready job: {readyJob.jobStoreID} was already issued."
+                )
             self.toilState.servicesIssued[readyJob.jobStoreID] = set()
             for serviceJobList in readyJob.serviceHostIDsInBatches():
                 for serviceID in serviceJobList:
                     if serviceID in self.toilState.service_to_client:
-                        raise RuntimeError(
+                        raise RuntimeError(
+                            f"The ready service ID: {serviceID} was already added."
+                        )
                     # TODO: Why do we refresh here?
                     self.toilState.reset_job(serviceID)
                     serviceHost = self.toilState.get_job(serviceID)
                     self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
                     self.toilState.servicesIssued[readyJob.jobStoreID].add(serviceID)

-            logger.debug(
+            logger.debug(
+                "Giving job: %s to service manager to schedule its jobs", readyJob
+            )
             # Use the service manager to start the services
             self.serviceManager.put_client(job_id)
         elif readyJob.nextSuccessors() is not None:
             # There are successors to run
             self._runJobSuccessors(job_id)
         elif readyJob.jobStoreID in self.toilState.servicesIssued:
-            logger.debug(
+            logger.debug(
+                "Telling job: %s to terminate its services due to the "
+                "successful completion of its successor jobs",
+                readyJob,
+            )
+            self.serviceManager.kill_services(
+                self.toilState.servicesIssued[readyJob.jobStoreID], error=False
+            )
         else:
             # There are no remaining tasks to schedule within the job.
             #
@@ -605,7 +709,10 @@ class Leader:
             try:
                 self.toilState.delete_job(readyJob.jobStoreID)
             except Exception as e:
-                logger.exception(
+                logger.exception(
+                    "Re-processing success for job we could not remove: %s",
+                    readyJob,
+                )
                 # Kick it back to being handled as succeeded again. We
                 # don't want to have a failure here cause a Toil-level
                 # retry which causes more actual jobs to try to run.
@@ -617,12 +724,18 @@ class Leader:
                 self.processRemovedJob(readyJob, 0)
             else:
                 self.processTotallyFailedJob(job_id)
-                logger.error(
+                logger.error(
+                    "Job: %s is empty but completely failed - something is very wrong",
+                    readyJob.jobStoreID,
+                )

     def _processReadyJobs(self):
         """Process jobs that are ready to be scheduled/have successors to schedule."""
-        logger.debug(
+        logger.debug(
+            "Built the jobs list, currently have %i jobs to update and %i jobs issued",
+            self._messages.count(JobUpdatedMessage),
+            self.getNumberOfJobsIssued(),
+        )

         # Now go through and, for each job that has updated this tick, process it.

@@ -637,9 +750,13 @@ class Leader:
             if message.job_id in handled_with_status:
                 if handled_with_status[message.job_id] == message.result_status:
                     # This is a harmless duplicate
-                    logger.debug(
+                    logger.debug(
+                        "Job %s already updated this tick with status %s and "
+                        "we've received duplicate message %s",
+                        message.job_id,
+                        handled_with_status[message.job_id],
+                        message,
+                    )
                 else:
                     # This is a conflicting update. We may have already treated
                     # a job as succeeding but now we've heard it's failed, or
@@ -647,9 +764,13 @@ class Leader:
                     # This probably shouldn't happen, but does because the
                     # scheduler is not correct somehow and hasn't been for a
                     # long time. Complain about it.
-                    logger.warning(
+                    logger.warning(
+                        "Job %s already updated this tick with status %s "
+                        "but we've now received %s",
+                        message.job_id,
+                        handled_with_status[message.job_id],
+                        message,
+                    )
                 # Either way, we only want to handle one update per tick, like
                 # the old dict-based implementation.
                 continue
@@ -667,16 +788,21 @@ class Leader:
             if service_id is None:
                 break

-            logger.debug(
+            logger.debug(
+                "Launching service job: %s", self.toilState.get_job(service_id)
+            )
             self.issueServiceJob(service_id)

     def _processJobsWithRunningServices(self):
         """Get jobs whose services have started."""
         while True:
             client_id = self.serviceManager.get_ready_client(0)
-            if client_id is None:
+            if client_id is None:  # Stop trying to get jobs when function returns None
                 break
-            logger.debug(
+            logger.debug(
+                "Job: %s has established its services; all services are running",
+                client_id,
+            )

             # Grab the client job description
             client = self.toilState.get_job(client_id)
@@ -689,9 +815,9 @@ class Leader:
         """Get jobs whose services have failed to start."""
         while True:
             client_id = self.serviceManager.get_unservable_client(0)
-            if client_id is None:
+            if client_id is None:  # Stop trying to get jobs when function returns None
                 break
-            logger.debug(
+            logger.debug("Job: %s has failed to establish its services.", client_id)

             # Grab the client job description
             client = self.toilState.get_job(client_id)
@@ -706,30 +832,56 @@ class Leader:
     def _gatherUpdatedJobs(self, updatedJobTuple):
         """Gather any new, updated JobDescriptions from the batch system."""
         bsID, exitStatus, exitReason, wallTime = (
-            updatedJobTuple.jobID,
-            updatedJobTuple.
+            updatedJobTuple.jobID,
+            updatedJobTuple.exitStatus,
+            updatedJobTuple.exitReason,
+            updatedJobTuple.wallTime,
+        )
         # easy, track different state
         try:
-            updatedJob = self.toilState.get_job(
+            updatedJob = self.toilState.get_job(
+                self.issued_jobs_by_batch_system_id[bsID]
+            )
         except KeyError:
-            logger.warning(
+            logger.warning(
+                "A result seems to already have been processed for job %s", bsID
+            )
         else:
             if exitStatus == 0:
-                logger.debug(
+                logger.debug("Job ended: %s", updatedJob)
             else:
-                status_string =
+                status_string = (
+                    str(exitStatus)
+                    if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE
+                    else "<UNAVAILABLE>"
+                )
+                logger.warning(
+                    f"Job failed with exit value {status_string}: {updatedJob}\n"
+                    f"Exit reason: {BatchJobExitReason.to_string(exitReason)}"
+                )
+                # This logic is undefined for which of the failing jobs will send its exit code
+                # when there are multiple failing jobs with different exit statuses
+                self.recommended_fail_exit_code = exitStatus
            if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
                 # This is a CWL job informing us that the workflow is
                 # asking things of us that Toil can't do. When we raise an
                 # exception because of this, make sure to forward along
                 # this exit code.
                 logger.warning("This indicates an unsupported CWL requirement!")
-                self.recommended_fail_exit_code =
+                self.recommended_fail_exit_code = (
+                    CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+                )
             # Tell everyone it stopped running.
-            self._messages.publish(
+            self._messages.publish(
+                JobCompletedMessage(
+                    get_job_kind(updatedJob.get_names()),
+                    updatedJob.jobStoreID,
+                    exitStatus,
+                )
+            )
+            self.process_finished_job(
+                bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason
+            )

     def _processLostJobs(self):
         """Process jobs that have gone awry."""
@@ -737,7 +889,9 @@ class Leader:
         # gather for rescueJobsFrequency seconds) check if there are any jobs
         # that have run too long (see self.reissueOverLongJobs) or which have
         # gone missing from the batch system (see self.reissueMissingJobs)
-        if (
+        if (
+            time.time() - self.timeSinceJobsLastRescued
+        ) >= self.config.rescueJobsFrequency:
             # We only rescue jobs every N seconds, and when we have apparently
             # exhausted the current job supply
             self.reissueOverLongJobs()
@@ -757,9 +911,11 @@ class Leader:
         """
         self.timeSinceJobsLastRescued = time.time()

-        while
+        while (
+            self._messages.count(JobUpdatedMessage) > 0
+            or self.getNumberOfJobsIssued()
+            or self.serviceManager.get_job_count()
+        ):

             if self._messages.count(JobUpdatedMessage) > 0:
                 self._processReadyJobs()
@@ -811,13 +967,21 @@ class Leader:
         if not self._messages.empty():
             raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
         if self.toilState.successorCounts != {}:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
+            )
         if self.toilState.successor_to_predecessors != {}:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
+            )
         if self.toilState.service_to_client != {}:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
+            )
         if self.toilState.servicesIssued != {}:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Services running at shutdown: {self.toilState.servicesIssued}"
+            )

     def checkForDeadlocks(self):
         """Check if the system is deadlocked running service jobs."""
@@ -827,18 +991,22 @@ class Leader:
         # If there are no updated jobs and at least some jobs running
         if totalServicesIssued >= totalRunningJobs and totalRunningJobs > 0:
             # Collect all running service job store IDs into a set to compare with the deadlock set
-            running_service_ids:
+            running_service_ids: set[str] = set()
             for js_id in self.issued_jobs_by_batch_system_id.values():
                 job = self.toilState.get_job(js_id)
-                if isinstance(
+                if isinstance(
+                    job, ServiceJobDescription
+                ) and self.serviceManager.is_running(js_id):
                     running_service_ids.add(js_id)

             if len(running_service_ids) > totalRunningJobs:
                 # This is too many services.
                 # TODO: couldn't more jobs have started since we polled the
                 # running job count?
-                raise RuntimeError(
+                raise RuntimeError(
+                    f"Supposedly running {len(running_service_ids)} services, which is"
+                    f"more than the {totalRunningJobs} currently running jobs overall."
+                )

             # If all the running jobs are active services then we have a potential deadlock
             if len(running_service_ids) == totalRunningJobs:
@@ -852,27 +1020,49 @@ class Leader:
                     # Use a generic message if none is available
                     message = "Cluster may be too small."

                 # See if this is a new potential deadlock
                 if self.potentialDeadlockedJobs != running_service_ids:
-                    logger.warning(
+                    logger.warning(
+                        (
+                            "Potential deadlock detected! All %s running jobs are service jobs, "
+                            "with no normal jobs to use them! %s"
+                        ),
+                        totalRunningJobs,
+                        message,
+                    )
                     self.potentialDeadlockedJobs = running_service_ids
                     self.potentialDeadlockTime = time.time()
                 else:
                     # We wait self.config.deadlockWait seconds before declaring the system deadlocked
                     stuckFor = time.time() - self.potentialDeadlockTime
                     if stuckFor >= self.config.deadlockWait:
-                        logger.error(
+                        logger.error(
+                            "We have been deadlocked since %s on these service jobs: %s",
+                            self.potentialDeadlockTime,
+                            self.potentialDeadlockedJobs,
+                        )
+                        raise DeadlockException(
+                            (
+                                "The workflow is service deadlocked - all %d running jobs "
+                                "have been the same active services for at least %s seconds"
+                            )
+                            % (totalRunningJobs, self.config.deadlockWait)
+                        )
                     else:
                         # Complain that we are still stuck.
-                        waitingNormalJobs =
+                        waitingNormalJobs = (
+                            self.getNumberOfJobsIssued() - totalServicesIssued
+                        )
+                        logger.warning(
+                            (
+                                "Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
+                                "for any of %d issued non-service jobs to schedule and start. %s"
+                            ),
+                            stuckFor,
+                            self.config.deadlockWait - stuckFor,
+                            waitingNormalJobs,
+                            message,
+                        )
         else:
             # We have observed non-service jobs running, so reset the potential deadlock
             self.feed_deadlock_watchdog()
@@ -893,29 +1083,38 @@ class Leader:
|
|
|
893
1083
|
"""Add a job to the queue of jobs currently trying to run."""
|
|
894
1084
|
# Never issue the same job multiple times simultaneously
|
|
895
1085
|
if jobNode.jobStoreID in self.toilState.jobs_issued:
|
|
896
|
-
raise RuntimeError(
|
|
1086
|
+
raise RuntimeError(
|
|
1087
|
+
f"Attempted to issue {jobNode} multiple times simultaneously!"
|
|
1088
|
+
)
|
|
897
1089
|
|
|
898
|
-
workerCommand = [
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
1090
|
+
workerCommand = [
|
|
1091
|
+
resolveEntryPoint("_toil_worker"),
|
|
1092
|
+
jobNode.jobName,
|
|
1093
|
+
self.jobStoreLocator,
|
|
1094
|
+
jobNode.jobStoreID,
|
|
1095
|
+
]
|
|
902
1096
|
|
|
903
1097
|
for context in self.batchSystem.getWorkerContexts():
|
|
904
1098
|
# For each context manager hook the batch system wants to run in
|
|
905
1099
|
# the worker, serialize and send it.
|
|
906
|
-
workerCommand.append(
|
|
907
|
-
workerCommand.append(
|
|
1100
|
+
workerCommand.append("--context")
|
|
1101
|
+
workerCommand.append(
|
|
1102
|
+
base64.b64encode(pickle.dumps(context)).decode("utf-8")
|
|
1103
|
+
)
|
|
908
1104
|
|
|
909
|
-
omp_threads = os.environ.get(
|
|
910
|
-
|
|
1105
|
+
omp_threads = os.environ.get("OMP_NUM_THREADS") or str(
|
|
1106
|
+
max(1, int(jobNode.cores))
|
|
1107
|
+
) # make sure OMP_NUM_THREADS is a positive integer
|
|
911
1108
|
|
|
912
1109
|
job_environment = {
|
|
913
1110
|
# Set the number of cores used by OpenMP applications
|
|
914
|
-
|
|
1111
|
+
"OMP_NUM_THREADS": omp_threads,
|
|
915
1112
|
}
|
|
916
1113
|
|
|
917
1114
|
# jobBatchSystemID is an int for each job
|
|
918
|
-
jobBatchSystemID = self.batchSystem.issueBatchJob(
|
|
1115
|
+
jobBatchSystemID = self.batchSystem.issueBatchJob(
|
|
1116
|
+
" ".join(workerCommand), jobNode, job_environment=job_environment
|
|
1117
|
+
)
|
|
919
1118
|
# Record the job by the ID the batch system will use to talk about it with us
|
|
920
1119
|
self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
|
|
921
1120
|
# Record that this job is issued right now and shouldn't e.g. be issued again.
|
|
@@ -925,11 +1124,18 @@ class Leader:
|
|
|
925
1124
|
# so increment this value after the job is added to the issuedJob dict
|
|
926
1125
|
self.preemptibleJobsIssued += 1
|
|
927
1126
|
cur_logger = logger.debug if jobNode.local else logger.info
|
|
928
|
-
cur_logger(
|
|
929
|
-
|
|
930
|
-
|
|
1127
|
+
cur_logger(
|
|
1128
|
+
"Issued job %s with job batch system ID: " "%s and %s",
|
|
1129
|
+
jobNode,
|
|
1130
|
+
str(jobBatchSystemID),
|
|
1131
|
+
jobNode.requirements_string(),
|
|
1132
|
+
)
|
|
931
1133
|
# Tell everyone it is issued and the queue size changed
|
|
932
|
-
self._messages.publish(
|
|
1134
|
+
self._messages.publish(
|
|
1135
|
+
JobIssuedMessage(
|
|
1136
|
+
get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID
|
|
1137
|
+
)
|
|
1138
|
+
)
|
|
933
1139
|
self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
|
|
934
1140
|
# Tell the user there's another job to do
|
|
935
1141
|
self.progress_overall.total += 1
|
|
@@ -949,7 +1155,9 @@ class Leader:
|
|
|
949
1155
|
# Grab the service job description
|
|
950
1156
|
service = self.toilState.get_job(service_id)
|
|
951
1157
|
if not isinstance(service, ServiceJobDescription):
|
|
952
|
-
raise RuntimeError(
|
|
1158
|
+
raise RuntimeError(
|
|
1159
|
+
"The grabbed service job description is not the right type."
|
|
1160
|
+
)
|
|
953
1161
|
|
|
954
1162
|
if service.preemptible:
|
|
955
1163
|
self.preemptibleServiceJobsToBeIssued.append(service_id)
|
|
@@ -959,14 +1167,23 @@ class Leader:
|
|
|
959
1167
|
|
|
960
1168
|
def issueQueingServiceJobs(self):
|
|
961
1169
|
"""Issues any queuing service jobs up to the limit of the maximum allowed."""
|
|
962
|
-
while
|
|
1170
|
+
while (
|
|
1171
|
+
len(self.serviceJobsToBeIssued) > 0
|
|
1172
|
+
and self.serviceJobsIssued < self.config.maxServiceJobs
|
|
1173
|
+
):
|
|
963
1174
|
self.issueJob(self.toilState.get_job(self.serviceJobsToBeIssued.pop()))
|
|
964
1175
|
self.serviceJobsIssued += 1
|
|
965
|
-
while
|
|
966
|
-
|
|
1176
|
+
while (
|
|
1177
|
+
len(self.preemptibleServiceJobsToBeIssued) > 0
|
|
1178
|
+
and self.preemptibleServiceJobsIssued
|
|
1179
|
+
< self.config.maxPreemptibleServiceJobs
|
|
1180
|
+
):
|
|
1181
|
+
self.issueJob(
|
|
1182
|
+
self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop())
|
|
1183
|
+
)
|
|
967
1184
|
self.preemptibleServiceJobsIssued += 1
|
|
968
1185
|
|
|
969
|
-
def getNumberOfJobsIssued(self, preemptible: Optional[bool]=None) -> int:
|
|
1186
|
+
def getNumberOfJobsIssued(self, preemptible: Optional[bool] = None) -> int:
|
|
970
1187
|
"""
|
|
971
1188
|
Get number of jobs that have been added by issueJob(s) and not removed by removeJob.
|
|
972
1189
|
|
|
@@ -1016,12 +1233,16 @@ class Leader:
|
|
|
1016
1233
|
"""
|
|
1017
1234
|
if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
|
|
1018
1235
|
raise RuntimeError("Job was already removed or was never issued.")
|
|
1019
|
-
issuedDesc = self.toilState.get_job(
|
|
1236
|
+
issuedDesc = self.toilState.get_job(
|
|
1237
|
+
self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
1238
|
+
)
|
|
1020
1239
|
if issuedDesc.preemptible:
|
|
1021
1240
|
# len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
|
|
1022
1241
|
# so decrement this value before removing the job from the issuedJob map
|
|
1023
1242
|
if self.preemptibleJobsIssued <= 0:
|
|
1024
|
-
raise RuntimeError(
|
|
1243
|
+
raise RuntimeError(
|
|
1244
|
+
"The number of preemptive issued jobs cannot be negative."
|
|
1245
|
+
)
|
|
1025
1246
|
self.preemptibleJobsIssued -= 1
|
|
1026
1247
|
# It's not issued anymore.
|
|
1027
1248
|
del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
@@ -1041,19 +1262,24 @@ class Leader:
|
|
|
1041
1262
|
|
|
1042
1263
|
return issuedDesc
|
|
1043
1264
|
|
|
1044
|
-
def getJobs(self, preemptible: Optional[bool] = None) ->
|
|
1265
|
+
def getJobs(self, preemptible: Optional[bool] = None) -> list[JobDescription]:
|
|
1045
1266
|
"""
|
|
1046
1267
|
Get all issued jobs.
|
|
1047
1268
|
|
|
1048
1269
|
:param preemptible: If specified, select only preemptible or only non-preemptible jobs.
|
|
1049
1270
|
"""
|
|
1050
1271
|
|
|
1051
|
-
jobs = [
|
|
1272
|
+
jobs = [
|
|
1273
|
+
self.toilState.get_job(job_store_id)
|
|
1274
|
+
for job_store_id in self.issued_jobs_by_batch_system_id.values()
|
|
1275
|
+
]
|
|
1052
1276
|
if preemptible is not None:
|
|
1053
1277
|
jobs = [job for job in jobs if job.preemptible == preemptible]
|
|
1054
1278
|
return jobs
|
|
1055
1279
|
|
|
1056
|
-
def killJobs(
|
|
1280
|
+
def killJobs(
|
|
1281
|
+
self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED
|
|
1282
|
+
):
|
|
1057
1283
|
"""
|
|
1058
1284
|
Kills the given set of jobs and then sends them for processing.
|
|
1059
1285
|
|
|
@@ -1067,7 +1293,9 @@ class Leader:
|
|
|
1067
1293
|
self.batchSystem.killBatchJobs(jobsToKill)
|
|
1068
1294
|
for jobBatchSystemID in jobsToKill:
|
|
1069
1295
|
# Reissue immediately, noting that we killed the job
|
|
1070
|
-
willRerun = self.process_finished_job(
|
|
1296
|
+
willRerun = self.process_finished_job(
|
|
1297
|
+
jobBatchSystemID, 1, exit_reason=exit_reason
|
|
1298
|
+
)
|
|
1071
1299
|
|
|
1072
1300
|
if willRerun:
|
|
1073
1301
|
# Compose a list of all the jobs that will run again
|
|
@@ -1075,8 +1303,7 @@ class Leader:
|
|
|
1075
1303
|
|
|
1076
1304
|
return jobsRerunning
|
|
1077
1305
|
|
|
1078
|
-
|
|
1079
|
-
#Following functions handle error cases for when jobs have gone awry with the batch system.
|
|
1306
|
+
# Following functions handle error cases for when jobs have gone awry with the batch system.
|
|
1080
1307
|
|
|
1081
1308
|
def reissueOverLongJobs(self) -> None:
|
|
1082
1309
|
"""
|
|
@@ -1087,20 +1314,30 @@ class Leader:
|
|
|
1087
1314
|
"""
|
|
1088
1315
|
maxJobDuration = self.config.maxJobDuration
|
|
1089
1316
|
jobsToKill = []
|
|
1090
|
-
if
|
|
1317
|
+
if (
|
|
1318
|
+
maxJobDuration < 10000000
|
|
1319
|
+
): # We won't bother doing anything if rescue time > 16 weeks.
|
|
1091
1320
|
runningJobs = self.batchSystem.getRunningBatchJobIDs()
|
|
1092
1321
|
for jobBatchSystemID in list(runningJobs.keys()):
|
|
1093
1322
|
if runningJobs[jobBatchSystemID] > maxJobDuration:
|
|
1094
|
-
logger.warning(
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1323
|
+
logger.warning(
|
|
1324
|
+
"The job: %s has been running for: %s seconds, more than the "
|
|
1325
|
+
"max job duration: %s, we'll kill it",
|
|
1326
|
+
self.issued_jobs_by_batch_system_id[jobBatchSystemID],
|
|
1327
|
+
str(runningJobs[jobBatchSystemID]),
|
|
1328
|
+
str(maxJobDuration),
|
|
1329
|
+
)
|
|
1099
1330
|
jobsToKill.append(jobBatchSystemID)
|
|
1100
|
-
reissued = self.killJobs(
|
|
1331
|
+
reissued = self.killJobs(
|
|
1332
|
+
jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION
|
|
1333
|
+
)
|
|
1101
1334
|
if len(jobsToKill) > 0:
|
|
1102
1335
|
# Summarize our actions
|
|
1103
|
-
logger.info(
|
|
1336
|
+
logger.info(
|
|
1337
|
+
"Killed %d over long jobs and reissued %d of them",
|
|
1338
|
+
len(jobsToKill),
|
|
1339
|
+
len(reissued),
|
|
1340
|
+
)
|
|
1104
1341
|
|
|
1105
1342
|
def reissueMissingJobs(self, killAfterNTimesMissing=3):
|
|
1106
1343
|
"""
|
|
@@ -1112,11 +1349,13 @@ class Leader:
         """
         issuedJobs = set(self.batchSystem.getIssuedBatchJobIDs())
         jobBatchSystemIDsSet = set(list(self.issued_jobs_by_batch_system_id.keys()))
-        #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
+        # Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
         missingJobIDsSet = set(list(self.reissueMissingJobs_missingHash.keys()))
         for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
             self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
-            logger.warning(
+            logger.warning(
+                "Batch system id: %s is no longer missing", str(jobBatchSystemID)
+            )
         # checks we have no unexpected jobs running
         if not issuedJobs.issubset(jobBatchSystemIDsSet):
             raise RuntimeError("An unexpected job is still running.")
@@ -1128,24 +1367,33 @@ class Leader:
             else:
                 self.reissueMissingJobs_missingHash[jobBatchSystemID] = 1
             timesMissing = self.reissueMissingJobs_missingHash[jobBatchSystemID]
-            logger.warning(
-
+            logger.warning(
+                "Job store ID %s with batch system id %s is missing for the %i time",
+                jobStoreID,
+                str(jobBatchSystemID),
+                timesMissing,
+            )
             # Tell everyone it is missing
             self._messages.publish(JobMissingMessage(jobStoreID))
             if timesMissing == killAfterNTimesMissing:
                 self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
                 jobsToKill.append(jobBatchSystemID)
         self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
-        return len(
-        #if there are missing jobs
+        return len(self.reissueMissingJobs_missingHash) == 0  # We use this to inform
+        # if there are missing jobs

     def processRemovedJob(self, issuedJob, result_status):
         if result_status != 0:
-            logger.warning(
-
+            logger.warning(
+                "Despite the batch system claiming failure the "
+                "job %s seems to have finished and been removed",
+                issuedJob,
+            )
         self._updatePredecessorStatus(issuedJob.jobStoreID)

-    def process_finished_job(
+    def process_finished_job(
+        self, batch_system_id, result_status, wall_time=None, exit_reason=None
+    ) -> bool:
         """
         Process finished jobs.

@@ -1166,12 +1414,18 @@ class Leader:
             self.progress_failed.update(incr=1)

         # Delegate to the version that uses a JobDescription
-        return self.process_finished_job_description(
-
-
-
-
+        return self.process_finished_job_description(
+            issued_job, result_status, wall_time, exit_reason, batch_system_id
+        )
+
+    def process_finished_job_description(
+        self,
+        finished_job: JobDescription,
+        result_status: int,
+        wall_time: Optional[float] = None,
+        exit_reason: Optional[BatchJobExitReason] = None,
+        batch_system_id: Optional[int] = None,
+    ) -> bool:
         """
         Process a finished JobDescription based upon its success or failure.

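Note on the hunk above: finished-job handling is split into two layers. process_finished_job resolves a batch system ID to the JobDescription that was issued under it and hands off to the new process_finished_job_description, which also receives the batch system ID so that batch-system-level log files can be located later. Below is a hedged sketch of that delegation shape only; the stand-in LeaderSketch class, the dictionary lookup, and the return policy are illustrative, not the package's actual implementation.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class JobDescription:
    # Reduced stand-in for toil.job.JobDescription.
    jobStoreID: str


class LeaderSketch:
    def __init__(self) -> None:
        # Maps the batch system's job ID to the JobDescription issued under it.
        self.issued_jobs_by_batch_system_id: dict[int, JobDescription] = {}

    def process_finished_job(
        self,
        batch_system_id: int,
        result_status: int,
        wall_time: Optional[float] = None,
        exit_reason: Optional[str] = None,
    ) -> bool:
        # Resolve the batch system ID back to the issued JobDescription...
        issued_job = self.issued_jobs_by_batch_system_id[batch_system_id]
        # ...then delegate, forwarding the ID so per-job stdout/stderr files
        # left by the batch system can be found afterwards.
        return self.process_finished_job_description(
            issued_job, result_status, wall_time, exit_reason, batch_system_id
        )

    def process_finished_job_description(
        self,
        finished_job: JobDescription,
        result_status: int,
        wall_time: Optional[float] = None,
        exit_reason: Optional[str] = None,
        batch_system_id: Optional[int] = None,
    ) -> bool:
        # True means the job still has retries and will run again;
        # False means it is completely done (successfully or not).
        return result_status != 0


leader = LeaderSketch()
leader.issued_jobs_by_batch_system_id[42] = JobDescription(jobStoreID="job-abc")
assert leader.process_finished_job(42, result_status=1) is True
```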
@@ -1193,7 +1447,9 @@ class Leader:
             # TODO: Use message bus?
             self.clusterScaler.addCompletedJob(finished_job, wall_time)
         if self.toilState.job_exists(job_store_id):
-            logger.debug(
+            logger.debug(
+                "Job %s continues to exist (i.e. has more to do)", finished_job
+            )
             try:
                 # Reload the job as modified by the worker
                 if finished_job.has_body():
@@ -1218,24 +1474,22 @@ class Leader:
                         "batch system may have killed (or never started) "
                         "the Toil worker."
                     )
-                    change_detected = self.toilState.reset_job_expecting_change(
+                    change_detected = self.toilState.reset_job_expecting_change(
+                        job_store_id, timeout
+                    )
                     replacement_job = self.toilState.get_job(job_store_id)

                     if not change_detected:
-                        logger.warning(
-                            'Job %s %s',
-                            replacement_job,
-                            complaint
-                        )
+                        logger.warning("Job %s %s", replacement_job, complaint)
                         if result_status == 0:
                             # Make the job fail because we ran it and it finished
                             # and we never heard back.
                             logger.error(
-
-
-
-
-                                replacement_job
+                                "Marking ostensibly successful job %s that did "
+                                "not report in to the job store before "
+                                "--jobStoreTimeout as having been partitioned "
+                                "from us.",
+                                replacement_job,
                             )
                             result_status = EXIT_STATUS_UNAVAILABLE_VALUE
                             exit_reason = BatchJobExitReason.PARTITION
@@ -1251,7 +1505,9 @@ class Leader:
                 # read from e.g. a non-POSIX-compliant filesystem gave us a
                 # false positive when we checked for its existence. Process the
                 # job from here as any other job removed from the job store.
-                logger.debug(
+                logger.debug(
+                    "Job %s is actually complete upon closer inspection", finished_job
+                )
                 self.processRemovedJob(finished_job, result_status)
                 return False
             if replacement_job.logJobStoreFileID is not None:
@@ -1259,18 +1515,31 @@ class Leader:
                     # more memory efficient than read().striplines() while leaving off the
                     # trailing \n left when using readlines()
                     # http://stackoverflow.com/a/15233739
-                    StatsAndLogging.logWithFormatting(
-
+                    StatsAndLogging.logWithFormatting(
+                        f'Log from job "{job_store_id}"',
+                        log_stream,
+                        method=logger.warning,
+                        message="The job seems to have left a log file, indicating failure: %s"
+                        % replacement_job,
+                    )
                 if self.config.writeLogs or self.config.writeLogsGzip:
                     with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
                         # Send log data from the job store to each per-job log file involved.
-                        StatsAndLogging.writeLogFiles(
+                        StatsAndLogging.writeLogFiles(
+                            [names.stats_name for names in replacement_job.get_chain()],
+                            log_stream,
+                            self.config,
+                            failed=True,
+                        )
             if result_status != 0:
                 # If the batch system returned a non-zero exit code then the worker
                 # is assumed not to have captured the failure of the job, so we
                 # reduce the try count here.
                 if replacement_job.logJobStoreFileID is None:
-                    logger.warning(
+                    logger.warning(
+                        "No log file is present, despite job failing: %s",
+                        replacement_job,
+                    )

                 if batch_system_id is not None:
                     # Look for any standard output/error files created by the batch system.
@@ -1279,30 +1548,60 @@ class Leader:
                     # --workDir / TOIL_WORKDIR is on a shared file system.
                     # They live directly in the Toil work directory because that is
                     # guaranteed to exist on the leader and workers.
-                    file_list = glob.glob(
+                    file_list = glob.glob(
+                        self.batchSystem.format_std_out_err_glob(batch_system_id)
+                    )
                     for log_file in file_list:
                         try:
-                            log_stream = open(log_file,
+                            log_stream = open(log_file, "rb")
                         except:
-                            logger.warning(
+                            logger.warning(
+                                "The batch system left a file %s, but it could not be opened"
+                                % log_file
+                            )
                         else:
                             with log_stream:
                                 if os.path.getsize(log_file) > 0:
-                                    StatsAndLogging.logWithFormatting(
-
-
-
-
+                                    StatsAndLogging.logWithFormatting(
+                                        f'Log from job "{job_store_id}"',
+                                        log_stream,
+                                        method=logger.warning,
+                                        message="The batch system left a non-empty file %s:"
+                                        % log_file,
+                                    )
+                                    if (
+                                        self.config.writeLogs
+                                        or self.config.writeLogsGzip
+                                    ):
+                                        file_root, _ = os.path.splitext(
+                                            os.path.basename(log_file)
+                                        )
+                                        job_names = [
+                                            names.stats_name
+                                            for names in replacement_job.get_chain()
+                                        ]
                                         # Tack the batch system log file name onto each job's name
-                                        job_names = [
+                                        job_names = [
+                                            j + "_" + file_root for j in job_names
+                                        ]
                                         log_stream.seek(0)
-                                        StatsAndLogging.writeLogFiles(
+                                        StatsAndLogging.writeLogFiles(
+                                            job_names,
+                                            log_stream,
+                                            self.config,
+                                            failed=True,
+                                        )
                                 else:
-                                    logger.warning(
+                                    logger.warning(
+                                        "The batch system left an empty file %s"
+                                        % log_file
+                                    )

                 # Tell the job to reset itself after a failure.
                 # It needs to know the failure reason if available; some are handled specially.
-                replacement_job.setupJobAfterFailure(
+                replacement_job.setupJobAfterFailure(
+                    exit_status=result_status, exit_reason=exit_reason
+                )
             self.toilState.commit_job(job_store_id)

         elif job_store_id in self.toilState.hasFailedSuccessors:
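Note on the hunk above: the bulk of it handles stray stdout/stderr files that a batch system may leave in the Toil work directory. The leader globs for files matching the batch system's format_std_out_err_glob(batch_system_id) pattern, logs non-empty ones, and appends their contents to the per-job log files when log writing (config.writeLogs / config.writeLogsGzip) is enabled. A small hedged sketch of the glob-and-collect step follows; the pattern produced by the helper below is invented for illustration, since the real pattern comes from the batch system implementation.

```python
import glob
import os


def format_std_out_err_glob(work_dir: str, batch_system_id: int) -> str:
    # Hypothetical naming scheme; each Toil batch system defines its own.
    return os.path.join(work_dir, f"toil_{batch_system_id}.*.log")


def collect_batch_system_logs(work_dir: str, batch_system_id: int) -> dict[str, bytes]:
    """Return {file name: contents} for every non-empty leftover log file."""
    collected: dict[str, bytes] = {}
    for log_file in glob.glob(format_std_out_err_glob(work_dir, batch_system_id)):
        if os.path.getsize(log_file) > 0:
            with open(log_file, "rb") as log_stream:
                collected[os.path.basename(log_file)] = log_stream.read()
    return collected
```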
@@ -1310,18 +1609,20 @@ class Leader:
             self.toilState.hasFailedSuccessors.remove(job_store_id)

             # Now that we know the job is done we can add it to the list of updated jobs
-            self._messages.publish(
+            self._messages.publish(
+                JobUpdatedMessage(replacement_job.jobStoreID, result_status)
+            )
             logger.debug("Added job: %s to updated jobs", replacement_job)

             # Return True if it will rerun (still has retries) and false if it
             # is completely failed.
             return replacement_job.remainingTryCount > 0
-        else: #The job is done
+        else:  # The job is done
             self.processRemovedJob(finished_job, result_status)
             # Being done, it won't run again.
             return False

-    def getSuccessors(self, job_id: str, alreadySeenSuccessors:
+    def getSuccessors(self, job_id: str, alreadySeenSuccessors: set[str]) -> set[str]:
         """
         Get successors of the given job by walking the job graph recursively.

@@ -1329,6 +1630,7 @@ class Leader:
         :returns: The set of found successors. This set is added to alreadySeenSuccessors.
         """
         successors = set()
+
         def successorRecursion(job_id: str) -> None:
             # TODO: do we need to reload from the job store here, or is the cache OK?
             jobDesc = self.toilState.get_job(job_id)
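Note on the two hunks above: getSuccessors now takes and returns set[str], and the nested successorRecursion helper remains. Per the docstring, every successor found is also added to alreadySeenSuccessors, and jobs already in that set are not traversed again. A hedged sketch of that traversal pattern over a plain adjacency dict follows; the dict-based graph stands in for the real job store lookups.

```python
def get_successors(
    graph: dict[str, list[str]], job_id: str, already_seen: set[str]
) -> set[str]:
    """Collect transitive successors of job_id that are not in already_seen.

    Found successors are also added to already_seen, mirroring the documented
    side effect of Leader.getSuccessors.
    """
    successors: set[str] = set()

    def successor_recursion(current: str) -> None:
        for child in graph.get(current, []):
            if child not in already_seen:
                successors.add(child)
                already_seen.add(child)
                successor_recursion(child)

    successor_recursion(job_id)
    return successors


# Example: A -> B -> C, with C already seen, so only B is newly found.
assert get_successors({"A": ["B"], "B": ["C"]}, "A", {"C"}) == {"B"}
```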
@@ -1360,12 +1662,15 @@ class Leader:

         # Tell everyone it failed

-        self._messages.publish(
+        self._messages.publish(
+            JobFailedMessage(get_job_kind(job_desc.get_names()), job_id)
+        )

         if job_id in self.toilState.service_to_client:
             # Is a service job
-            logger.debug(
-
+            logger.debug(
+                "Service job is being processed as a totally failed job: %s", job_desc
+            )

             if not isinstance(job_desc, ServiceJobDescription):
                 raise RuntimeError("The service job description type is incorrect.")
@@ -1389,8 +1694,13 @@ class Leader:
             # properly, and to remember that this service failed with an error
             # and possibly never started.
             if client_id in self.toilState.servicesIssued:
-                self.serviceManager.kill_services(
-
+                self.serviceManager.kill_services(
+                    self.toilState.servicesIssued[client_id], error=True
+                )
+                logger.warning(
+                    "Job: %s is instructing all other services of its parent job to quit",
+                    job_desc,
+                )

             # This ensures that the job will not attempt to run any of it's
             # successors on the stack
@@ -1414,9 +1724,14 @@ class Leader:
             # Any successor already in toilState.failedSuccessors will not be traversed
             # All successors traversed will be added to toilState.failedSuccessors and returned
             # as a set (unseenSuccessors).
-            unseenSuccessors = self.getSuccessors(
-
-
+            unseenSuccessors = self.getSuccessors(
+                job_id, self.toilState.failedSuccessors
+            )
+            logger.debug(
+                "Found new failed successors: %s of job: %s",
+                " ".join(unseenSuccessors),
+                job_desc,
+            )

             # For each newly found successor
             for successorJobStoreID in unseenSuccessors:
@@ -1427,7 +1742,9 @@ class Leader:
                 # For each such predecessor job
                 # (we remove the successor from toilState.successor_to_predecessors to avoid doing
                 # this multiple times for each failed predecessor)
-                for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                    successorJobStoreID
+                ):

                     predecessor = self.toilState.get_job(predecessor_id)

@@ -1436,8 +1753,11 @@ class Leader:

                     # Indicate that it has failed jobs.
                     self.toilState.hasFailedSuccessors.add(predecessor_id)
-                    logger.debug(
-
+                    logger.debug(
+                        "Marking job: %s as having failed successors (found by "
+                        "reading successors failed job)",
+                        predecessor,
+                    )

                     # If the predecessor has no remaining successors, add to list of updated jobs
                     if self.toilState.count_pending_successors(predecessor_id) == 0:
@@ -1451,8 +1771,12 @@ class Leader:

                 # Mark the predecessor as failed
                 self.toilState.hasFailedSuccessors.add(predecessor_id)
-                logger.debug(
-
+                logger.debug(
+                    "Totally failed job: %s is marking direct predecessor: %s "
+                    "as having failed jobs",
+                    job_desc,
+                    self.toilState.get_job(predecessor_id),
+                )

         self._updatePredecessorStatus(job_id)

@@ -1462,38 +1786,59 @@ class Leader:
             # Is a service host job, so its predecessor is its client
             client_id = self.toilState.service_to_client.pop(jobStoreID)
             self.toilState.servicesIssued[client_id].remove(jobStoreID)
-            if
+            if (
+                len(self.toilState.servicesIssued[client_id]) == 0
+            ):  # Predecessor job has
                 # all its services terminated
-                self.toilState.servicesIssued.pop(
+                self.toilState.servicesIssued.pop(
+                    client_id
+                )  # The job has no running services

-                logger.debug(
+                logger.debug(
+                    "Job %s is no longer waiting on services; all services have stopped",
+                    self.toilState.get_job(client_id),
+                )

                 # Now we know the job is done we can add it to the list of
                 # updated job files
                 self._messages.publish(JobUpdatedMessage(client_id, 0))
             else:
-                logger.debug(
-
-
+                logger.debug(
+                    "Job %s is still waiting on %d services",
+                    self.toilState.get_job(client_id),
+                    len(self.toilState.servicesIssued[client_id]),
+                )
         elif jobStoreID not in self.toilState.successor_to_predecessors:
-            #We have reach the root job
+            # We have reach the root job
             if self._messages.count(JobUpdatedMessage) != 0:
                 raise RuntimeError("Root job is done but other jobs are still updated")
             if len(self.toilState.successor_to_predecessors) != 0:
-                raise RuntimeError(
-
+                raise RuntimeError(
+                    "Job {} is finished and had no predecessor, but we have other outstanding jobs "
+                    "with predecessors: {}".format(
+                        jobStoreID, self.toilState.successor_to_predecessors.keys()
+                    )
+                )
             if len(self.toilState.successorCounts) != 0:
-                raise RuntimeError(
-
+                raise RuntimeError(
+                    "Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
+                )
+            logger.debug(
+                "Reached root job %s so no predecessors to clean up" % jobStoreID
+            )

         else:
             # Is a non-root, non-service job
             logger.debug("Cleaning the predecessors of %s" % jobStoreID)

             # For each predecessor
-            for predecessor_id in self.toilState.successor_to_predecessors.pop(
+            for predecessor_id in self.toilState.successor_to_predecessors.pop(
+                jobStoreID
+            ):
                 if not isinstance(predecessor_id, str):
-                    raise RuntimeError(
+                    raise RuntimeError(
+                        "Predecessor ID should be str but is {type(predecessor_id)}"
+                    )
                 predecessor = self.toilState.get_job(predecessor_id)

                 # Tell the predecessor that this job is done (keep only other successor jobs)